12.9 pytorch版本的chatglm2-6b部署
一、前置条件
- 已完成操作系统安装(ubuntu、ctyunos、openeuler、kylin等系统)
- 服务器已具备外网访问权限(非必须,仅为便于下载文件包;若无外网,则需手动上传文件)
- 建议可用磁盘空间大于1TB
- 准备docker安装包、docker镜像、代码、预训练权重、数据集(后续准备工作中会提供链接)
二、应用部署
1.docker部署
#下载安装包
wget https://download.docker.com/linux/static/stable/aarch64/docker-18.09.8.tgz --no-check-certificate
#解压与安装
tar xvpf docker-18.09.8.tgz
cp -p -f docker/* /usr/bin
#准备环境
#为使Docker可以正常使用,还需要临时关闭SELinux(setenforce 0)并关闭防火墙。
setenforce 0
systemctl stop firewalld
systemctl disable firewalld
#配置docker.service服务
vim /usr/lib/systemd/system/docker.service
#按i进入编辑模式,粘贴以下内容
[Unit]
Description=Docker Application Container Engine
Documentation=http://docs.docker.com
After=network.target docker.socket
[Service]
Type=notify
EnvironmentFile=-/run/flannel/docker
WorkingDirectory=/usr/local/bin
ExecStart=/usr/bin/dockerd -H tcp://0.0.0.0:4243 -H unix:///var/run/docker.sock --selinux-enabled=false --log-opt max-size=1g
ExecReload=/bin/kill -s HUP $MAINPID
# Having non-zero Limit*s causes performance problems due to accounting overhead
# in the kernel. We recommend using cgroups to do container-local accounting.
LimitNOFILE=infinity
LimitNPROC=infinity
LimitCORE=infinity
# Uncomment TasksMax if your systemd version supports it.
# Only systemd 226 and above support this option.
#TasksMax=infinity
TimeoutStartSec=0
# set delegate yes so that systemd does not reset the cgroups of docker containers
Delegate=yes
# kill only the docker process, not all processes in the cgroup
KillMode=process
Restart=on-failure
[Install]
WantedBy=multi-user.target
#启动相关服务
systemctl daemon-reload
systemctl status docker
systemctl restart docker
systemctl status docker
systemctl enable docker
#注意!启动后需要确认docker数据目录所在分区是否有足够的空间(默认数据目录为/var/lib/docker);若空间不足,请根据以下操作更换路径
vi /etc/docker/daemon.json
#添加以下内容;注意:/home/docker仅为示例路径,请按实际情况修改
{
"data-root":"/home/docker"
}
#保存并退出(输入 :wq)
sudo systemctl daemon-reload
sudo systemctl restart docker
2.准备资源
cd /home/work
#docker镜像下载
wget https://czy.obs.cn-east-324.fjaicc.com/chatglm2-6b-pytorch/images/chatglm2-6b-pytorch.tar
#加载镜像
docker load -i chatglm2-6b-pytorch.tar
#代码(含数据集与权重)下载:请先将ModelZoo-PyTorch.tar.gz下载或手动上传到当前目录(/home/work)
#代码解压
tar -zxvf ModelZoo-PyTorch.tar.gz
3.启动容器
#注意:执行前请将下面命令中<>内的占位内容替换为实际值
docker run -itd -u root --ipc=host \
--device=/dev/davinci0 \
--device=/dev/davinci1 \
--device=/dev/davinci2 \
--device=/dev/davinci3 \
--device=/dev/davinci4 \
--device=/dev/davinci5 \
--device=/dev/davinci6 \
--device=/dev/davinci7 \
--device=/dev/davinci_manager \
--device=/dev/devmm_svm \
--device=/dev/hisi_hdc \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
-v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \
-v /usr/local/sbin/:/usr/local/sbin/ \
-v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \
-v /var/log/npu/slog/:/var/log/npu/slog \
-v /var/log/npu/profiling/:/var/log/npu/profiling \
-v /var/log/npu/dump/:/var/log/npu/dump \
-v /var/log/npu/:/usr/slog \
-v <主机上的工作路径(包含代码、权重、数据集等),如:/home/work/>:/home/work \
--name <容器的名称> \
chatglm2-6b-pytorch:v1.0 \
/bin/bash
4.进入容器,开启微调
docker exec -it <容器ID> bash
cd /home/work/ModelZoo-PyTorch/PyTorch/built-in/foundation/ChatGLM2-6B/ptuning/
#ptuning v2 单卡微调
bash train.sh
#8卡全参数finetune
bash ds_train_finetune.sh
#lora微调
bash ds_train_lora.sh