# GPU服务器安装驱动
## 环境
- 系统和内核
```shell
uname -r # 5.15.0-163-generic
lsb_release -a # Ubuntu 22.04.5 LTS
```

- docker
```shell
docker version # 29.1.2
```

- GPU
```shell
nvidia-smi # Tesla T4 NVIDIA-SMI 550.163.01 Driver Version: 550.163.01 CUDA Version: 12.4
```

- nvcc
```shell
nvcc --version # 12.4, V12.4.131 , cuda_12.4.r12.4/compiler.34097967_0
```
## 安装
- docker
```shell
apt purge -y docker-ce docker-ce-cli containerd.io docker-compose-plugin docker.io
rm -rf /var/lib/docker
rm -rf /var/lib/containerd
rm -rf /etc/docker
rm -f /etc/apt/sources.list.d/docker.list
apt clean
rm -rf /var/lib/apt/lists/*
apt update && apt install -y ca-certificates curl gnupg lsb-release
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
apt update
apt install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin
systemctl start docker
systemctl enable docker
systemctl status docker
ls -l /var/run/docker.sock # 查看这个文件是否存在
```

- 安装GPU驱动和Cuda
```shell
apt purge -y nvidia-* cuda-*
apt autoremove -y && apt autoclean
rm -rf /usr/local/cuda*
rm -rf /var/lib/nvidia*
apt update && apt upgrade -y
apt install -y build-essential dkms gcc g++ cmake libglvnd-dev pkg-config
tee /etc/modprobe.d/blacklist-nouveau.conf <<EOF
blacklist nouveau
blacklist lbm-nouveau
options nouveau modeset=0
alias nouveau off
alias lbm-nouveau off
EOF
update-initramfs -u
reboot
lsmod | grep nouveau # 无输出则成功
rm -f /etc/apt/sources.list.d/nvidia-cuda.list
rm -f /etc/apt/sources.list.d/nvidia-cuda.list.bak 2>/dev/null
rm -f /etc/apt/sources.list.d/nvidia-machine-learning.list
apt update
apt install -y software-properties-common apt-transport-https curl
curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub | gpg --dearmor -o /usr/share/keyrings/nvidia-cuda-keyring.gpg
echo "deb [signed-by=/usr/share/keyrings/nvidia-cuda-keyring.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" | tee /etc/apt/sources.list.d/nvidia-cuda.list
apt update
apt install -y linux-headers-$(uname -r)
apt install -y linux-headers-generic build-essential dkms gcc g++
rm -rf /var/lib/dkms/nvidia/ 2>/dev/null
dpkg --configure -a
apt install -y nvidia-driver-550 nvidia-dkms-550
modprobe nvidia
reboot
nvidia-smi
```

- 安装python
```shell
apt install -y git curl zlib1g-dev libssl-dev libreadline-dev libsqlite3-dev libbz2-dev libffi-dev gcc make
git clone https://github.com/pyenv/pyenv.git ~/.pyenv
echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc
echo 'command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc
echo 'eval "$(pyenv init -)"' >> ~/.bashrc
source ~/.bashrc
mkdir ~/.pyenv/cache
cd ~/.pyenv/cache
wget https://mirrors.aliyun.com/python-release/source/Python-3.10.19.tar.xz
apt install -y liblzma-dev xz-utils
pyenv install 3.10.19 # 会提示ModuleNotFoundError: No module named '_tkinter',忽略,我们不需要图形界面
pyenv global 3.10.19
python -m venv ~/cuda124-py310
source ~/cuda124-py310/bin/activate
mkdir -p ~/.config/pip && vi ~/.config/pip/pip.conf
# 2. 写入以下内容(清华源,包含PyTorch适配)
# [global]
# index-url = https://pypi.tuna.tsinghua.edu.cn/simple
# trusted-host = pypi.tuna.tsinghua.edu.cn
# extra-index-url = https://download.pytorch.org/whl/cu124
pip install torch torchvision torchaudio -i https://pypi.tuna.tsinghua.edu.cn/simple --index-url https://download.pytorch.org/whl/cu124
```

- 安装nvidia-container-toolkit
```shell
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | tee /etc/apt/sources.list.d/nvidia-docker.list
apt-get update && sudo apt-get install -y nvidia-container-toolkit
nvidia-ctk runtime configure --runtime=docker
systemctl restart docker
```
## 验证
```shell
docker run --rm --gpus all nvidia/cuda:11.8.0-base-ubuntu20.04 nvidia-smi # 如果有输出驱动信息,则正常,没有则不正常
```