GPU服务器安装驱动

GPU服务器安装驱动

环境

  • 系统和内核

    shell 复制代码
    # Kernel release recorded in this guide: 5.15.0-163-generic
    uname --kernel-release
    # Distribution recorded in this guide: Ubuntu 22.04.5 LTS
    lsb_release --all
  • docker

    shell 复制代码
    # Docker version recorded in this guide: 29.1.2
    docker version
  • GPU

    shell 复制代码
    # Recorded in this guide: Tesla T4, NVIDIA-SMI 550.163.01,
    # Driver Version: 550.163.01, CUDA Version: 12.4
    nvidia-smi
  • nvcc

    shell 复制代码
    # Recorded in this guide: 12.4, V12.4.131, cuda_12.4.r12.4/compiler.34097967_0
    nvcc --version

安装

  • docker

    shell 复制代码
    # Reinstall Docker Engine from Docker's official apt repository. Run as root.
    # WARNING: this deletes /var/lib/docker, /var/lib/containerd and /etc/docker.

    # Purge package by package, and only those dpkg knows about: one unknown
    # name (e.g. docker-ce before the Docker repo was ever configured) makes a
    # combined `apt purge` abort without removing anything.
    for pkg in docker-ce docker-ce-cli containerd.io docker-compose-plugin docker.io; do
        if dpkg -s "$pkg" >/dev/null 2>&1; then
            apt purge -y "$pkg"
        fi
    done
    rm -rf /var/lib/docker /var/lib/containerd /etc/docker
    rm -f /etc/apt/sources.list.d/docker.list

    # Start from a clean package index before adding the new repository.
    apt clean
    rm -rf /var/lib/apt/lists/*
    apt update && apt install -y ca-certificates curl gnupg lsb-release

    # --batch --yes: overwrite an existing keyring instead of prompting, so the
    # snippet can be re-run.
    curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --batch --yes --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
    echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
    apt update
    apt install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin

    # Start the daemon now and on every boot.
    systemctl enable --now docker
    # --no-pager: print the status and return instead of waiting in a pager.
    systemctl --no-pager status docker
    ls -l /var/run/docker.sock # confirm that the daemon socket exists
  • 安装GPU驱动和CUDA

    shell 复制代码
    # Install the NVIDIA 550 driver from NVIDIA's CUDA apt repository. Run as root.
    # The procedure contains two reboots: after each `reboot`, log in again and
    # continue with the next step (commands after `reboot` do not run by themselves).

    # --- Step 1: remove old driver/CUDA packages, install the build toolchain ---
    # The patterns are quoted so the shell passes them to apt unchanged instead
    # of expanding them against files in the current directory.
    apt purge -y 'nvidia-*' 'cuda-*'
    apt autoremove -y && apt autoclean
    rm -rf /usr/local/cuda* /var/lib/nvidia*
    apt update && apt upgrade -y
    apt install -y build-essential dkms gcc g++ cmake libglvnd-dev pkg-config

    # --- Step 2: blacklist the nouveau driver, rebuild the initramfs, reboot ---
    # printf is used instead of a here-document so the snippet still works when
    # pasted with leading indentation (an indented EOF would never terminate it).
    printf '%s\n' \
        'blacklist nouveau' \
        'blacklist lbm-nouveau' \
        'options nouveau modeset=0' \
        'alias nouveau off' \
        'alias lbm-nouveau off' |
        tee /etc/modprobe.d/blacklist-nouveau.conf

    update-initramfs -u
    reboot

    # --- Step 3 (after the first reboot): check nouveau, add the NVIDIA repo ---
    lsmod | grep nouveau # no output means nouveau is no longer loaded

    # Drop stale NVIDIA source lists before writing a fresh one.
    rm -f /etc/apt/sources.list.d/nvidia-cuda.list \
        /etc/apt/sources.list.d/nvidia-cuda.list.bak \
        /etc/apt/sources.list.d/nvidia-machine-learning.list
    apt update
    apt install -y software-properties-common apt-transport-https curl gnupg
    # --batch --yes: overwrite an existing keyring instead of prompting on re-run.
    curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub | gpg --batch --yes --dearmor -o /usr/share/keyrings/nvidia-cuda-keyring.gpg
    echo "deb [signed-by=/usr/share/keyrings/nvidia-cuda-keyring.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" | tee /etc/apt/sources.list.d/nvidia-cuda.list
    apt update

    # --- Step 4: install kernel headers and the DKMS driver, then reboot ---
    apt install -y "linux-headers-$(uname -r)"
    apt install -y linux-headers-generic build-essential dkms gcc g++
    # Remove leftover DKMS state for the nvidia module and finish any
    # half-configured packages before installing the driver.
    rm -rf /var/lib/dkms/nvidia/ 2>/dev/null
    dpkg --configure -a
    apt install -y nvidia-driver-550 nvidia-dkms-550
    modprobe nvidia
    reboot

    # --- Step 5 (after the second reboot): verify the driver ---
    nvidia-smi
    # NOTE(review): nothing above installs nvcc. If the CUDA compiler is needed,
    # the toolkit has to be installed separately, presumably with
    # `apt install -y cuda-toolkit-12-4` from this repository - confirm the name.
  • 安装python

    shell 复制代码
    # Build Python 3.10.19 with pyenv, create a virtualenv and install the
    # CUDA 12.4 PyTorch wheels.

    # Build dependencies. wget is used for the download below; liblzma-dev and
    # xz-utils are installed before the Python build.
    apt install -y git curl wget zlib1g-dev libssl-dev libreadline-dev libsqlite3-dev libbz2-dev libffi-dev liblzma-dev xz-utils gcc make

    # Install pyenv and hook it into future shells.
    git clone https://github.com/pyenv/pyenv.git ~/.pyenv
    echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc
    echo 'command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc
    echo 'eval "$(pyenv init -)"' >> ~/.bashrc
    source ~/.bashrc

    # Download the source tarball from a mirror into ~/.pyenv/cache before
    # running `pyenv install`. -p keeps the step re-runnable when the directory
    # already exists.
    mkdir -p ~/.pyenv/cache
    cd ~/.pyenv/cache
    wget https://mirrors.aliyun.com/python-release/source/Python-3.10.19.tar.xz
    pyenv install 3.10.19 # reports "ModuleNotFoundError: No module named '_tkinter'"; ignore it, no GUI is needed
    pyenv global 3.10.19

    # Dedicated virtualenv for the CUDA 12.4 stack.
    python -m venv ~/cuda124-py310
    source ~/cuda124-py310/bin/activate

    # Edit the pip configuration by hand.
    mkdir -p ~/.config/pip && vi ~/.config/pip/pip.conf
          # Write the following content (Tsinghua mirror plus the PyTorch cu124 index):
          # [global]
          # index-url = https://pypi.tuna.tsinghua.edu.cn/simple
          # trusted-host = pypi.tuna.tsinghua.edu.cn
          # extra-index-url = https://download.pytorch.org/whl/cu124

    # `-i` is the short form of `--index-url`; when both were given only the last
    # one took effect, so the overridden `-i <mirror>` flag has been removed.
    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
  • 安装nvidia-container-toolkit

    shell 复制代码
    # Install the NVIDIA Container Toolkit and register it as a Docker runtime.
    # Run as root. Uses a dedicated signed-by keyring and the libnvidia-container
    # repository instead of the deprecated `apt-key add` / nvidia-docker list.

    # Remove the list written by the old nvidia-docker instructions, if present.
    rm -f /etc/apt/sources.list.d/nvidia-docker.list

    # --batch --yes: overwrite an existing keyring instead of prompting on re-run.
    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --batch --yes --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
    # Fetch the repository list and pin every entry to the keyring above.
    curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list |
        sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' |
        tee /etc/apt/sources.list.d/nvidia-container-toolkit.list

    apt-get update && apt-get install -y nvidia-container-toolkit
    # Configure Docker to use the NVIDIA runtime, then restart the daemon so
    # the new configuration is loaded.
    nvidia-ctk runtime configure --runtime=docker
    systemctl restart docker

验证

shell 复制代码
# Smoke test: run nvidia-smi inside a throw-away CUDA container.
# Driver information in the output means the setup works; no such output
# means it does not.
docker run --rm --gpus all \
  nvidia/cuda:11.8.0-base-ubuntu20.04 \
  nvidia-smi
相关推荐
AOwhisky17 分钟前
学习自测与解析:MySQL第五、六、七期核心知识点详解
运维·数据库·笔记·学习·mysql·云计算
无限进步_21 分钟前
从零实现一个迷你Shell——深入理解Linux命令行解释器
linux·运维·服务器·开发语言·c++·chrome
阿标在干嘛27 分钟前
政策平台的推送系统:消息队列、定时任务、AB测试的工程实践
服务器·数据库·ab测试
Adorable老犀牛42 分钟前
nginx_exporter:Prometheus 监控 Nginx 基础指标
运维·nginx·prometheus
山里幽默的程序员43 分钟前
DevOps 必备:盘点2026 年最强RESTful API 接口测试方案
运维·restful·devops·api开发·api开发工具
happymaker06261 小时前
Linux常见命令总结
linux·运维·服务器
加农炮手Jinx1 小时前
Flutter for OpenHarmony:pub_updater 命令行工具自动更新专家(DevOps 运维必备) 深度解析与鸿蒙适配指南
android·运维·网络·flutter·华为·harmonyos·devops
不念霉运1 小时前
Gitee领跑2025中国DevOps市场:本土力量崛起
运维
无心水1 小时前
【Hermes:团队、企业、生态与边界】47、Hermes 在 CI/CD 中的完整 DevOps 流水线:从 PR 审查到自动部署,让 Agent 接管你的发布流程
运维·人工智能·devops·openclaw·养龙虾·hermes·honcho
开源量化GO1 小时前
期货 K 线算信号 tick 级止损:天勤双序列 wait_update 触发规则
linux·运维·服务器·python