GPU服务器安装驱动
环境
- 系统和内核
# Print the running kernel release; the driver steps later install headers for this exact version.
uname -r
# Print the distribution ID, release number and codename.
lsb_release -a
- docker
# Print the Docker client version and, if the daemon is reachable, the server version.
docker version
- GPU
# Query the NVIDIA driver for the GPUs it can see; this fails if no working driver is loaded.
nvidia-smi
- nvcc
# Print the version of the CUDA compiler, if a CUDA toolkit is on PATH.
nvcc --version
安装
- docker
# Reinstall Docker Engine from Docker's official apt repository. Run as root.
# WARNING: the rm -rf lines below delete all existing images, containers,
# volumes and daemon configuration on this host.

# Remove previously installed Docker packages and their on-disk state.
apt purge -y docker-ce docker-ce-cli containerd.io docker-compose-plugin docker.io
rm -rf /var/lib/docker
rm -rf /var/lib/containerd
rm -rf /etc/docker
rm -f /etc/apt/sources.list.d/docker.list

# Drop cached packages and package indexes so the next update starts clean.
apt clean
rm -rf /var/lib/apt/lists/*
apt update && apt install -y ca-certificates curl gnupg lsb-release

# Install the repository signing key.
# --batch --yes: the cleanup above does not remove the keyring, so on a
# reinstall gpg would otherwise stop at a "File exists. Overwrite?" prompt
# (or fail outright when there is no TTY).
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --batch --yes --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg

# Register the repository for this machine's architecture and release codename.
echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
apt update
apt install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin

# Start the daemon now and on every boot, then confirm it is running.
systemctl start docker
systemctl enable docker
# --no-pager: print the status and return instead of blocking in a pager.
systemctl status --no-pager docker
ls -l /var/run/docker.sock
- 安装GPU驱动和CUDA
# Install the NVIDIA 550 driver from NVIDIA's CUDA apt repository (Ubuntu 22.04,
# x86_64). Run as root. The procedure contains two reboots, so it is executed
# in three phases; resume with the next phase after each reboot.

# --- Phase 1: remove old driver/CUDA packages and disable nouveau ---
# The patterns are quoted so apt receives them verbatim. Unquoted, the shell
# would expand them against matching files in the current directory
# (for example a downloaded nvidia-*.run installer).
apt purge -y 'nvidia-*' 'cuda-*'
apt autoremove -y && apt autoclean
rm -rf /usr/local/cuda*
rm -rf /var/lib/nvidia*
apt update && apt upgrade -y
apt install -y build-essential dkms gcc g++ cmake libglvnd-dev pkg-config
# Blacklist the open-source nouveau driver so it cannot claim the GPU.
# The quoted delimiter keeps the heredoc content literal.
tee /etc/modprobe.d/blacklist-nouveau.conf <<'EOF'
blacklist nouveau
blacklist lbm-nouveau
options nouveau modeset=0
alias nouveau off
alias lbm-nouveau off
EOF
# Rebuild the initramfs so the blacklist applies from early boot, then reboot.
update-initramfs -u
reboot

# --- Phase 2 (after reboot): add the CUDA repository and install the driver ---
# Expect no output here: any line printed means nouveau is still loaded.
lsmod | grep nouveau
# Remove stale NVIDIA repository definitions before adding a fresh one.
rm -f /etc/apt/sources.list.d/nvidia-cuda.list
rm -f /etc/apt/sources.list.d/nvidia-cuda.list.bak 2>/dev/null
rm -f /etc/apt/sources.list.d/nvidia-machine-learning.list
apt update
apt install -y software-properties-common apt-transport-https curl
# --batch --yes: overwrite a keyring left by an earlier run instead of
# stopping at gpg's interactive overwrite prompt.
curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub | gpg --batch --yes --dearmor -o /usr/share/keyrings/nvidia-cuda-keyring.gpg
echo "deb [signed-by=/usr/share/keyrings/nvidia-cuda-keyring.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" | tee /etc/apt/sources.list.d/nvidia-cuda.list
apt update
# DKMS builds the kernel module against the headers of the running kernel.
apt install -y linux-headers-$(uname -r)
apt install -y linux-headers-generic build-essential dkms gcc g++
# Clear leftover DKMS build trees and finish any interrupted dpkg runs
# before installing the driver.
rm -rf /var/lib/dkms/nvidia/ 2>/dev/null
dpkg --configure -a
apt install -y nvidia-driver-550 nvidia-dkms-550
modprobe nvidia
reboot

# --- Phase 3 (after reboot): verify the driver ---
nvidia-smi
- 安装python
# Build Python 3.10.19 with pyenv, create a virtualenv and install the
# CUDA 12.4 builds of PyTorch into it.

# Build dependencies for compiling CPython, plus wget, which is used below
# to fetch the source tarball.
apt install -y git curl wget zlib1g-dev libssl-dev libreadline-dev libsqlite3-dev libbz2-dev libffi-dev gcc make
git clone https://github.com/pyenv/pyenv.git ~/.pyenv
# Make pyenv available in every new shell, then load it into this one.
echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc
echo 'command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc
echo 'eval "$(pyenv init -)"' >> ~/.bashrc
source ~/.bashrc
# Pre-download the source tarball from a mirror into pyenv's cache directory
# so that `pyenv install` can use it instead of fetching from python.org.
# -p: do not fail when the directory already exists (re-runs).
mkdir -p ~/.pyenv/cache
cd ~/.pyenv/cache
wget https://mirrors.aliyun.com/python-release/source/Python-3.10.19.tar.xz
# xz support is needed to unpack the .tar.xz and to build the lzma module.
apt install -y liblzma-dev xz-utils
pyenv install 3.10.19
pyenv global 3.10.19
python -m venv ~/cuda124-py310
source ~/cuda124-py310/bin/activate
# Edit pip's per-user configuration by hand (interactive step; the intended
# contents are not recorded in these notes).
mkdir -p ~/.config/pip && vi ~/.config/pip/pip.conf
# Only one index URL can be in effect: -i is the short form of --index-url and
# the last occurrence wins, so the mirror that used to be passed with -i here
# was silently ignored. Install directly from the PyTorch cu124 wheel index.
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
- 安装nvidia-container-toolkit
# Install the NVIDIA Container Toolkit so Docker containers can use the GPUs.
# Run as root. Uses the libnvidia-container apt repository with a dedicated
# signed-by keyring; this replaces the deprecated nvidia-docker repository
# and the deprecated `apt-key add` trust mechanism.

# Remove the repository definition written by the older nvidia-docker setup.
rm -f /etc/apt/sources.list.d/nvidia-docker.list
# --batch --yes: overwrite an existing keyring without prompting.
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --batch --yes --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
# Fetch the repository list and pin every entry to the keyring above.
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
apt-get update && apt-get install -y nvidia-container-toolkit
# Register the NVIDIA runtime in Docker's daemon configuration, then restart
# Docker so the change takes effect.
nvidia-ctk runtime configure --runtime=docker
systemctl restart docker
验证
# Run nvidia-smi inside a throwaway CUDA base container with all GPUs attached;
# seeing the GPU table confirms that Docker can reach the host driver.
docker run --rm --gpus all nvidia/cuda:11.8.0-base-ubuntu20.04 nvidia-smi