参考文档:https://verl.readthedocs.io/en/latest/start/install.html#install-dependencies
准备conda环境
shell
# Create a dedicated conda environment named "verl" pinned to Python 3.12
conda create -n verl python==3.12
# All subsequent install/run commands assume this env is active
conda activate verl
安装
shell
# Make sure you have activated verl conda env
# If you need to run with megatron
bash scripts/install_vllm_sglang_mcore.sh
# Or if you simply need to run with FSDP
# (USE_MEGATRON=0 skips the Megatron-LM / TransformerEngine portion of the build)
USE_MEGATRON=0 bash scripts/install_vllm_sglang_mcore.sh
由于网络原因,TransformerEngine 没有安装成功,需要手动安装。
手动安装TransformerEngine
1、clone仓库
shell
# Clone NVIDIA TransformerEngine (needed by the Megatron backend)
git clone https://github.com/NVIDIA/TransformerEngine.git
cd TransformerEngine
2、指定版本并更新子模块
shell
# Pin to the v2.6 release tag
git checkout v2.6
# Pull in the bundled third-party submodules required by the build
git submodule update --init --recursive
3、执行安装
shell
NVTE_FRAMEWORK=pytorch pip3 install --no-deps --no-build-isolation .
安装遇到了cmake编译报错,找不到cudnn,但是可以通过pip检查到已经安装:
shell
> pip show nvidia-cudnn-cu12
Name: nvidia-cudnn-cu12
Version: 9.10.2.21
Summary: cuDNN runtime libraries
Home-page: https://developer.nvidia.com/cuda-zone
Author: Nvidia CUDA Installer Team
Author-email: compute_installer@nvidia.com
License: LicenseRef-NVIDIA-Proprietary
Location: /home/sw/.conda/envs/verl/lib/python3.12/site-packages
Requires: nvidia-cublas-cu12
Required-by: torch
配置全局编译器路径并重新编译:
shell
# 1. 清理缓存
rm -rf build/
rm -rf transformer_engine.egg-info
# 2. 定义基础路径
export CONDA_ENV_PATH=/home/sw/.conda/envs/verl
export CUDNN_ROOT=$CONDA_ENV_PATH/lib/python3.12/site-packages/nvidia/cudnn
# 3. 设置 CPATH (告诉编译器去哪里找头文件)
export CPATH=$CUDNN_ROOT/include:$CPATH
# 4. 设置 LIBRARY_PATH (告诉编译器去哪里找库文件进行链接)
export LIBRARY_PATH=$CUDNN_ROOT/lib:$LIBRARY_PATH
# 5. 设置 LD_LIBRARY_PATH (告诉系统运行时去哪里找动态库)
export LD_LIBRARY_PATH=$CUDNN_ROOT/lib:$LD_LIBRARY_PATH
# 6. 再次执行安装
NVTE_FRAMEWORK=pytorch pip3 install --no-deps --no-build-isolation .
编译安装成功:
shell
...
Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: transformer_engine
Building wheel for transformer_engine (pyproject.toml) ... done
Created wheel for transformer_engine: filename=transformer_engine-2.6.0+c90a7207-cp312-cp312-linux_x86_64.whl size=243922631 sha256=3c4e706de6d5626d641563a7f0bb672aab7dddc76199a9cae9cc8e3f0abf6b5c
Stored in directory: /tmp/pip-ephem-wheel-cache-fazev1_w/wheels/ac/bc/ce/4c6381eafba27055735110a8656f8ffd1ad0581b962c5c4b2b
Successfully built transformer_engine
Installing collected packages: transformer_engine
Successfully installed transformer_engine-2.6.0+c90a7207
手动安装 Megatron-LM (core_v0.13.1)
shell
# Clone Megatron-LM and pin to the core_v0.13.1 tag expected by verl
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout core_v0.13.1
# --no-deps: its dependencies are already provided by the verl env
pip3 install --no-deps .
安装verl
shell
# Install verl itself in editable mode (-e) so local source edits take effect
git clone https://github.com/volcengine/verl.git
cd verl
pip install --no-deps -e .
运行verl
数据集格式转换
shell
python3 examples/data_preprocess/gsm8k.py --local_dataset_path /home/sw/hw/downloads/openai/gsm8k --local_save_dir /home/sw/hw/downloads/gsm8k
运行demo
shell
# Paths to the base model weights and the preprocessed GSM8K parquet files
MODEL_PATH="/hub/weights/Qwen/Qwen2.5-0.5B"
TRAIN_DATA="/home/sw/hw/downloads/gsm8k/train.parquet"
TEST_DATA="/home/sw/hw/downloads/gsm8k/test.parquet"
# Run fully offline so the HuggingFace Hub is never contacted
export HF_HUB_OFFLINE=1
export TRANSFORMERS_OFFLINE=1
# Single-node, single-GPU PPO demo on GSM8K.
# Variable expansions are quoted so paths containing spaces do not word-split.
# 2>&1 | tee keeps a full copy of stdout+stderr in verl_demo.log.
PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
  data.train_files="$TRAIN_DATA" \
  data.val_files="$TEST_DATA" \
  data.train_batch_size=256 \
  data.max_prompt_length=512 \
  data.max_response_length=512 \
  actor_rollout_ref.model.path="$MODEL_PATH" \
  actor_rollout_ref.actor.optim.lr=1e-6 \
  actor_rollout_ref.actor.ppo_mini_batch_size=64 \
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
  actor_rollout_ref.rollout.name=vllm \
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
  actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
  actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
  critic.optim.lr=1e-5 \
  critic.model.path="$MODEL_PATH" \
  critic.ppo_micro_batch_size_per_gpu=4 \
  algorithm.kl_ctrl.kl_coef=0.001 \
  trainer.logger=console \
  trainer.val_before_train=False \
  trainer.n_gpus_per_node=1 \
  trainer.nnodes=1 \
  trainer.save_freq=10 \
  trainer.test_freq=10 \
  trainer.total_epochs=15 2>&1 | tee verl_demo.log