version: '3.8'
services:
  lbot-14b:
    image: quay.io/ascend/vllm-ascend:v0.11.0rc0-openeuler
    container_name: lbot-14b
    privileged: true
    ipc: host
    shm_size: 96g
    devices:
      - /dev/davinci0
      - /dev/davinci1
      - /dev/davinci_manager
      - /dev/devmm_svm
      - /dev/hisi_hdc
    volumes:
      - /usr/local/dcmi:/usr/local/dcmi
      - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
      - /usr/local/sbin/:/usr/local/sbin
      - /usr/local/Ascend/driver:/usr/local/Ascend/driver
      - /etc/ascend_install.info:/etc/ascend_install.info
      - /etc/vnpu.cfg:/etc/vnpu.cfg
      - /data/disk2/alphainf/models/Qwen/Qwen3-14B:/mnt/data/models/Qwen3-14B
      - /data/disk2/alphainf/lora:/mnt/data/auto-train/volumes/save
    environment:
      - VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
      - VLLM_LORA_RESOLVER_CACHE_DIR=/mnt/data/auto-train/volumes/save
      - OMP_NUM_THREADS=4
      - NPU_MEMORY_FRACTION=0.8
      - MINDIE_LOG_TO_STDOUT=1
      - MINDIE_LOG_TO_FILE=1
      - MINDIE_LOG_VERBOSE=1
      - VLLM_USE_V1=1
      - VLLM_PLUGINS=lora_filesystem_resolver
    ports:
      - "6214:6214"
    entrypoint: []
    command:
      - "bash"
      - "-c"
      - >-
        python3 -m vllm.entrypoints.openai.api_server
        --model=/mnt/data/models/Qwen3-14B
        --served-model-name lbot-14b
        --host 0.0.0.0
        --port 6214
        --gpu-memory-utilization 0.8
        --max-num-seqs 200
        --max-seq-len-to-capture 32768
        --max-model-len 32768
        --enable-prefix-caching
        --quantization=fp8
        --reasoning-parser qwen3
        --enable-lora
        --max-lora-rank 64
        --tensor-parallel-size 2

Can this also be written as a single docker run command?
Yes, this docker-compose.yml configuration can be converted into an equivalent docker run command. Here is the conversion (split across multiple lines for readability; in actual use it can be merged into a single line). Note the --entrypoint "" flag, which mirrors entrypoint: [] from the compose file:
docker run -d \
--name lbot-14b \
--privileged \
--ipc=host \
--shm-size=96g \
--device=/dev/davinci0 \
--device=/dev/davinci1 \
--device=/dev/davinci_manager \
--device=/dev/devmm_svm \
--device=/dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/sbin/:/usr/local/sbin \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /etc/vnpu.cfg:/etc/vnpu.cfg \
-v /data/disk2/alphainf/models/Qwen/Qwen3-14B:/mnt/data/models/Qwen3-14B \
-v /data/disk2/alphainf/lora:/mnt/data/auto-train/volumes/save \
-e VLLM_ALLOW_RUNTIME_LORA_UPDATING=True \
-e VLLM_LORA_RESOLVER_CACHE_DIR=/mnt/data/auto-train/volumes/save \
-e OMP_NUM_THREADS=4 \
-e NPU_MEMORY_FRACTION=0.8 \
-e MINDIE_LOG_TO_STDOUT=1 \
-e MINDIE_LOG_TO_FILE=1 \
-e MINDIE_LOG_VERBOSE=1 \
-e VLLM_USE_V1=1 \
-e VLLM_PLUGINS=lora_filesystem_resolver \
-p 6214:6214 \
--entrypoint "" \
quay.io/ascend/vllm-ascend:v0.11.0rc0-openeuler \
bash -c "python3 -m vllm.entrypoints.openai.api_server \
--model=/mnt/data/models/Qwen3-14B \
--served-model-name lbot-14b \
--host 0.0.0.0 \
--port 6214 \
--gpu-memory-utilization 0.8 \
--max-num-seqs 200 \
--max-seq-len-to-capture 32768 \
--max-model-len 32768 \
--enable-prefix-caching \
--quantization=fp8 \
--reasoning-parser qwen3 \
--enable-lora \
--max-lora-rank 64 \
--tensor-parallel-size 2"