vLLM 自动化压测脚本

脚本生成的汇总结果文件 benchmark_summary.txt(CSV 格式,每次压测追加一行)如下所示:

```csv
input_len,output_len,num_prompts,concurrency,output_throughput,total_throughput,duration_sec,mean_ttft_ms,mean_tpot_ms,mean_itl_ms
2048,2048,4,1,523.4,1046.8,12.3,21.5,11.2,9.7
32768,2048,16,4,487.1,982.6,35.6,48.3,13.9,10.4
```

每次压测的日志保存在 logs/ 目录下,结果 JSON 文件(文件名由 --result-filename 指定)保存在 results/ 目录下。自动化压测脚本如下:

bash 复制代码
#!/usr/bin/env bash
#
# Automated vLLM serving benchmark sweep.
#
# Runs `vllm bench serve` once for every combination of input length,
# output length and max concurrency configured below, keeps one log file and
# one result JSON per run, and appends one CSV row per run to the summary file.
#
# Requires: vllm, jq

set -euo pipefail

# Fail fast when a required tool is missing. Without this check a missing
# `jq` would only be noticed after the first (potentially very long)
# benchmark run has already completed.
for required_cmd in vllm jq; do
  if ! command -v "${required_cmd}" >/dev/null 2>&1; then
    printf 'ERROR: required command not found: %s\n' "${required_cmd}" >&2
    exit 1
  fi
done

###############################
# ===== Configuration =====
###############################
# Values forwarded verbatim to the matching `vllm bench serve` options.
readonly BACKEND="vllm"
readonly BASE_URL="http://127.0.0.1:30734"
readonly ENDPOINT="/v1/completions"
readonly DATASET_NAME="random"
readonly MODEL="qwen3.5"
readonly TOKENIZER="/model/Qwen3.5-27B"
readonly SEED="12345"

# Sweep dimensions: every combination of the three lists is benchmarked.
readonly -a RANDOM_INPUT_LENS=(2048 32768 65536 131072)
readonly -a RANDOM_OUTPUT_LENS=(2048)
readonly -a MAX_CONCURRENCY_LIST=(1 4 8 16)

# Output locations (relative to the current working directory).
readonly LOG_DIR="logs"
readonly RESULT_DIR="results"
readonly SUMMARY_TXT="benchmark_summary.txt"

mkdir -p -- "${LOG_DIR}" "${RESULT_DIR}"

# Write the CSV header only when the summary is missing or empty, so that
# rows from earlier runs are preserved and later rows are appended below them.
if [[ ! -s "${SUMMARY_TXT}" ]]; then
  printf '%s\n' \
    "input_len,output_len,num_prompts,concurrency,output_throughput,total_throughput,duration_sec,mean_ttft_ms,mean_tpot_ms,mean_itl_ms" \
    > "${SUMMARY_TXT}"
fi

###############################
# ===== Helper functions =====
###############################

#######################################
# Print one metric from a result JSON file.
# Arguments:
#   $1 - path to the JSON file
#   $2 - key to look up; dots separate nested object keys (e.g. "a.b")
# Outputs:
#   The raw value on stdout, or "0" when the key is absent, null or false.
# Returns:
#   jq's exit status (non-zero if the file is unreadable or not valid JSON).
#######################################
parse_metric() {
  local json_file="$1"
  local key="$2"
  # Pass the key as data via --arg instead of splicing it into the jq
  # program: keys that are not plain identifiers (e.g. "c-d") would otherwise
  # be mis-parsed, and an arbitrary string could inject filter code.
  jq -r --arg key "${key}" 'getpath($key | split(".")) // "0"' "${json_file}"
}

###############################
# ===== Main loop =====
###############################
# Benchmark every (input_len, output_len, max_concurrency) combination.
# A failing run is reported and counted but does not stop the sweep; the
# script exits non-zero at the end if any benchmark command failed.
failed_runs=0

for input_len in "${RANDOM_INPUT_LENS[@]}"; do
  for output_len in "${RANDOM_OUTPUT_LENS[@]}"; do
    for max_concurrency in "${MAX_CONCURRENCY_LIST[@]}"; do

      # Four prompts per concurrent slot.
      num_prompts=$((max_concurrency * 4))

      base_name="input=${input_len},output=${output_len},num_prompts=${num_prompts},concurrency=${max_concurrency}"
      log_path="${LOG_DIR}/${base_name}.log"
      result_json="${RESULT_DIR}/${base_name}.json"

      echo "=========================================="
      echo "RUNNING:"
      echo "  input_len=${input_len}"
      echo "  output_len=${output_len}"
      echo "  max_concurrency=${max_concurrency}"
      echo "  num_prompts=${num_prompts}"
      echo "  log=${log_path}"
      echo "  result=${result_json}"
      echo "=========================================="

      # Capture the exit status instead of letting `set -e` abort the whole
      # sweep: a single failing configuration must not discard the
      # remaining runs.
      bench_status=0
      vllm bench serve \
        --backend "${BACKEND}" \
        --base-url "${BASE_URL}" \
        --endpoint "${ENDPOINT}" \
        --dataset-name "${DATASET_NAME}" \
        --model "${MODEL}" \
        --tokenizer "${TOKENIZER}" \
        --seed "${SEED}" \
        --random-input-len "${input_len}" \
        --random-output-len "${output_len}" \
        --num-prompts "${num_prompts}" \
        --max-concurrency "${max_concurrency}" \
        --save-result \
        --result-dir "${RESULT_DIR}" \
        --result-filename "$(basename "${result_json}")" \
        > "${log_path}" 2>&1 || bench_status=$?

      if ((bench_status != 0)); then
        echo "WARNING: benchmark failed (exit ${bench_status}), see log: ${log_path}" >&2
        failed_runs=$((failed_runs + 1))
        # Skip parsing so a result file left over from an earlier run is
        # never reported as this run's output.
        continue
      fi

      echo "DONE: ${log_path}"
      echo

      # =========================
      # Parse metrics
      # =========================
      if [[ -f "${result_json}" ]]; then
        output_throughput=$(parse_metric "${result_json}" "output_throughput")
        total_throughput=$(parse_metric "${result_json}" "total_token_throughput")
        duration=$(parse_metric "${result_json}" "duration")
        mean_ttft=$(parse_metric "${result_json}" "mean_ttft_ms")
        mean_tpot=$(parse_metric "${result_json}" "mean_tpot_ms")
        mean_itl=$(parse_metric "${result_json}" "mean_itl_ms")

        # Format with one decimal place. LC_ALL=C pins "." as the decimal
        # separator: under a comma-decimal locale printf would reject the
        # values printed by jq and emit commas that corrupt the CSV.
        # The row is built in a variable first so that a non-numeric metric
        # never leaves a bogus row in the summary.
        if LC_ALL=C printf -v summary_row "%d,%d,%d,%d,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f" \
          "${input_len}" \
          "${output_len}" \
          "${num_prompts}" \
          "${max_concurrency}" \
          "${output_throughput}" \
          "${total_throughput}" \
          "${duration}" \
          "${mean_ttft}" \
          "${mean_tpot}" \
          "${mean_itl}"; then
          printf '%s\n' "${summary_row}" >> "${SUMMARY_TXT}"
        else
          echo "WARNING: Non-numeric metric in ${result_json}; summary row skipped" >&2
        fi
      else
        echo "WARNING: Result JSON not found: ${result_json}" >&2
      fi

    done
  done
done

echo "=========================================="
echo "All benchmarks finished."
echo "Logs are stored in: ${LOG_DIR}"
echo "Results are stored in: ${RESULT_DIR}"
echo "Summary TXT: ${SUMMARY_TXT}"
echo "=========================================="

# Surface benchmark failures to the caller through the exit status.
if ((failed_runs > 0)); then
  echo "WARNING: ${failed_runs} benchmark run(s) failed; see logs in ${LOG_DIR}" >&2
  exit 1
fi
相关推荐
AI小百科3 天前
llama.cpp vs vLLM:深度解析与选型指南
llama·vllm
IRevers3 天前
【大模型】Gemma4在ROCm和vLLM部署
人工智能·pytorch·深度学习·大模型·datawhale·vllm·amdev
下班走回家4 天前
本地部署大模型的三种方式:Ollama vs vLLM vs llama.cpp
人工智能·llama·vllm
花间相见4 天前
【大模型部署01】—— vLLM 部署大模型服务实操:从 0 到 1 搭建 OpenAI 兼容 API
vllm
有来有去95274 天前
【训推框架】Vime-大规模 LLM/VLM 强化学习训练框架
人工智能·深度学习·语言模型·gpu算力·vllm
安如衫6 天前
【Hello-ROCm】vLLM 跑通 Gemma4-E4B
datawhale·vllm·amdev
毒爪的小新6 天前
Linux 环境极速部署 vLLM:从零搭建生产级大模型推理服务
linux·人工智能·ai·语言模型·vllm
像风一样自由20206 天前
17.推理框架横评:vLLM / TGI / TensorRT-LLM / SGLang 全面对比
人工智能·大模型·vllm·sglang
rebibabo6 天前
KV Cache 与 PagedAttention 详解:理论推导 + RTX 3090 实测数据
人工智能·vllm·推理加速·大模型部署·kvcache