This article walks through vLLM's offline-inference APIs and shows how to configure vLLM properties to optimize inference performance.
1. Basic usage
(1) Basic usage
The most basic form of batched text generation.
Code:
python
from vllm import LLM, SamplingParams
# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)


def main():
    # Create an LLM.
    llm = LLM(model="facebook/opt-125m")
    # Generate texts from the prompts.
    # The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    print("\nGenerated Outputs:\n" + "-" * 60)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}")
        print(f"Output: {generated_text!r}")
        print("-" * 60)


if __name__ == "__main__":
    main()
(2) Chat-style inference
Code:
python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, EngineArgs
from vllm.utils.argparse_utils import FlexibleArgumentParser
def create_parser():
    parser = FlexibleArgumentParser()
    # Add engine args
    EngineArgs.add_cli_args(parser)
    parser.set_defaults(model="/data/xiehao/workspace/models/Qwen/Qwen2.5-1.5B-Instruct")
    # Add sampling params
    sampling_group = parser.add_argument_group("Sampling parameters")
    sampling_group.add_argument("--max-tokens", type=int)
    sampling_group.add_argument("--temperature", type=float)
    sampling_group.add_argument("--top-p", type=float)
    sampling_group.add_argument("--top-k", type=int)
    # Add example params
    parser.add_argument("--chat-template-path", type=str)
    return parser


def main(args: dict):
    # Pop arguments not used by LLM
    max_tokens = args.pop("max_tokens")
    temperature = args.pop("temperature")
    top_p = args.pop("top_p")
    top_k = args.pop("top_k")
    chat_template_path = args.pop("chat_template_path")

    # Create an LLM
    llm = LLM(**args)

    # Create sampling params object
    sampling_params = llm.get_default_sampling_params()
    if max_tokens is not None:
        sampling_params.max_tokens = max_tokens
    if temperature is not None:
        sampling_params.temperature = temperature
    if top_p is not None:
        sampling_params.top_p = top_p
    if top_k is not None:
        sampling_params.top_k = top_k

    def print_outputs(outputs):
        print("\nGenerated Outputs:\n" + "-" * 80)
        for output in outputs:
            prompt = output.prompt
            generated_text = output.outputs[0].text
            print(f"Prompt: {prompt!r}\n")
            print(f"Generated text: {generated_text!r}")
            print("-" * 80)

    print("=" * 80)

    # In this script, we demonstrate how to pass input to the chat method:
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hello! How can I assist you today?"},
        {
            "role": "user",
            "content": "Write an essay about the importance of higher education.",
        },
    ]
    outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
    print_outputs(outputs)

    # You can run batch inference with llm.chat API
    conversations = [conversation for _ in range(10)]

    # We turn on tqdm progress bar to verify it's indeed running batch inference
    outputs = llm.chat(conversations, sampling_params, use_tqdm=True)
    print_outputs(outputs)

    # A chat template can be optionally supplied.
    # If not, the model will use its default chat template.
    if chat_template_path is not None:
        with open(chat_template_path) as f:
            chat_template = f.read()

        outputs = llm.chat(
            conversations,
            sampling_params,
            use_tqdm=False,
            chat_template=chat_template,
        )
        print_outputs(outputs)


if __name__ == "__main__":
    parser = create_parser()
    args: dict = vars(parser.parse_args())
    main(args)
(3) Classification
Preparation: download the Qwen2.5-1.5B-apeach model from ModelScope. This model is fine-tuned for (binary) text classification; other models will raise an error with this API.
Code:
python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from argparse import Namespace
from vllm import LLM, EngineArgs
from vllm.utils.argparse_utils import FlexibleArgumentParser
def parse_args():
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
    parser.set_defaults(
        model="/data/xiehao/workspace/models/Qwen/Qwen2.5-1.5B-apeach",
        runner="pooling",
        enforce_eager=True,
    )
    return parser.parse_args()


def main(args: Namespace):
    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # Create an LLM.
    # You should pass runner="pooling" for classification models
    llm = LLM(**vars(args))

    # Generate logits. The output is a list of ClassificationRequestOutputs.
    outputs = llm.classify(prompts)

    # Print the outputs.
    print("\nGenerated Outputs:\n" + "-" * 60)
    for prompt, output in zip(prompts, outputs):
        probs = output.outputs.probs
        probs_trimmed = (str(probs[:16])[:-1] + ", ...]") if len(probs) > 16 else probs
        print(
            f"Prompt: {prompt!r} \n"
            f"Class Probabilities: {probs_trimmed} (size={len(probs)})"
        )
        print("-" * 60)


if __name__ == "__main__":
    args = parse_args()
    main(args)
vLLM's default behavior is generative (decoding) inference, which is used for text-generation tasks such as chat and continuation.
runner="pooling" runs the model with the pooling runner: a single forward pass per request with no autoregressive decoding. It is intended for non-autoregressive tasks such as text classification, embedding extraction, and sentence similarity (a scoring sketch for the similarity case follows below).
(4) Embeddings
Download the intfloat/e5-small-v2 model. Each task needs a matching model type; otherwise the corresponding API is not available.
Code:
python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from argparse import Namespace
from vllm import LLM, EngineArgs
from vllm.utils.argparse_utils import FlexibleArgumentParser
def parse_args():
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
    parser.set_defaults(
        model="/data/xiehao/workspace/models/Qwen/e5-small-v2",
        runner="pooling",
        enforce_eager=True,
    )
    return parser.parse_args()


def main(args: Namespace):
    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # Create an LLM.
    # You should pass runner="pooling" for embedding models
    llm = LLM(**vars(args))

    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
    outputs = llm.embed(prompts)

    # Print the outputs.
    print("\nGenerated Outputs:\n" + "-" * 60)
    for prompt, output in zip(prompts, outputs):
        embeds = output.outputs.embedding
        embeds_trimmed = (
            (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
        )
        print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})")
        print("-" * 60)


if __name__ == "__main__":
    args = parse_args()
    main(args)
2. Automatic prefix caching
The core idea of Automatic Prefix Caching (APC): when multiple inference requests share the same prompt prefix, vLLM caches that prefix's KV cache so that subsequent requests are served significantly faster.
Use case: a batch of LLM requests shares a long common prompt prefix (say 1000 tokens) and differs only in a short unique suffix, for example repeated questions over a long context such as a document, a log file, or a database export.
Code to enable automatic prefix caching:
llm = LLM(model=<model_id>, enable_prefix_caching=True)
Once enabled, vLLM will:
- automatically build a KV cache (key-value cache) for prompt prefixes it has already processed;
- when a new request's prefix matches an existing cache entry, skip the redundant computation and reuse the cached values.
Example code:
python
import time
from vllm import LLM, SamplingParams
# A prompt containing a large markdown table. The table is randomly generated by GPT-4.
LONG_PROMPT = (
"You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n"
+ """
| ID | Name | Age | Occupation | Country | Email | Phone Number | Address |
|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
| 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL |
| 2 | Jane Smith | 34 | Doctor | Canada | jane.smith@example.com | 555-5678 | 456 Oak St, Toronto, ON |
| 3 | Alice Johnson | 27 | Teacher | UK | alice.j@example.com | 555-8765 | 789 Pine St, London, UK |
| 4 | Bob Brown | 45 | Artist | Australia | bob.b@example.com | 555-4321 | 321 Maple St, Sydney, NSW |
| 5 | Carol White | 31 | Scientist | New Zealand | carol.w@example.com | 555-6789 | 654 Birch St, Wellington, NZ |
| 6 | Dave Green | 28 | Lawyer | Ireland | dave.g@example.com | 555-3456 | 987 Cedar St, Dublin, IE |
| 7 | Emma Black | 40 | Musician | USA | emma.b@example.com | 555-1111 | 246 Ash St, New York, NY |
| 8 | Frank Blue | 37 | Chef | Canada | frank.b@example.com | 555-2222 | 135 Spruce St, Vancouver, BC |
| 9 | Grace Yellow | 50 | Engineer | UK | grace.y@example.com | 555-3333 | 864 Fir St, Manchester, UK |
| 10 | Henry Violet | 32 | Artist | Australia | henry.v@example.com | 555-4444 | 753 Willow St, Melbourne, VIC|
| 11 | Irene Orange | 26 | Scientist | New Zealand | irene.o@example.com | 555-5555 | 912 Poplar St, Auckland, NZ |
| 12 | Jack Indigo | 38 | Teacher | Ireland | jack.i@example.com | 555-6666 | 159 Elm St, Cork, IE |
| 13 | Karen Red | 41 | Lawyer | USA | karen.r@example.com | 555-7777 | 357 Cedar St, Boston, MA |
| 14 | Leo Brown | 30 | Chef | Canada | leo.b@example.com | 555-8888 | 246 Oak St, Calgary, AB |
| 15 | Mia Green | 33 | Musician | UK | mia.g@example.com | 555-9999 | 975 Pine St, Edinburgh, UK |
| 16 | Noah Yellow | 29 | Doctor | Australia | noah.y@example.com | 555-0000 | 864 Birch St, Brisbane, QLD |
| 17 | Olivia Blue | 35 | Engineer | New Zealand | olivia.b@example.com | 555-1212 | 753 Maple St, Hamilton, NZ |
| 18 | Peter Black | 42 | Artist | Ireland | peter.b@example.com | 555-3434 | 912 Fir St, Limerick, IE |
| 19 | Quinn White | 28 | Scientist | USA | quinn.w@example.com | 555-5656 | 159 Willow St, Seattle, WA |
| 20 | Rachel Red | 31 | Teacher | Canada | rachel.r@example.com | 555-7878 | 357 Poplar St, Ottawa, ON |
| 21 | Steve Green | 44 | Lawyer | UK | steve.g@example.com | 555-9090 | 753 Elm St, Birmingham, UK |
| 22 | Tina Blue | 36 | Musician | Australia | tina.b@example.com | 555-1213 | 864 Cedar St, Perth, WA |
| 23 | Umar Black | 39 | Chef | New Zealand | umar.b@example.com | 555-3435 | 975 Spruce St, Christchurch, NZ|
| 24 | Victor Yellow | 43 | Engineer | Ireland | victor.y@example.com | 555-5657 | 246 Willow St, Galway, IE |
| 25 | Wendy Orange | 27 | Artist | USA | wendy.o@example.com | 555-7879 | 135 Elm St, Denver, CO |
| 26 | Xavier Green | 34 | Scientist | Canada | xavier.g@example.com | 555-9091 | 357 Oak St, Montreal, QC |
| 27 | Yara Red | 41 | Teacher | UK | yara.r@example.com | 555-1214 | 975 Pine St, Leeds, UK |
| 28 | Zack Blue | 30 | Lawyer | Australia | zack.b@example.com | 555-3436 | 135 Birch St, Adelaide, SA |
| 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ |
| 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE |
"""
)
def get_generation_time(llm, sampling_params, prompts):
    # time the generation
    start_time = time.time()
    output = llm.generate(prompts, sampling_params=sampling_params)
    end_time = time.time()
    # print the output and generation time
    print("-" * 30)
    print(f"Output: {output[0].outputs[0].text}")
    print(f"Generation time: {end_time - start_time} seconds.")
    print("-" * 30)


def main():
    # set enable_prefix_caching=True to enable APC
    llm = LLM(
        model="/data/xiehao/workspace/models/Qwen/Qwen2.5-1.5B-Instruct",
        enable_prefix_caching=True,
    )
    sampling_params = SamplingParams(temperature=0, max_tokens=100)

    # Querying the age of John Doe
    get_generation_time(
        llm,
        sampling_params,
        LONG_PROMPT
        + "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
    )

    # Querying the age of Zack Blue
    # This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again.
    get_generation_time(
        llm,
        sampling_params,
        LONG_PROMPT
        + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
    )


if __name__ == "__main__":
    main()
3. Context extension
Use vLLM to extend a model's context length with the YaRN (Yet another RoPE extensioN) method so that it can accept longer input sequences.
This works by overriding the model's RoPE parameters; YaRN is a position-encoding extrapolation technique that mitigates the distortion of positional information in long contexts.
Code:
python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This script demonstrates how to extend the context length
of a Qwen model using the YARN method (rope_parameters)
and run a simple chat example.
Usage:
python examples/offline_inference/context_extension.py
"""
from vllm import LLM, SamplingParams
def create_llm():
    rope_theta = 1000000
    original_max_position_embeddings = 32768
    factor = 4.0

    # Use yarn to extend context
    hf_overrides = {
        "rope_parameters": {
            "rope_theta": rope_theta,
            "rope_type": "yarn",
            "factor": factor,
            "original_max_position_embeddings": original_max_position_embeddings,
        },
        "max_model_len": int(original_max_position_embeddings * factor),
    }

    llm = LLM(
        model="/data/xiehao/workspace/models/Qwen/Qwen2.5-1.5B-Instruct",
        hf_overrides=hf_overrides,
    )
    return llm


def run_llm_chat(llm):
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        max_tokens=128,
    )

    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hello! How can I assist you today?"},
    ]
    outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
    return outputs


def print_outputs(outputs):
    print("\nGenerated Outputs:\n" + "-" * 80)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\n")
        print(f"Generated text: {generated_text!r}")
        print("-" * 80)


def main():
    llm = create_llm()
    outputs = run_llm_chat(llm)
    print_outputs(outputs)


if __name__ == "__main__":
    main()
hf_overrides overrides the corresponding fields of the model's config so that vLLM applies these settings when loading the model.
4. Encoder-decoder multimodal inference
The example below uses vLLM to run offline speech-to-text (ASR) with Whisper, a multimodal model with an encoder-decoder architecture.
python
import os
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
def main():
    audio = AudioAsset("mary_had_lamb").audio_and_sample_rate

    prompts = [{
        "encoder_prompt": {
            "prompt": "",
            "multi_modal_data": {"audio": audio},
        },
        "decoder_prompt": "<|startoftranscript|><|en|><|transcribe|>",
    }]

    llm = LLM(
        model="/data/xiehao/workspace/models/Qwen/whisper-large-v3-turbo",
        max_model_len=448,
        limit_mm_per_prompt={"audio": 1},
        dtype="half",
    )

    sampling_params = SamplingParams(
        temperature=0,
        max_tokens=64,
        skip_special_tokens=False,
    )

    outputs = llm.generate(prompts, sampling_params)
    for out in outputs:
        print("Transcription:", repr(out.outputs[0].text))


if __name__ == "__main__":
    main()
- Whisper is OpenAI's speech-recognition model with an encoder-decoder architecture: the encoder consumes the audio input (multimodal data) and the decoder generates text conditioned on the prompt.
- VLLM_WORKER_MULTIPROC_METHOD=spawn is set for GPU multiprocessing safety.
- The built-in test audio clip ("Mary Had a Lamb") is loaded via AudioAsset.
- Explicit encoder/decoder prompts are used to control the model's behavior precisely.
- The language token (<|en|>) and task token (<|transcribe|>) select English transcription (a translation variant is sketched after this list).
- Half precision (dtype="half") speeds up inference and reduces GPU memory usage.
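As a hedged extension of the example above (not part of the original script): Whisper also defines a <|translate|> task token that outputs an English translation of non-English speech. The snippet below reuses the llm, audio, and sampling_params objects from the script above and only swaps the decoder prompt; it assumes you substitute a French-language audio clip for the English test asset, so the <|fr|> source-language token here is illustrative.
python
# Sketch: translation instead of transcription (assumes French source audio).
# The language token names the *source* language; <|translate|> asks Whisper
# to produce English text regardless of the spoken language.
translate_prompts = [{
    "encoder_prompt": {
        "prompt": "",
        "multi_modal_data": {"audio": audio},  # assumed to be French speech
    },
    "decoder_prompt": "<|startoftranscript|><|fr|><|translate|>",
}]

outputs = llm.generate(translate_prompts, sampling_params)
for out in outputs:
    print("Translation:", repr(out.outputs[0].text))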
5. Basic LLMEngine usage
LLMEngine is the low-level API: you add requests and step the engine manually.
LLM.generate is the high-level API: a single call runs the whole inference (a multi-request sketch follows the example below).
Example code:
python
from vllm import EngineArgs, LLMEngine, SamplingParams
# Engine configuration (the low-level counterpart of the LLM(...) constructor arguments).
engine_args = EngineArgs(
    model="/data/xiehao/workspace/models/Qwen/Qwen2.5-1.5B-Instruct",
    dtype="half",
    max_model_len=2048,
    gpu_memory_utilization=0.9,
)
engine = LLMEngine.from_engine_args(engine_args)

prompt = "你是谁?"
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.95,
    max_tokens=64,
)

# Requests are added with an explicit request id and scheduled by the engine.
engine.add_request("1", prompt, params=sampling_params)

print("Generating...")
# Drive the engine manually: each step() call runs one scheduling/decoding iteration.
while engine.has_unfinished_requests():
    request_outputs = engine.step()
    for output in request_outputs:
        if output.finished:
            print("\nFinal output:")
            print(output.outputs[0].text)
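To make the manual step() loop more concrete, here is a hedged sketch (not part of the original) that queues several requests at once and collects each result as it finishes; it reuses the same engine configuration as above, and the request ids and prompts are made up for illustration.
python
# Sketch: driving several requests through one LLMEngine.
# Assumption: same model path and engine settings as the example above.
from vllm import EngineArgs, LLMEngine, SamplingParams

engine_args = EngineArgs(
    model="/data/xiehao/workspace/models/Qwen/Qwen2.5-1.5B-Instruct",
    dtype="half",
    max_model_len=2048,
)
engine = LLMEngine.from_engine_args(engine_args)

params = SamplingParams(temperature=0.7, top_p=0.95, max_tokens=64)
prompts = {
    "req-0": "Introduce yourself in one sentence.",
    "req-1": "What is the capital of France?",
    "req-2": "Write a haiku about the sea.",
}
for request_id, prompt in prompts.items():
    engine.add_request(request_id, prompt, params=params)

# All queued requests are batched together; finished ones are reported per step.
finished = {}
while engine.has_unfinished_requests():
    for output in engine.step():
        if output.finished:
            finished[output.request_id] = output.outputs[0].text

for request_id, text in finished.items():
    print(f"[{request_id}] {text!r}")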