/v1/chat/completions call flow
```text
HTTP POST /v1/chat/completions
        │
        ▼
OpenAIServingChat.create_chat_completion
  → self.engine_client.generate(...)
        │
        ▼
AsyncLLM.generate
  → add_request → _add_request
        │
        ▼
DPAsyncMPClient.add_request_async
  ├─ set request.current_wave / request.client_index
  ├─ chosen_engine = get_core_engine_for_request(request)
  ├─ [conditionally] send "FIRST_REQ" to notify the coordinator
  └─ await self._send_input(ADD, request, chosen_engine)
        │
        ▼
AsyncMPClient._send_input
  └─ message = (ADD.value, *encoder.encode(request))
     └─ return _send_input_message(message, engine, request)
        │
        ▼
AsyncMPClient._send_input_message
  ├─ msg = (engine,) + message
  ├─ if there are no auxiliary buffers (no large objects):
  │     return input_socket.send_multipart(msg, copy=False)
  └─ else:
        future = input_socket.send_multipart(msg, copy=False, track=True)
        future.add_done_callback(add_pending)   # keep `objects` alive until sent
        return future
        │
        ▼
ZMQ (ZeroMQ) delivers the message to the EngineCore process
  → message format: (engine_id, "ADD", msgpack(EngineCoreRequest))
```
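Before going into the server-side code, here is the kind of request that enters this flow. This is a minimal sketch assuming a vLLM OpenAI-compatible server is already listening on http://localhost:8000 and serving a model registered as `my-model`; both values are placeholders.

```python
# Minimal client-side sketch of the request that triggers the flow above.
# Assumptions: a vLLM server on localhost:8000 serving a model named "my-model".
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Non-streaming: the server goes through chat_completion_full_generator.
resp = client.chat.completions.create(
    model="my-model",
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=32,
)
print(resp.choices[0].message.content)

# Streaming: the server returns an SSE StreamingResponse produced by
# chat_completion_stream_generator instead.
for chunk in client.chat.completions.create(
    model="my-model",
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=32,
    stream=True,
):
    print(chunk.choices[0].delta.content or "", end="", flush=True)
```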
API registration
```python
# https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/chat_completion/api_router.py#L32
def chat(request: Request) -> OpenAIServingChat | None:
    return request.app.state.openai_serving_chat


@router.post(
    "/v1/chat/completions",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
        HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
    metrics_header_format = raw_request.headers.get(
        ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, ""
    )
    handler = chat(raw_request)
    if handler is None:
        raise NotImplementedError("The model does not support Chat Completions API")

    generator = await handler.create_chat_completion(request, raw_request)

    if isinstance(generator, ErrorResponse):
        return JSONResponse(
            content=generator.model_dump(), status_code=generator.error.code
        )
    elif isinstance(generator, ChatCompletionResponse):
        return JSONResponse(
            content=generator.model_dump(),
            headers=metrics_header(metrics_header_format),
        )

    return StreamingResponse(content=generator, media_type="text/event-stream")
```
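The route handler builds nothing per request: it simply pulls a pre-constructed OpenAIServingChat off `app.state` (populated by `init_app_state`, shown later in this article) and rejects the call if that handler was never created. The toy below isolates this wiring pattern only; the FastAPI app, the `ChatHandler` class and the `/demo/chat` route are made up for illustration and are not vLLM code.

```python
# Stand-alone illustration of the app.state wiring pattern used above:
# build the heavyweight handler once at startup, fetch it per request.
from contextlib import asynccontextmanager

from fastapi import FastAPI, HTTPException, Request


class ChatHandler:
    """Placeholder for an expensive object constructed once at startup."""

    async def answer(self, prompt: str) -> str:
        return f"echo: {prompt}"


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Comparable role to init_app_state(): attach long-lived handlers to app.state.
    app.state.chat_handler = ChatHandler()
    yield


app = FastAPI(lifespan=lifespan)


@app.post("/demo/chat")
async def demo_chat(raw_request: Request, body: dict):
    handler = getattr(raw_request.app.state, "chat_handler", None)
    if handler is None:
        # vLLM raises NotImplementedError here; a plain HTTP error works too.
        raise HTTPException(status_code=501, detail="chat is not enabled")
    return {"text": await handler.answer(body.get("prompt", ""))}
```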
From handler.create_chat_completion to engine_client.generate
```python
# https://github.com/vllm-project/vllm/blob/v0.20.1/vllm/entrypoints/openai/chat_completion/serving.py#L86
class OpenAIServingChat(OpenAIServing):
    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        response_role: str,
        *,
        openai_serving_render: "OpenAIServingRender",
        request_logger: RequestLogger | None,
        chat_template: str | None,
        chat_template_content_format: ChatTemplateContentFormatOption,
        trust_request_chat_template: bool = False,
        return_tokens_as_token_ids: bool = False,
        reasoning_parser: str = "",
        enable_auto_tools: bool = False,
        exclude_tools_when_tool_choice_none: bool = False,
        tool_parser: str | None = None,
        enable_prompt_tokens_details: bool = False,
        enable_force_include_usage: bool = False,
        enable_log_outputs: bool = False,
        enable_log_deltas: bool = True,
        default_chat_template_kwargs: dict[str, Any] | None = None,
    ) -> None:
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
        )
        self.openai_serving_render = openai_serving_render
        self.response_role = response_role
        self.chat_template = chat_template
        self.chat_template_content_format: Final = chat_template_content_format
        self.trust_request_chat_template = trust_request_chat_template
        self.default_chat_template_kwargs = default_chat_template_kwargs or {}
        self.enable_log_outputs = enable_log_outputs
        self.enable_log_deltas = enable_log_deltas

    async def create_chat_completion(
        self,
        request: ChatCompletionRequest,
        raw_request: Request | None = None,
    ) -> AsyncGenerator[str, None] | ChatCompletionResponse | ErrorResponse:
        """
        Chat Completion API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/chat/create
        for the API specification. This API mimics the OpenAI
        Chat Completion API.
        """
        # Streaming response
        tokenizer = self.renderer.tokenizer
        assert tokenizer is not None
        chat_template_kwargs = self._effective_chat_template_kwargs(request)

        reasoning_parser: ReasoningParser | None = None
        if self.reasoning_parser_cls:
            reasoning_parser = self.reasoning_parser_cls(
                tokenizer,
                chat_template_kwargs=chat_template_kwargs,  # type: ignore[call-arg]
            )

        result = await self.render_chat_request(request)
        if isinstance(result, ErrorResponse):
            return result
        conversation, engine_inputs = result

        request_id = (
            f"chatcmpl-{self._base_request_id(raw_request, request.request_id)}"
        )

        request_metadata = RequestResponseMetadata(request_id=request_id)
        if raw_request:
            raw_request.state.request_metadata = request_metadata

        lora_request = self._maybe_get_adapters(request, supports_default_mm_loras=True)
        model_name = self.models.model_name(lora_request)

        # Extract data_parallel_rank from header (router can inject it)
        data_parallel_rank = self._get_data_parallel_rank(raw_request)

        # Schedule the request and get the result generator.
        max_model_len = self.model_config.max_model_len
        generators: list[AsyncGenerator[RequestOutput, None]] = []

        for i, engine_input in enumerate(engine_inputs):
            prompt_token_ids = self._extract_prompt_components(engine_input).token_ids
            # If we are creating sub requests for multiple prompts, ensure that they
            # have unique request ids.
            sub_request_id = (
                request_id if len(engine_inputs) == 1 else f"{request_id}_{i}"
            )
            max_tokens = get_max_tokens(
                max_model_len,
                request.max_completion_tokens
                if request.max_completion_tokens is not None
                else request.max_tokens,
                self._extract_prompt_len(engine_input),
                self.default_sampling_params,
                self.override_max_tokens,
            )

            sampling_params: SamplingParams | BeamSearchParams
            if request.use_beam_search:
                sampling_params = request.to_beam_search_params(
                    max_tokens, self.default_sampling_params
                )
            else:
                sampling_params = request.to_sampling_params(
                    max_tokens,
                    self.default_sampling_params,
                )

            self._log_inputs(
                sub_request_id,
                engine_input,
                params=sampling_params,
                lora_request=lora_request,
            )

            trace_headers = (
                None
                if raw_request is None
                else await self._get_trace_headers(raw_request.headers)
            )

            if isinstance(sampling_params, BeamSearchParams):
                generator = self.beam_search(
                    prompt=engine_input,
                    request_id=sub_request_id,
                    params=sampling_params,
                    lora_request=lora_request,
                    trace_headers=trace_headers,
                )
            else:
                if not request.include_reasoning:
                    reasoning_ended = True
                elif request._grammar_from_tool_parser:
                    # The Mistral grammar already includes an optional
                    # `think?` rule that handles both reasoning and
                    # non-reasoning outputs.
                    reasoning_ended = True
                elif reasoning_parser:
                    reasoning_ended = reasoning_parser.is_reasoning_end(
                        prompt_token_ids or []
                    )
                else:
                    reasoning_ended = None

                generator = self.engine_client.generate(
                    engine_input,
                    sampling_params,
                    sub_request_id,
                    lora_request=lora_request,
                    trace_headers=trace_headers,
                    priority=request.priority,
                    data_parallel_rank=data_parallel_rank,
                    reasoning_ended=reasoning_ended,
                    reasoning_parser_kwargs={
                        "chat_template_kwargs": chat_template_kwargs,
                    }
                    if reasoning_parser
                    else None,
                )

            generators.append(generator)

        assert len(generators) == 1
        (result_generator,) = generators

        if request.stream:
            return self.chat_completion_stream_generator(
                request,
                result_generator,
                request_id,
                model_name,
                conversation,
                tokenizer,
                request_metadata,
                reasoning_parser,
                chat_template_kwargs=chat_template_kwargs,
            )

        return await self.chat_completion_full_generator(
            request,
            result_generator,
            request_id,
            model_name,
            conversation,
            tokenizer,
            request_metadata,
            reasoning_parser,
        )
```
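The `get_max_tokens(...)` call above caps the completion length against what remains of the context window after the prompt. The snippet below is a simplified sketch of that clamping idea only; it ignores `default_sampling_params` and `override_max_tokens` and is not the actual vLLM implementation.

```python
# Simplified sketch of the max-token clamp: a completion can never exceed
# what is left of the context window after the prompt.
def clamp_max_tokens(
    max_model_len: int,
    requested_max_tokens: int | None,
    prompt_len: int,
) -> int:
    remaining = max(max_model_len - prompt_len, 0)
    if requested_max_tokens is None:
        return remaining
    return min(requested_max_tokens, remaining)


assert clamp_max_tokens(4096, 256, 1000) == 256    # request fits, keep it
assert clamp_max_tokens(4096, None, 1000) == 3096  # no request -> whole remainder
assert clamp_max_tokens(4096, 8192, 4000) == 96    # request too large -> clamp
```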
DPAsyncMPClient::add_request_async
```python
# https://github.com/vllm-project/vllm/blob/v0.20.1/vllm/v1/engine/core_client.py#L1296
class DPAsyncMPClient(AsyncMPClient):
    async def add_request_async(self, request: EngineCoreRequest) -> None:
        self._ensure_stats_update_task()

        request.current_wave = self.current_wave
        request.client_index = self.client_index

        chosen_engine = self.get_core_engine_for_request(request)
        to_await = self._send_input(EngineCoreRequestType.ADD, request, chosen_engine)
        if not self.engines_running:
            # Notify coordinator that we're sending a request
            req_msg = msgspec.msgpack.encode(("FIRST_REQ", chosen_engine))
            await self.first_req_send_socket.send(req_msg)

        await to_await

        self._ensure_output_queue_task()


# https://github.com/vllm-project/vllm/blob/v0.20.1/vllm/v1/engine/core_client.py#L1001
class AsyncMPClient(MPClient):
    def _send_input(
        self,
        request_type: EngineCoreRequestType,
        request: Any,
        engine: EngineIdentity | None = None,
    ) -> Awaitable[Any]:
        if engine is None:
            engine = self.core_engine

        message = (request_type.value, *self.encoder.encode(request))
        return self._send_input_message(message, engine, request)

    def _send_input_message(
        self, message: tuple[bytestr, ...], engine: EngineIdentity, objects: Any
    ) -> Awaitable[Any]:
        """
        objects is a reference to retain until zmq is finished with the
        buffers, in case they were extracted from tensors in the request.
        """
        self.ensure_alive()
        self.free_pending_messages()

        msg = (engine,) + message
        if not objects or len(msg) <= 3:
            # No auxiliary buffers => no tensor backing buffers in request.
            return self.input_socket.send_multipart(msg, copy=False)

        future: asyncio.Future[zmq.MessageTracker]
        future = self.input_socket.send_multipart(msg, copy=False, track=True)

        def add_pending(f: asyncio.Future[zmq.MessageTracker]):
            with contextlib.suppress(BaseException):
                self.add_pending_message(f.result(), objects)

        future.add_done_callback(add_pending)
        return future
```
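What ultimately leaves the API server process is a ZeroMQ multipart message: an engine identity frame, a request-type frame ("ADD"), and the msgspec/msgpack-encoded request (plus optional tensor buffers, which is what the `track=True` / `add_pending` path keeps alive until ZeroMQ is done with them). The self-contained toy below mimics only that framing over an in-process DEALER/ROUTER pair; `EchoRequest`, the socket roles and the `inproc://` address are invented for the demo and do not match vLLM's actual socket topology.

```python
# Toy reproduction of the send path: a request-type frame plus a
# msgpack-encoded payload, sent as one zmq multipart message with copy=False.
import asyncio

import msgspec
import zmq
import zmq.asyncio


class EchoRequest(msgspec.Struct):
    request_id: str
    prompt_token_ids: list[int]


async def main() -> None:
    ctx = zmq.asyncio.Context()
    router = ctx.socket(zmq.ROUTER)          # stand-in for the engine side
    router.bind("inproc://engine_input")
    dealer = ctx.socket(zmq.DEALER)          # stand-in for the API-server client
    dealer.setsockopt(zmq.IDENTITY, b"client0")
    dealer.connect("inproc://engine_input")

    req = EchoRequest(request_id="chatcmpl-123", prompt_token_ids=[1, 2, 3])
    payload = msgspec.msgpack.encode(req)

    # (request_type, payload) -- here the ROUTER socket prepends the sender
    # identity itself; vLLM prepends the target engine identity explicitly.
    await dealer.send_multipart((b"ADD", payload), copy=False)

    identity, request_type, raw = await router.recv_multipart()
    decoded = msgspec.msgpack.decode(raw, type=EchoRequest)
    print(identity, request_type, decoded)

    dealer.close()
    router.close()
    ctx.term()


asyncio.run(main())
```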
How OpenAIServingChat is created
Creation flow
```text
run_multi_api_server(args)
│
├─ 1. launch_core_engines()            # start the backend core engines
│
└─ 2. APIServerProcessManager creates N API server child processes
   │   (per-process client_config pattern sketched right after this diagram)
   │
   └─ each child process runs:
      run_api_server_worker_proc(listen_address, sock, args, client_config)
      │
      └─ uvloop.run( run_server_worker(...) )
         │
         └─ run_server_worker()
            │
            ├─ build_async_engine_client(client_config) → engine_client
            │
            └─ build_and_serve(engine_client, listen_address, sock, args)
               │
               ├─ build_app() → create the FastAPI app
               │
               ├─ init_app_state(engine_client, app.state, args, supported_tasks)
               │  │
               │  ├─ create an OpenAIServingRender instance → app.state.openai_serving_render
               │  │
               │  └─ if "generate" in supported_tasks:
               │     └─ create an OpenAIServingChat instance → app.state.openai_serving_chat
               │
               └─ serve_http(app, ...)            # start the HTTP server
```
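The real vLLM code for this flow follows in the next section; the toy below only isolates the pattern from step 2, where every API server child process gets its own `client_config` (input/output address plus `client_index`). `worker()` and the address strings are placeholders for `run_api_server_worker_proc` and the addresses produced by `launch_core_engines()`.

```python
# Minimal sketch of the "one client_config per API server process" pattern.
import multiprocessing as mp


def worker(client_config: dict) -> None:
    print(f"worker {client_config['client_index']} talking to "
          f"{client_config['input_address']}")


def start_api_servers(num_servers: int, input_addresses: list[str],
                      output_addresses: list[str]) -> list[mp.Process]:
    spawn_ctx = mp.get_context("spawn")
    procs = []
    for i, (in_addr, out_addr) in enumerate(zip(input_addresses, output_addresses)):
        client_config = {
            "input_address": in_addr,
            "output_address": out_addr,
            "client_count": num_servers,
            "client_index": i,
        }
        proc = spawn_ctx.Process(target=worker, name=f"ApiServer_{i}",
                                 args=(client_config,))
        procs.append(proc)
        proc.start()
    return procs


if __name__ == "__main__":
    procs = start_api_servers(2, ["ipc:///tmp/in0", "ipc:///tmp/in1"],
                              ["ipc:///tmp/out0", "ipc:///tmp/out1"])
    for p in procs:
        p.join()
```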
Code path
```python
# https://github.com/vllm-project/vllm/blob/v0.20.1/vllm/entrypoints/cli/serve.py#L231
def run_multi_api_server(args: argparse.Namespace):
    with launch_core_engines(
        vllm_config, executor_class, log_stats, addresses, num_api_servers
    ) as (local_engine_manager, coordinator, addresses, tensor_queue):
        # Construct common args for the APIServerProcessManager up-front.
        stats_update_address = None
        if coordinator:
            stats_update_address = coordinator.get_stats_publish_address()

        # Start API servers.
        api_server_manager = APIServerProcessManager(
            listen_address=listen_address,
            sock=sock,
            args=args,
            num_servers=num_api_servers,
            input_addresses=addresses.inputs,
            output_addresses=addresses.outputs,
            stats_update_address=stats_update_address,
            tensor_queue=tensor_queue,
        )


# https://github.com/vllm-project/vllm/blob/v0.20.1/vllm/v1/utils.py#L162
class APIServerProcessManager:
    """Manages a group of API server processes.

    Handles creation, monitoring, and termination of API server worker
    processes. Also monitors extra processes to check if they are healthy.
    """

    def __init__(
        self,
        listen_address: str,
        sock: Any,
        args: argparse.Namespace,
        num_servers: int,
        input_addresses: list[str],
        output_addresses: list[str],
        target_server_fn: Callable | None = None,
        stats_update_address: str | None = None,
        tensor_queue: Queue | None = None,
    ):
        for i, in_addr, out_addr in zip(
            range(num_servers), input_addresses, output_addresses
        ):
            client_config = {
                "input_address": in_addr,
                "output_address": out_addr,
                "client_count": num_servers,
                "client_index": i,
            }
            if stats_update_address is not None:
                client_config["stats_update_address"] = stats_update_address
            if tensor_queue is not None:
                client_config["tensor_queue"] = tensor_queue

            proc = spawn_context.Process(
                target=target_server_fn or run_api_server_worker_proc,
                name=f"ApiServer_{i}",
                args=(listen_address, sock, args, client_config),
            )
            self.processes.append(proc)
            proc.start()


def run_api_server_worker_proc(
    listen_address, sock, args, client_config=None, **uvicorn_kwargs
) -> None:
    """Entrypoint for individual API server worker processes."""
    from vllm.entrypoints.openai.api_server import run_server_worker

    client_config = client_config or {}
    server_index = client_config.get("client_index", 0)

    # Set process title and add process-specific prefix to stdout and stderr.
    set_process_title("APIServer", str(server_index))
    decorate_logs()

    uvloop.run(
        run_server_worker(listen_address, sock, args, client_config, **uvicorn_kwargs)
    )


# https://github.com/vllm-project/vllm/blob/v0.20.1/vllm/entrypoints/openai/api_server.py#L681
async def run_server_worker(
    listen_address, sock, args, client_config=None, **uvicorn_kwargs
) -> None:
    """Run a single API server worker."""
    if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
        ToolParserManager.import_tool_parser(args.tool_parser_plugin)
    if args.reasoning_parser_plugin and len(args.reasoning_parser_plugin) > 3:
        ReasoningParserManager.import_reasoning_parser(args.reasoning_parser_plugin)

    async with build_async_engine_client(
        args,
        client_config=client_config,
    ) as engine_client:
        shutdown_task = await build_and_serve(
            engine_client, listen_address, sock, args, **uvicorn_kwargs
        )


# https://github.com/vllm-project/vllm/blob/v0.20.1/vllm/entrypoints/openai/api_server.py#L578
async def build_and_serve(
    engine_client: EngineClient,
    listen_address: str,
    sock: socket.socket,
    args: Namespace,
    **uvicorn_kwargs,
) -> asyncio.Task:
    app = build_app(args, supported_tasks, model_config)
    await init_app_state(engine_client, app.state, args, supported_tasks)

    logger.info("Starting vLLM server on %s", listen_address)
    return await serve_http(
        app,
        sock=sock,
        enable_ssl_refresh=args.enable_ssl_refresh,
        host=args.host,
        port=args.port,
        log_level=args.uvicorn_log_level,
        # NOTE: When the 'disable_uvicorn_access_log' value is True,
        # no access log will be output.
        access_log=not args.disable_uvicorn_access_log,
        timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
        ssl_keyfile=args.ssl_keyfile,
        ssl_certfile=args.ssl_certfile,
        ssl_ca_certs=args.ssl_ca_certs,
        ssl_cert_reqs=args.ssl_cert_reqs,
        ssl_ciphers=args.ssl_ciphers,
        h11_max_incomplete_event_size=args.h11_max_incomplete_event_size,
        h11_max_header_count=args.h11_max_header_count,
        **uvicorn_kwargs,
    )


# https://github.com/vllm-project/vllm/blob/d4b00484040c9a0bbd4ee2d55983df7a50ab1fd3/vllm/entrypoints/openai/api_server.py#L317
async def init_app_state(
    engine_client: EngineClient,
    state: State,
    args: Namespace,
    supported_tasks: tuple["SupportedTask", ...] | None = None,
) -> None:
    state.openai_serving_render = OpenAIServingRender(
        model_config=engine_client.model_config,
        renderer=engine_client.renderer,
        model_registry=state.openai_serving_models.registry,
        request_logger=request_logger,
        chat_template=resolved_chat_template,
        chat_template_content_format=args.chat_template_content_format,
        trust_request_chat_template=args.trust_request_chat_template,
        enable_auto_tools=args.enable_auto_tool_choice,
        exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
        tool_parser=args.tool_call_parser,
        reasoning_parser=args.structured_outputs_config.reasoning_parser,
        default_chat_template_kwargs=args.default_chat_template_kwargs,
        log_error_stack=args.log_error_stack,
    )
    state.openai_serving_tokenization = OpenAIServingTokenization(
        engine_client,
        state.openai_serving_models,
        state.openai_serving_render,
        request_logger=request_logger,
        chat_template=resolved_chat_template,
        chat_template_content_format=args.chat_template_content_format,
        default_chat_template_kwargs=args.default_chat_template_kwargs,
        trust_request_chat_template=args.trust_request_chat_template,
    )
    if "generate" in supported_tasks:
        from vllm.entrypoints.openai.generate.api_router import init_generate_state

        await init_generate_state(
            engine_client, state, args, request_logger, supported_tasks
        )


# https://github.com/vllm-project/vllm/blob/v0.20.1/vllm/entrypoints/openai/generate/api_router.py#L45
async def init_generate_state(
    engine_client: "EngineClient",
    state: "State",
    args: "Namespace",
    request_logger: RequestLogger | None,
    supported_tasks: tuple["SupportedTask", ...],
):
    state.openai_serving_chat = (
        OpenAIServingChat(**_chat_kwargs) if "generate" in supported_tasks else None
    )
```
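Each worker therefore follows the same skeleton: start a uvloop event loop, open the engine client as an async context manager, and serve until shutdown. The sketch below reduces that skeleton to stand-ins; `FakeEngineClient`, `build_engine_client` and `serve_forever` are invented names, and `uvloop.run` (which mirrors `asyncio.run` in recent uvloop versions) can be swapped for `asyncio.run` with no other change.

```python
# Skeleton of the per-worker entrypoint: uvloop event loop + async
# context-managed engine client + a serve coroutine.
import asyncio
from contextlib import asynccontextmanager

import uvloop


class FakeEngineClient:
    async def close(self) -> None:
        print("engine client closed")


@asynccontextmanager
async def build_engine_client():
    client = FakeEngineClient()
    try:
        yield client
    finally:
        await client.close()       # cleanup runs even if serving fails


async def serve_forever(client: FakeEngineClient) -> None:
    print("serving with", client)
    await asyncio.sleep(0.1)       # stand-in for serve_http(...)


async def worker_main() -> None:
    async with build_engine_client() as engine_client:
        await serve_forever(engine_client)


uvloop.run(worker_main())
```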
Fetching outputs from EngineCore → returning them to the API caller
```text
[Background task] output_handler (async coroutine)
│
├─ 1. Call engine_core.get_output_async()
│  │
│  └─ Blocks until the EngineCore process returns a batch of EngineCoreOutput
│     objects (outputs.outputs: List[EngineCoreOutput])
│
├─ 2. Split outputs.outputs into chunk_size-sized slices (to avoid hogging the
│     event loop; a small sketch of this appears at the end of this section)
│
└─ 3. For each slice, call output_processor.process_outputs(chunk, timestamp, stats)
   │
   └─ Enters the OutputProcessor's inner loop (iterating over every EngineCoreOutput)

OutputProcessor.process_outputs — inner processing flow (the core piece)

For each EngineCoreOutput (engine_core_output):
│
├─ a. Look up the RequestState by engine_core_output.request_id
│     (stored in the OutputProcessor.request_states dict)
│
├─ b. Update statistics (e.g. iteration counts, cache hits)
│
├─ c. If there are new tokens and the request needs detokenization:
│        req_state.detokenizer.update(new_token_ids)
│        └─ may detect a stop_string → overrides finish_reason = STOP
│
├─ d. Update the logprobs processor (if the request asked for logprobs)
│
├─ e. Call req_state.make_request_output(...) to build a RequestOutput
│     (this assembles text, logprobs, finish_reason, etc.)
│
├─ f. Put the RequestOutput into the queue belonging to this request
│        if req_state.queue is not None:
│            req_state.queue.put(request_output)   ← the key step!
│
└─ g. If finish_reason is not None (the request has finished):
      ├─ clean up the RequestState and release resources
      ├─ if finish_reason was caused by a stop_string but EngineCore has not
      │  marked the request finished → add the request ID to reqs_to_abort
      │  (EngineCore will be told to abort it later)
      └─ update final statistics, tracing, etc.

After the loop:
├─ return OutputProcessorOutput(request_outputs=[], reqs_to_abort=...)
│  (in AsyncLLM mode request_outputs stays empty; the results have already been
│   handed over through the per-request queue)
│
└─ output_handler calls engine_core.abort_requests_async() for reqs_to_abort

[Queue interaction] RequestOutputCollector put / get mechanism
(one collector per request; a toy re-implementation follows after this diagram)

queue.put(request_output):
├─ if self.output is None, or the new item is an Exception:
│     overwrite self.output = request_output
│     self.ready.set()        # wakes up a waiting get()
│
└─ if there is already a RequestOutput and the new item is also a RequestOutput:
      self.output.add(new_output, aggregate=self.aggregate)
      # aggregate=True (DELTA mode) merges incremental outputs so the queue
      # cannot pile up; aggregate=False (FINAL mode) overwrites

queue.get() / get_nowait():
├─ if there is no output yet (blocking variant): await self.ready.wait()
├─ take self.output, reset it and clear(ready)
└─ if the output is an Exception, raise it

[Consumer side] AsyncLLM.generate() coroutine (one per request)
│
├─ add_request() → creates a RequestOutputCollector instance
│  └─ the collector is stored as OutputProcessor.request_states[req_id].queue
│
├─ loop while not finished:
│  │
│  ├─ out = q.get_nowait() or await q.get()   ← take immediately or block
│  │
│  ├─ if out is a RequestOutput:
│  │     finished = out.finished
│  │     yield out        # handed back to the API server and on to the client
│  │
│  └─ if out is STREAM_FINISHED: break out of the loop
│
└─ exception handling (CancelledError, EngineDeadError, ValueError, ...)
   └─ call abort() and clean up
```
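The collector described above is the hand-off point between the single output_handler task and the many per-request generate() coroutines: put() overwrites or merges in place rather than queueing, and get() waits on an event. The toy below re-implements just that behaviour; `ToyOutput` and `ToyCollector` are invented names standing in for vLLM's RequestOutputCollector, and the merge rule is simplified.

```python
# Toy per-request output collector: put() merges in place, get() waits on an event.
import asyncio
from dataclasses import dataclass


@dataclass
class ToyOutput:
    text_delta: str
    finished: bool = False

    def add(self, other: "ToyOutput", aggregate: bool) -> None:
        # aggregate=True == DELTA mode: concatenate increments so a slow
        # consumer still sees every token exactly once.
        if aggregate:
            self.text_delta += other.text_delta
        else:
            self.text_delta = other.text_delta
        self.finished = other.finished


class ToyCollector:
    def __init__(self, aggregate: bool) -> None:
        self.aggregate = aggregate
        self.output: ToyOutput | Exception | None = None
        self.ready = asyncio.Event()

    def put(self, output: ToyOutput | Exception) -> None:
        if self.output is None or isinstance(output, Exception):
            self.output = output            # first item, or error: overwrite
        elif isinstance(self.output, ToyOutput) and isinstance(output, ToyOutput):
            self.output.add(output, aggregate=self.aggregate)
        self.ready.set()

    async def get(self) -> ToyOutput:
        while self.output is None:
            await self.ready.wait()
        out, self.output = self.output, None
        self.ready.clear()
        if isinstance(out, Exception):
            raise out
        return out


async def main() -> None:
    q = ToyCollector(aggregate=True)
    # The producer (output_handler) races ahead of the consumer: the two puts merge.
    q.put(ToyOutput("Hel"))
    q.put(ToyOutput("lo", finished=True))
    out = await q.get()
    print(out.text_delta, out.finished)   # -> Hello True


asyncio.run(main())
```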
output_handler code:
https://github.com/vllm-project/vllm/blob/v0.20.1/vllm/v1/engine/async_llm.py#L656
OutputProcessor.process_outputs code:
https://github.com/vllm-project/vllm/blob/v0.20.1/vllm/v1/engine/output_processor.py#L572
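As promised above, here is a small sketch of the chunked-processing idea from step 2 of output_handler: process a large batch in slices and yield to the event loop between slices, so that the per-request consumer coroutines get a chance to drain their collectors. `fake_get_output_async()`, `CHUNK` and the batch contents are invented demo values, not vLLM code.

```python
# Shape of the output_handler loop: pull a batch, process it in slices,
# and yield control between slices so other coroutines can run.
import asyncio

CHUNK = 128


async def fake_get_output_async() -> list[int]:
    await asyncio.sleep(0.01)          # stand-in for the ZMQ round trip
    return list(range(1000))


def process_outputs(chunk: list[int]) -> None:
    _ = sum(chunk)                     # stand-in for OutputProcessor work


async def output_handler(iterations: int = 3) -> None:
    for _ in range(iterations):
        outputs = await fake_get_output_async()
        for start in range(0, len(outputs), CHUNK):
            process_outputs(outputs[start:start + CHUNK])
            await asyncio.sleep(0)     # let waiting consumers run


asyncio.run(output_handler())
```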