
模型部署与量化:从训练到生产
一、为什么需要模型部署?
1.1 训练 vs 推理的区别
python
# --- Environment setup: stdlib first, then third-party imports ---
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn

# Keep the tutorial output free of library deprecation noise.
warnings.filterwarnings('ignore')

# Section banner
banner = "=" * 60
print(banner)
print("模型部署:从研究到生产")
print(banner)

# Side-by-side summary: what differs between training and inference.
training_vs_inference_table = """
╔══════════════════╦══════════════════════════════════════════════════════════════╗
║ 阶段 ║ 特点 ║
╠══════════════════╬══════════════════════════════════════════════════════════════╣
║ 训练 (Training) ║ • 需要反向传播、梯度计算 ║
║ ║ • 需要大量内存(梯度、优化器状态) ║
║ ║ • 支持多种精度(FP32/FP16) ║
║ ║ • 可接受较慢速度 ║
╠══════════════════╬══════════════════════════════════════════════════════════════╣
║ 推理 (Inference) ║ • 只需前向传播 ║
║ ║ • 内存占用小 ║
║ ║ • 可用更低精度(INT8/INT4) ║
║ ║ • 要求低延迟、高吞吐 ║
╚══════════════════╩══════════════════════════════════════════════════════════════╝
"""
print(training_vs_inference_table)
# Training vs. inference memory comparison
def memory_comparison():
    """Plot a grouped bar chart contrasting training and inference memory use.

    Values are illustrative (MB) — training additionally holds gradients,
    optimizer state, and more activations than inference does.
    """
    labels = ['模型参数', '梯度', '优化器状态', '激活值', '总计']
    mem_train = [100, 100, 200, 150, 550]
    mem_infer = [100, 0, 0, 50, 150]

    positions = np.arange(len(labels))
    bar_w = 0.35

    fig, ax = plt.subplots(figsize=(10, 6))
    train_bars = ax.bar(positions - bar_w / 2, mem_train, bar_w,
                        label='训练', color='lightcoral')
    infer_bars = ax.bar(positions + bar_w / 2, mem_infer, bar_w,
                        label='推理', color='lightgreen')

    ax.set_ylabel('内存占用 (MB)')
    ax.set_title('训练 vs 推理内存对比')
    ax.set_xticks(positions)
    ax.set_xticklabels(labels)
    ax.legend()

    # Annotate every bar of both groups with its value.
    for container, values in ((train_bars, mem_train), (infer_bars, mem_infer)):
        for bar, val in zip(container, values):
            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 5,
                    f'{val}MB', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.show()


memory_comparison()

print("\n💡 部署的核心挑战:")
print(" 1. 延迟 (Latency): 单次推理耗时")
print(" 2. 吞吐量 (Throughput): 每秒处理请求数")
print(" 3. 内存占用: 模型大小和运行时内存")
print(" 4. 硬件限制: CPU/GPU/边缘设备")
二、模型导出格式
2.1 PyTorch模型导出
python
def pytorch_export_demo():
"""PyTorch模型导出演示"""
print("\n" + "=" * 60)
print("PyTorch模型导出")
print("=" * 60)
# 创建一个简单模型
class SimpleModel(nn.Module):
def __init__(self):
super().__init__()
self.fc1 = nn.Linear(10, 20)
self.relu = nn.ReLU()
self.fc2 = nn.Linear(20, 2)
def forward(self, x):
x = self.relu(self.fc1(x))
return self.fc2(x)
model = SimpleModel()
model.eval() # 切换到推理模式
# 1. 保存完整模型(不推荐)
torch.save(model, 'model_full.pt')
print("1. 保存完整模型: model_full.pt")
# 2. 保存state_dict(推荐)
torch.save(model.state_dict(), 'model_state.pt')
print("2. 保存state_dict: model_state.pt")
# 3. TorchScript (JIT)
example_input = torch.randn(1, 10)
traced_model = torch.jit.trace(model, example_input)
traced_model.save('model_traced.pt')
print("3. TorchScript追踪: model_traced.pt")
# 4. TorchScript (脚本化)
scripted_model = torch.jit.script(model)
scripted_model.save('model_scripted.pt')
print("4. TorchScript脚本: model_scripted.pt")
# 性能对比
print("\n📊 推理速度对比:")
def benchmark(model, input_tensor, n_runs=1000):
# 预热
for _ in range(100):
_ = model(input_tensor)
start = time.time()
for _ in range(n_runs):
_ = model(input_tensor)
elapsed = time.time() - start
return elapsed / n_runs * 1000 # 毫秒
models = {
'PyTorch': model,
'TorchScript (trace)': traced_model,
'TorchScript (script)': scripted_model
}
for name, m in models.items():
avg_time = benchmark(m, example_input)
print(f" {name}: {avg_time:.3f} ms")
pytorch_export_demo()
2.2 ONNX导出
python
def onnx_export_demo():
    """Export a small CNN to ONNX and, if the `onnx` package is present,
    validate the resulting file.

    Returns:
        None. Output is printed; `model.onnx` is written as a side effect.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("ONNX模型导出")
    print(banner)

    print("\n📐 ONNX (Open Neural Network Exchange)")
    print(" 优点:")
    print(" - 跨框架兼容(PyTorch → ONNX → TensorFlow)")
    print(" - 支持多种推理后端")
    print(" - 可进行图优化")

    # Conv → BN → ReLU → global average pool → linear classifier.
    class ConvNet(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv = nn.Conv2d(3, 16, 3, padding=1)
            self.bn = nn.BatchNorm2d(16)
            self.relu = nn.ReLU()
            self.pool = nn.AdaptiveAvgPool2d(1)
            self.fc = nn.Linear(16, 10)

        def forward(self, x):
            features = self.relu(self.bn(self.conv(x)))
            pooled = self.pool(features)
            flat = pooled.view(pooled.size(0), -1)
            return self.fc(flat)

    net = ConvNet()
    net.eval()

    # Example input fixes the trace shape (batch of one 3x32x32 image).
    dummy = torch.randn(1, 3, 32, 32)

    print("\n📐 ONNX导出代码:")
    print("""
import torch.onnx
torch.onnx.export(
model, # 模型
example_input, # 示例输入
"model.onnx", # 输出文件
input_names=['input'], # 输入名称
output_names=['output'], # 输出名称
dynamic_axes={ # 动态轴(可变batch)
'input': {0: 'batch_size'},
'output': {0: 'batch_size'}
},
opset_version=11
)
""")

    try:
        torch.onnx.export(net, dummy, "model.onnx",
                          input_names=['input'], output_names=['output'],
                          opset_version=11, verbose=False)
        print("\n✅ ONNX模型已导出: model.onnx")
        # Structural validation needs the optional `onnx` package.
        import onnx
        onnx.checker.check_model(onnx.load("model.onnx"))
        print("✅ ONNX模型验证通过")
    except Exception as e:
        print(f"导出失败: {e}")

    print("\n💡 ONNX推理后端:")
    print(" - ONNX Runtime (CPU/GPU)")
    print(" - TensorRT (NVIDIA GPU)")
    print(" - OpenVINO (Intel)")
    print(" - TVM")


onnx_export_demo()
三、模型量化
3.1 量化原理
python
def quantization_principle():
    """Explain affine quantization and visualize size/accuracy trade-offs.

    Fixes vs. the original:
    - The printed quantize/dequantize formulas were mutually inconsistent
      (dequantization did not invert quantization); they now state the
      standard affine scheme used by PyTorch/ONNX INT8 quantization.
    - ``Axes.bar`` returns a ``BarContainer``; the original iterated a
      non-existent ``axes[i].bars`` attribute, raising AttributeError.
    - Removed an unused ``precisions`` dict.

    Returns:
        None. Output is printed and a figure is shown.
    """
    print("\n" + "=" * 60)
    print("模型量化原理")
    print("=" * 60)

    # Affine (asymmetric) quantization maps the float range [min, max]
    # onto the integer range [q_min, q_max].
    print("\n📐 量化公式:")
    print(" q = round(x / scale) + zero_point")
    print(" scale = (max - min) / (q_max - q_min)")
    print(" x = scale * (q - zero_point)")

    # Relative model size and typical accuracy loss per precision.
    precisions_list = ['FP32', 'FP16', 'INT8', 'INT4']
    model_sizes = [100, 50, 25, 12.5]   # % of FP32 size
    acc_loss = [0, 0.1, 0.5, 1.5]       # % accuracy loss (indicative)

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # FIX: keep the BarContainer returned by ax.bar for annotation.
    size_bars = axes[0].bar(precisions_list, model_sizes, color='lightblue')
    axes[0].set_ylabel('相对模型大小 (%)')
    axes[0].set_title('不同精度模型大小对比')
    axes[0].grid(True, alpha=0.3, axis='y')
    for bar, size in zip(size_bars, model_sizes):
        axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                     f'{size}%', ha='center', va='bottom')

    loss_bars = axes[1].bar(precisions_list, acc_loss, color='lightcoral')
    axes[1].set_ylabel('精度损失 (%)')
    axes[1].set_title('量化精度损失')
    axes[1].grid(True, alpha=0.3, axis='y')
    for bar, loss in zip(loss_bars, acc_loss):
        axes[1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.05,
                     f'{loss}%', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    print("\n📊 量化类型:")
    print(" 1. 训练后量化 (PTQ): 不需要重新训练,直接量化")
    print(" 2. 量化感知训练 (QAT): 训练中模拟量化,精度更高")
    print("\n📐 PyTorch量化:")
    print("""
# 训练后动态量化(权重INT8,激活FP32)
model = torch.quantization.quantize_dynamic(
model, {nn.Linear, nn.Conv2d}, dtype=torch.qint8
)
# 训练后静态量化(权重和激活都是INT8)
model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
torch.quantization.prepare(model, inplace=True)
# 校准...
torch.quantization.convert(model, inplace=True)
# 量化感知训练
model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
torch.quantization.prepare_qat(model, inplace=True)
# 训练...
torch.quantization.convert(model, inplace=True)
""")


quantization_principle()
3.2 量化实战
python
def quantization_practice():
    """Estimate model size at FP32/FP16/INT8 and visualize quantization gains.

    Fix vs. the original: ``Axes.bar`` returns a ``BarContainer``; the
    original iterated a non-existent ``axes[i].bars`` attribute, which
    raises AttributeError at runtime. The containers are now captured.

    Returns:
        None. Output is printed and a figure is shown.
    """
    print("\n" + "=" * 60)
    print("量化实战")
    print("=" * 60)

    # MNIST-style CNN, used only to obtain a realistic parameter count.
    class SimpleCNN(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv1 = nn.Conv2d(1, 32, 3, 1)
            self.conv2 = nn.Conv2d(32, 64, 3, 1)
            self.dropout1 = nn.Dropout(0.25)
            self.dropout2 = nn.Dropout(0.5)
            self.fc1 = nn.Linear(9216, 128)
            self.fc2 = nn.Linear(128, 10)

        def forward(self, x):
            x = self.conv1(x)
            x = nn.functional.relu(x)
            x = self.conv2(x)
            x = nn.functional.relu(x)
            x = nn.functional.max_pool2d(x, 2)
            x = self.dropout1(x)
            x = torch.flatten(x, 1)
            x = self.fc1(x)
            x = nn.functional.relu(x)
            x = self.dropout2(x)
            x = self.fc2(x)
            return x

    model = SimpleCNN()
    model.eval()

    print("\n模拟量化效果:")
    # Size = parameter count × bytes per parameter (4/2/1 for FP32/FP16/INT8).
    param_count = sum(p.numel() for p in model.parameters())
    fp32_size = param_count * 4 / 1024 / 1024  # MB
    fp16_size = param_count * 2 / 1024 / 1024
    int8_size = param_count * 1 / 1024 / 1024
    print(f"参数量: {param_count:,}")
    print(f"FP32模型大小: {fp32_size:.2f} MB")
    print(f"FP16模型大小: {fp16_size:.2f} MB")
    print(f"INT8模型大小: {int8_size:.2f} MB")

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Model size comparison.
    models = ['FP32', 'FP16', 'INT8']
    sizes = [fp32_size, fp16_size, int8_size]
    colors = ['lightcoral', 'lightblue', 'lightgreen']
    # FIX: keep the BarContainer returned by ax.bar for annotation.
    size_bars = axes[0].bar(models, sizes, color=colors)
    axes[0].set_ylabel('模型大小 (MB)')
    axes[0].set_title('不同精度模型大小对比')
    for bar, size in zip(size_bars, sizes):
        axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
                     f'{size:.1f}MB', ha='center', va='bottom')

    # Indicative inference speedups relative to FP32.
    speeds = [1.0, 1.8, 3.5]
    speed_bars = axes[1].bar(models, speeds, color=colors)
    axes[1].set_ylabel('相对推理速度')
    axes[1].set_title('量化加速效果')
    for bar, speed in zip(speed_bars, speeds):
        axes[1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                     f'{speed}x', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    print("\n💡 量化建议:")
    print(" - 精度要求高 → FP16或INT8")
    print(" - 边缘设备 → INT8/INT4")
    print(" - 服务器部署 → FP16")
    print(" - 需要最大加速 → INT4")


quantization_practice()
四、TensorRT加速
4.1 TensorRT简介
python
def tensorrt_demo():
    """Introduce TensorRT's optimization techniques and plot indicative
    per-precision latencies for a few well-known models.

    Returns:
        None. Output is printed and a figure is shown.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("TensorRT:NVIDIA推理加速器")
    print(banner)

    print("\n📐 TensorRT优化技术:")
    techniques = (
        "1. 层融合 (Layer Fusion): 合并相邻操作",
        "2. 精度校准 (Precision Calibration): INT8/FP16优化",
        "3. 内核自动调优 (Kernel Auto-tuning): 选择最佳CUDA内核",
        "4. 张量格式优化 (Tensor Layout): 内存访问优化",
        "5. 动态内存管理 (Dynamic Memory): 减少内存占用",
    )
    for technique in techniques:
        print(f" {technique}")

    # Illustrative latencies (ms) per model at each precision.
    model_names = ['ResNet50', 'BERT', 'YOLOv5', 'GPT-2']
    fp32_latency = [15, 25, 20, 40]
    fp16_latency = [8, 12, 10, 20]
    int8_latency = [5, 7, 6, 12]

    positions = np.arange(len(model_names))
    width = 0.25

    fig, ax = plt.subplots(figsize=(12, 6))
    series = (
        (-width, fp32_latency, 'FP32', 'lightcoral'),
        (0.0, fp16_latency, 'FP16', 'lightblue'),
        (width, int8_latency, 'INT8', 'lightgreen'),
    )
    for offset, latencies, label, color in series:
        ax.bar(positions + offset, latencies, width, label=label, color=color)

    ax.set_xlabel('模型')
    ax.set_ylabel('延迟 (ms)')
    ax.set_title('TensorRT加速效果')
    ax.set_xticks(positions)
    ax.set_xticklabels(model_names)
    ax.legend()

    # Annotate each bar with its latency value.
    for offset, latencies, _label, _color in series:
        for idx, val in enumerate(latencies):
            ax.text(idx + offset, val + 1, f'{val}ms',
                    ha='center', va='bottom', fontsize=8)

    plt.tight_layout()
    plt.show()

    print("\n📐 TensorRT使用流程:")
    print("""
# 1. 导出ONNX
torch.onnx.export(model, dummy_input, "model.onnx")
# 2. 使用trtexec转换
# trtexec --onnx=model.onnx --saveEngine=model.plan --fp16
# 3. Python API
import tensorrt as trt
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network()
parser = trt.OnnxParser(network, logger)
parser.parse_from_file("model.onnx")
config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.FP16)
engine = builder.build_engine(network, config)
""")

    print("\n💡 TensorRT适用场景:")
    print(" - NVIDIA GPU环境")
    print(" - 对延迟要求高的场景")
    print(" - 批量推理优化")


tensorrt_demo()
五、部署方案对比
5.1 部署方案总结
python
def deployment_comparison():
    """Print a deployment-option comparison table and render a text-based
    decision tree for choosing a serving stack.

    Returns:
        None. Output is printed and a figure is shown.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("部署方案对比")
    print(banner)

    option_table = """
╔══════════════════╦══════════════════════════════════════════════════════════════╗
║ 方案 ║ 特点 ║
╠══════════════════╬══════════════════════════════════════════════════════════════╣
║ PyTorch Native ║ 简单、灵活、调试方便,速度一般 ║
║ TorchScript ║ 生产就绪、C++部署、图优化 ║
║ ONNX + ONNX RT ║ 跨框架、跨平台、CPU/GPU都支持 ║
║ TensorRT ║ NVIDIA GPU上最快、延迟最低 ║
║ OpenVINO ║ Intel CPU/GPU优化、边缘设备友好 ║
║ TFLite ║ 移动端、嵌入式、Android/iOS ║
║ Core ML ║ Apple生态(iOS/macOS) ║
╚══════════════════╩══════════════════════════════════════════════════════════════╝
"""
    print(option_table)

    # Render the decision tree as monospace text on an empty axes.
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.axis('off')
    decision_tree = """
🎯 部署方案选择决策树:
开始
│
▼
在什么环境部署?
│
┌─────────────┼─────────────┐
▼ ▼ ▼
NVIDIA GPU Intel CPU 移动端
│ │ │
▼ ▼ ▼
需要最低延迟? 需要跨平台? iOS/Android?
│ │ │
┌────┴────┐ ┌───┴───┐ ┌───┴───┐
▼ ▼ ▼ ▼ ▼ ▼
是 否 是 否 是 否
│ │ │ │ │ │
▼ ▼ ▼ ▼ ▼ ▼
TensorRT ONNX RT ONNX 原生 Core ML TFLite
+TRT
"""
    ax.text(0.5, 0.5, decision_tree, ha='center', va='center', fontsize=11,
            transform=ax.transAxes, fontfamily='monospace')
    ax.set_title('部署方案选择指南', fontsize=14)
    plt.tight_layout()
    plt.show()


deployment_comparison()
六、完整部署流程示例
6.1 端到端部署
python
def complete_deployment():
    """Walk through an end-to-end deployment pipeline: the six stages,
    a FastAPI serving example, and a Docker packaging example.

    Returns:
        None. All content is printed.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("完整部署流程示例")
    print(banner)

    # The canonical pipeline, stage name paired with a short description.
    pipeline = (
        ("1. 训练模型", "在PyTorch中训练并验证"),
        ("2. 导出模型", "转换为ONNX/TorchScript"),
        ("3. 优化模型", "量化、剪枝、融合"),
        ("4. 部署服务", "使用FastAPI/Flask封装API"),
        ("5. 性能测试", "压测延迟和吞吐量"),
        ("6. 监控运维", "日志、指标、告警"),
    )
    print("\n📋 部署流程:")
    for stage, description in pipeline:
        print(f" {stage}: {description}")

    # Minimal FastAPI wrapper around a TorchScript model.
    print("\n📐 API服务示例 (FastAPI):")
    print("""
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import torch
import numpy as np
app = FastAPI()
# 加载模型
model = torch.jit.load('model_traced.pt')
model.eval()
class PredictRequest(BaseModel):
data: list
class PredictResponse(BaseModel):
prediction: int
confidence: float
@app.post("/predict", response_model=PredictResponse)
async def predict(request: PredictRequest):
try:
# 转换输入
input_tensor = torch.tensor(request.data).float()
# 推理
with torch.no_grad():
output = model(input_tensor)
probs = torch.softmax(output, dim=1)
pred = torch.argmax(probs, dim=1)
conf = probs[0, pred].item()
return PredictResponse(
prediction=pred.item(),
confidence=conf
)
except Exception as e:
raise HTTPException(status_code=400, detail=str(e))
# 运行: uvicorn main:app --host 0.0.0.0 --port 8000
""")

    # Containerizing the service.
    print("\n📐 Docker部署:")
    print("""
# Dockerfile
FROM pytorch/pytorch:1.13-cuda11.6-cudnn8-runtime
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY . .
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
# 构建和运行
# docker build -t model-api .
# docker run -p 8000:8000 --gpus all model-api
""")


complete_deployment()
七、总结
| 技术 | 用途 | 加速比 | 精度损失 |
|---|---|---|---|
| TorchScript | PyTorch生产部署 | 1.2-1.5x | 0% |
| ONNX | 跨框架部署 | 1.3-1.8x | 0% |
| FP16 | 半精度推理 | 2x | <0.1% |
| INT8 | 整数量化 | 3-4x | 0.5-1% |
| TensorRT | NVIDIA加速 | 5-10x | <0.5% |
部署流程总结:
训练 → 导出 → 优化 → 打包 → 部署 → 监控
↓ ↓ ↓ ↓ ↓ ↓
PyTorch ONNX 量化 Docker API 日志
最佳实践:
- 训练时考虑部署需求
- 使用ONNX作为中间格式
- 根据硬件选择合适的优化
- 进行充分的性能测试
- 建立监控和告警机制