
模型部署与量化:从训练到生产
一、为什么需要模型部署?
1.1 训练 vs 推理的区别
python
# --- Environment setup: stdlib first, then third-party imports ---
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn

# Keep the tutorial output free of library deprecation noise.
warnings.filterwarnings('ignore')

# Section banner
banner = "=" * 60
print(banner)
print("模型部署:从研究到生产")
print(banner)

# Side-by-side summary: what differs between training and inference.
training_vs_inference_table = """
╔══════════════════╦══════════════════════════════════════════════════════════════╗
║ 阶段 ║ 特点 ║
╠══════════════════╬══════════════════════════════════════════════════════════════╣
║ 训练 (Training) ║ • 需要反向传播、梯度计算 ║
║ ║ • 需要大量内存(梯度、优化器状态) ║
║ ║ • 支持多种精度(FP32/FP16) ║
║ ║ • 可接受较慢速度 ║
╠══════════════════╬══════════════════════════════════════════════════════════════╣
║ 推理 (Inference) ║ • 只需前向传播 ║
║ ║ • 内存占用小 ║
║ ║ • 可用更低精度(INT8/INT4) ║
║ ║ • 要求低延迟、高吞吐 ║
╚══════════════════╩══════════════════════════════════════════════════════════════╝
"""
print(training_vs_inference_table)
# Training vs. inference memory comparison
def memory_comparison():
    """Plot a grouped bar chart contrasting training and inference memory use.

    Values are illustrative (MB) — training additionally holds gradients,
    optimizer state, and more activations than inference does.
    """
    labels = ['模型参数', '梯度', '优化器状态', '激活值', '总计']
    mem_train = [100, 100, 200, 150, 550]
    mem_infer = [100, 0, 0, 50, 150]

    positions = np.arange(len(labels))
    bar_w = 0.35

    fig, ax = plt.subplots(figsize=(10, 6))
    train_bars = ax.bar(positions - bar_w / 2, mem_train, bar_w,
                        label='训练', color='lightcoral')
    infer_bars = ax.bar(positions + bar_w / 2, mem_infer, bar_w,
                        label='推理', color='lightgreen')

    ax.set_ylabel('内存占用 (MB)')
    ax.set_title('训练 vs 推理内存对比')
    ax.set_xticks(positions)
    ax.set_xticklabels(labels)
    ax.legend()

    # Annotate every bar of both groups with its value.
    for container, values in ((train_bars, mem_train), (infer_bars, mem_infer)):
        for bar, val in zip(container, values):
            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 5,
                    f'{val}MB', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.show()


memory_comparison()

print("\n💡 部署的核心挑战:")
print(" 1. 延迟 (Latency): 单次推理耗时")
print(" 2. 吞吐量 (Throughput): 每秒处理请求数")
print(" 3. 内存占用: 模型大小和运行时内存")
print(" 4. 硬件限制: CPU/GPU/边缘设备")
二、模型导出格式
2.1 PyTorch模型导出
python
def pytorch_export_demo():
"""PyTorch模型导出演示"""
print("\n" + "=" * 60)
print("PyTorch模型导出")
print("=" * 60)
# 创建一个简单模型
class SimpleModel(nn.Module):
def __init__(self):
super().__init__()
self.fc1 = nn.Linear(10, 20)
self.relu = nn.ReLU()
self.fc2 = nn.Linear(20, 2)
def forward(self, x):
x = self.relu(self.fc1(x))
return self.fc2(x)
model = SimpleModel()
model.eval() # 切换到推理模式
# 1. 保存完整模型(不推荐)
torch.save(model, 'model_full.pt')
print("1. 保存完整模型: model_full.pt")
# 2. 保存state_dict(推荐)
torch.save(model.state_dict(), 'model_state.pt')
print("2. 保存state_dict: model_state.pt")
# 3. TorchScript (JIT)
example_input = torch.randn(1, 10)
traced_model = torch.jit.trace(model, example_input)
traced_model.save('model_traced.pt')
print("3. TorchScript追踪: model_traced.pt")
# 4. TorchScript (脚本化)
scripted_model = torch.jit.script(model)
scripted_model.save('model_scripted.pt')
print("4. TorchScript脚本: model_scripted.pt")
# 性能对比
print("\n📊 推理速度对比:")
def benchmark(model, input_tensor, n_runs=1000):
# 预热
for _ in range(100):
_ = model(input_tensor)
start = time.time()
for _ in range(n_runs):
_ = model(input_tensor)
elapsed = time.time() - start
return elapsed / n_runs * 1000 # 毫秒
models = {
'PyTorch': model,
'TorchScript (trace)': traced_model,
'TorchScript (script)': scripted_model
}
for name, m in models.items():
avg_time = benchmark(m, example_input)
print(f" {name}: {avg_time:.3f} ms")
pytorch_export_demo()
2.2 ONNX导出
python
def onnx_export_demo():
    """Export a small CNN to ONNX and, if the `onnx` package is present,
    validate the resulting file.

    Returns:
        None. Output is printed; `model.onnx` is written as a side effect.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("ONNX模型导出")
    print(banner)

    print("\n📐 ONNX (Open Neural Network Exchange)")
    print(" 优点:")
    print(" - 跨框架兼容(PyTorch → ONNX → TensorFlow)")
    print(" - 支持多种推理后端")
    print(" - 可进行图优化")

    # Conv → BN → ReLU → global average pool → linear classifier.
    class ConvNet(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv = nn.Conv2d(3, 16, 3, padding=1)
            self.bn = nn.BatchNorm2d(16)
            self.relu = nn.ReLU()
            self.pool = nn.AdaptiveAvgPool2d(1)
            self.fc = nn.Linear(16, 10)

        def forward(self, x):
            features = self.relu(self.bn(self.conv(x)))
            pooled = self.pool(features)
            flat = pooled.view(pooled.size(0), -1)
            return self.fc(flat)

    net = ConvNet()
    net.eval()

    # Example input fixes the trace shape (batch of one 3x32x32 image).
    dummy = torch.randn(1, 3, 32, 32)

    print("\n📐 ONNX导出代码:")
    print("""
import torch.onnx
torch.onnx.export(
model, # 模型
example_input, # 示例输入
"model.onnx", # 输出文件
input_names=['input'], # 输入名称
output_names=['output'], # 输出名称
dynamic_axes={ # 动态轴(可变batch)
'input': {0: 'batch_size'},
'output': {0: 'batch_size'}
},
opset_version=11
)
""")

    try:
        torch.onnx.export(net, dummy, "model.onnx",
                          input_names=['input'], output_names=['output'],
                          opset_version=11, verbose=False)
        print("\n✅ ONNX模型已导出: model.onnx")
        # Structural validation needs the optional `onnx` package.
        import onnx
        onnx.checker.check_model(onnx.load("model.onnx"))
        print("✅ ONNX模型验证通过")
    except Exception as e:
        print(f"导出失败: {e}")

    print("\n💡 ONNX推理后端:")
    print(" - ONNX Runtime (CPU/GPU)")
    print(" - TensorRT (NVIDIA GPU)")
    print(" - OpenVINO (Intel)")
    print(" - TVM")


onnx_export_demo()
三、模型量化
3.1 量化原理
python
def quantization_principle():
    """Explain affine quantization and visualize size/accuracy trade-offs.

    Fixes vs. the original:
    - The printed quantize/dequantize formulas were mutually inconsistent
      (dequantization did not invert quantization); they now state the
      standard affine scheme used by PyTorch/ONNX INT8 quantization.
    - ``Axes.bar`` returns a ``BarContainer``; the original iterated a
      non-existent ``axes[i].bars`` attribute, raising AttributeError.
    - Removed an unused ``precisions`` dict.

    Returns:
        None. Output is printed and a figure is shown.
    """
    print("\n" + "=" * 60)
    print("模型量化原理")
    print("=" * 60)

    # Affine (asymmetric) quantization maps the float range [min, max]
    # onto the integer range [q_min, q_max].
    print("\n📐 量化公式:")
    print(" q = round(x / scale) + zero_point")
    print(" scale = (max - min) / (q_max - q_min)")
    print(" x = scale * (q - zero_point)")

    # Relative model size and typical accuracy loss per precision.
    precisions_list = ['FP32', 'FP16', 'INT8', 'INT4']
    model_sizes = [100, 50, 25, 12.5]   # % of FP32 size
    acc_loss = [0, 0.1, 0.5, 1.5]       # % accuracy loss (indicative)

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # FIX: keep the BarContainer returned by ax.bar for annotation.
    size_bars = axes[0].bar(precisions_list, model_sizes, color='lightblue')
    axes[0].set_ylabel('相对模型大小 (%)')
    axes[0].set_title('不同精度模型大小对比')
    axes[0].grid(True, alpha=0.3, axis='y')
    for bar, size in zip(size_bars, model_sizes):
        axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                     f'{size}%', ha='center', va='bottom')

    loss_bars = axes[1].bar(precisions_list, acc_loss, color='lightcoral')
    axes[1].set_ylabel('精度损失 (%)')
    axes[1].set_title('量化精度损失')
    axes[1].grid(True, alpha=0.3, axis='y')
    for bar, loss in zip(loss_bars, acc_loss):
        axes[1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.05,
                     f'{loss}%', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    print("\n📊 量化类型:")
    print(" 1. 训练后量化 (PTQ): 不需要重新训练,直接量化")
    print(" 2. 量化感知训练 (QAT): 训练中模拟量化,精度更高")
    print("\n📐 PyTorch量化:")
    print("""
# 训练后动态量化(权重INT8,激活FP32)
model = torch.quantization.quantize_dynamic(
model, {nn.Linear, nn.Conv2d}, dtype=torch.qint8
)
# 训练后静态量化(权重和激活都是INT8)
model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
torch.quantization.prepare(model, inplace=True)
# 校准...
torch.quantization.convert(model, inplace=True)
# 量化感知训练
model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
torch.quantization.prepare_qat(model, inplace=True)
# 训练...
torch.quantization.convert(model, inplace=True)
""")


quantization_principle()
3.2 量化实战
python
def quantization_practice():
    """Estimate model size at FP32/FP16/INT8 and visualize quantization gains.

    Fix vs. the original: ``Axes.bar`` returns a ``BarContainer``; the
    original iterated a non-existent ``axes[i].bars`` attribute, which
    raises AttributeError at runtime. The containers are now captured.

    Returns:
        None. Output is printed and a figure is shown.
    """
    print("\n" + "=" * 60)
    print("量化实战")
    print("=" * 60)

    # MNIST-style CNN, used only to obtain a realistic parameter count.
    class SimpleCNN(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv1 = nn.Conv2d(1, 32, 3, 1)
            self.conv2 = nn.Conv2d(32, 64, 3, 1)
            self.dropout1 = nn.Dropout(0.25)
            self.dropout2 = nn.Dropout(0.5)
            self.fc1 = nn.Linear(9216, 128)
            self.fc2 = nn.Linear(128, 10)

        def forward(self, x):
            x = self.conv1(x)
            x = nn.functional.relu(x)
            x = self.conv2(x)
            x = nn.functional.relu(x)
            x = nn.functional.max_pool2d(x, 2)
            x = self.dropout1(x)
            x = torch.flatten(x, 1)
            x = self.fc1(x)
            x = nn.functional.relu(x)
            x = self.dropout2(x)
            x = self.fc2(x)
            return x

    model = SimpleCNN()
    model.eval()

    print("\n模拟量化效果:")
    # Size = parameter count × bytes per parameter (4/2/1 for FP32/FP16/INT8).
    param_count = sum(p.numel() for p in model.parameters())
    fp32_size = param_count * 4 / 1024 / 1024  # MB
    fp16_size = param_count * 2 / 1024 / 1024
    int8_size = param_count * 1 / 1024 / 1024
    print(f"参数量: {param_count:,}")
    print(f"FP32模型大小: {fp32_size:.2f} MB")
    print(f"FP16模型大小: {fp16_size:.2f} MB")
    print(f"INT8模型大小: {int8_size:.2f} MB")

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Model size comparison.
    models = ['FP32', 'FP16', 'INT8']
    sizes = [fp32_size, fp16_size, int8_size]
    colors = ['lightcoral', 'lightblue', 'lightgreen']
    # FIX: keep the BarContainer returned by ax.bar for annotation.
    size_bars = axes[0].bar(models, sizes, color=colors)
    axes[0].set_ylabel('模型大小 (MB)')
    axes[0].set_title('不同精度模型大小对比')
    for bar, size in zip(size_bars, sizes):
        axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
                     f'{size:.1f}MB', ha='center', va='bottom')

    # Indicative inference speedups relative to FP32.
    speeds = [1.0, 1.8, 3.5]
    speed_bars = axes[1].bar(models, speeds, color=colors)
    axes[1].set_ylabel('相对推理速度')
    axes[1].set_title('量化加速效果')
    for bar, speed in zip(speed_bars, speeds):
        axes[1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                     f'{speed}x', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    print("\n💡 量化建议:")
    print(" - 精度要求高 → FP16或INT8")
    print(" - 边缘设备 → INT8/INT4")
    print(" - 服务器部署 → FP16")
    print(" - 需要最大加速 → INT4")


quantization_practice()
四、TensorRT加速
4.1 TensorRT简介
python
def tensorrt_demo():
    """Introduce TensorRT's optimization techniques and plot indicative
    per-precision latencies for a few well-known models.

    Returns:
        None. Output is printed and a figure is shown.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("TensorRT:NVIDIA推理加速器")
    print(banner)

    print("\n📐 TensorRT优化技术:")
    techniques = (
        "1. 层融合 (Layer Fusion): 合并相邻操作",
        "2. 精度校准 (Precision Calibration): INT8/FP16优化",
        "3. 内核自动调优 (Kernel Auto-tuning): 选择最佳CUDA内核",
        "4. 张量格式优化 (Tensor Layout): 内存访问优化",
        "5. 动态内存管理 (Dynamic Memory): 减少内存占用",
    )
    for technique in techniques:
        print(f" {technique}")

    # Illustrative latencies (ms) per model at each precision.
    model_names = ['ResNet50', 'BERT', 'YOLOv5', 'GPT-2']
    fp32_latency = [15, 25, 20, 40]
    fp16_latency = [8, 12, 10, 20]
    int8_latency = [5, 7, 6, 12]

    positions = np.arange(len(model_names))
    width = 0.25

    fig, ax = plt.subplots(figsize=(12, 6))
    series = (
        (-width, fp32_latency, 'FP32', 'lightcoral'),
        (0.0, fp16_latency, 'FP16', 'lightblue'),
        (width, int8_latency, 'INT8', 'lightgreen'),
    )
    for offset, latencies, label, color in series:
        ax.bar(positions + offset, latencies, width, label=label, color=color)

    ax.set_xlabel('模型')
    ax.set_ylabel('延迟 (ms)')
    ax.set_title('TensorRT加速效果')
    ax.set_xticks(positions)
    ax.set_xticklabels(model_names)
    ax.legend()

    # Annotate each bar with its latency value.
    for offset, latencies, _label, _color in series:
        for idx, val in enumerate(latencies):
            ax.text(idx + offset, val + 1, f'{val}ms',
                    ha='center', va='bottom', fontsize=8)

    plt.tight_layout()
    plt.show()

    print("\n📐 TensorRT使用流程:")
    print("""
# 1. 导出ONNX
torch.onnx.export(model, dummy_input, "model.onnx")
# 2. 使用trtexec转换
# trtexec --onnx=model.onnx --saveEngine=model.plan --fp16
# 3. Python API
import tensorrt as trt
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network()
parser = trt.OnnxParser(network, logger)
parser.parse_from_file("model.onnx")
config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.FP16)
engine = builder.build_engine(network, config)
""")

    print("\n💡 TensorRT适用场景:")
    print(" - NVIDIA GPU环境")
    print(" - 对延迟要求高的场景")
    print(" - 批量推理优化")


tensorrt_demo()
五、部署方案对比
5.1 部署方案总结
python
def deployment_comparison():
    """Print a deployment-option comparison table and render a text-based
    decision tree for choosing a serving stack.

    Returns:
        None. Output is printed and a figure is shown.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("部署方案对比")
    print(banner)

    option_table = """
╔══════════════════╦══════════════════════════════════════════════════════════════╗
║ 方案 ║ 特点 ║
╠══════════════════╬══════════════════════════════════════════════════════════════╣
║ PyTorch Native ║ 简单、灵活、调试方便,速度一般 ║
║ TorchScript ║ 生产就绪、C++部署、图优化 ║
║ ONNX + ONNX RT ║ 跨框架、跨平台、CPU/GPU都支持 ║
║ TensorRT ║ NVIDIA GPU上最快、延迟最低 ║
║ OpenVINO ║ Intel CPU/GPU优化、边缘设备友好 ║
║ TFLite ║ 移动端、嵌入式、Android/iOS ║
║ Core ML ║ Apple生态(iOS/macOS) ║
╚══════════════════╩══════════════════════════════════════════════════════════════╝
"""
    print(option_table)

    # Render the decision tree as monospace text on an empty axes.
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.axis('off')
    decision_tree = """
🎯 部署方案选择决策树:
开始
│
▼
在什么环境部署?
│
┌─────────────┼─────────────┐
▼ ▼ ▼
NVIDIA GPU Intel CPU 移动端
│ │ │
▼ ▼ ▼
需要最低延迟? 需要跨平台? iOS/Android?
│ │ │
┌────┴────┐ ┌───┴───┐ ┌───┴───┐
▼ ▼ ▼ ▼ ▼ ▼
是 否 是 否 是 否
│ │ │ │ │ │
▼ ▼ ▼ ▼ ▼ ▼
TensorRT ONNX RT ONNX 原生 Core ML TFLite
+TRT
"""
    ax.text(0.5, 0.5, decision_tree, ha='center', va='center', fontsize=11,
            transform=ax.transAxes, fontfamily='monospace')
    ax.set_title('部署方案选择指南', fontsize=14)
    plt.tight_layout()
    plt.show()


deployment_comparison()
六、完整部署流程示例
6.1 端到端部署
python
def complete_deployment():
    """Walk through an end-to-end deployment pipeline: the six stages,
    a FastAPI serving example, and a Docker packaging example.

    Returns:
        None. All content is printed.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("完整部署流程示例")
    print(banner)

    # The canonical pipeline, stage name paired with a short description.
    pipeline = (
        ("1. 训练模型", "在PyTorch中训练并验证"),
        ("2. 导出模型", "转换为ONNX/TorchScript"),
        ("3. 优化模型", "量化、剪枝、融合"),
        ("4. 部署服务", "使用FastAPI/Flask封装API"),
        ("5. 性能测试", "压测延迟和吞吐量"),
        ("6. 监控运维", "日志、指标、告警"),
    )
    print("\n📋 部署流程:")
    for stage, description in pipeline:
        print(f" {stage}: {description}")

    # Minimal FastAPI wrapper around a TorchScript model.
    print("\n📐 API服务示例 (FastAPI):")
    print("""
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import torch
import numpy as np
app = FastAPI()
# 加载模型
model = torch.jit.load('model_traced.pt')
model.eval()
class PredictRequest(BaseModel):
data: list
class PredictResponse(BaseModel):
prediction: int
confidence: float
@app.post("/predict", response_model=PredictResponse)
async def predict(request: PredictRequest):
try:
# 转换输入
input_tensor = torch.tensor(request.data).float()
# 推理
with torch.no_grad():
output = model(input_tensor)
probs = torch.softmax(output, dim=1)
pred = torch.argmax(probs, dim=1)
conf = probs[0, pred].item()
return PredictResponse(
prediction=pred.item(),
confidence=conf
)
except Exception as e:
raise HTTPException(status_code=400, detail=str(e))
# 运行: uvicorn main:app --host 0.0.0.0 --port 8000
""")

    # Containerizing the service.
    print("\n📐 Docker部署:")
    print("""
# Dockerfile
FROM pytorch/pytorch:1.13-cuda11.6-cudnn8-runtime
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY . .
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
# 构建和运行
# docker build -t model-api .
# docker run -p 8000:8000 --gpus all model-api
""")


complete_deployment()
七、总结
| 技术 | 用途 | 加速比 | 精度损失 |
|---|---|---|---|
| TorchScript | PyTorch生产部署 | 1.2-1.5x | 0% |
| ONNX | 跨框架部署 | 1.3-1.8x | 0% |
| FP16 | 半精度推理 | 2x | <0.1% |
| INT8 | 整数量化 | 3-4x | 0.5-1% |
| TensorRT | NVIDIA加速 | 5-10x | <0.5% |
部署流程总结:
训练 → 导出 → 优化 → 打包 → 部署 → 监控
↓ ↓ ↓ ↓ ↓ ↓
PyTorch ONNX 量化 Docker API 日志
最佳实践:
- 训练时考虑部署需求
- 使用ONNX作为中间格式
- 根据硬件选择合适的优化
- 进行充分的性能测试
- 建立监控和告警机制