03-深度学习基础:模型部署与量化

模型部署与量化:从训练到生产

一、为什么需要模型部署?

1.1 训练 vs 推理的区别

python 复制代码
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import time
import warnings
warnings.filterwarnings('ignore')

# Article banner: title framed by two 60-character rules.
separator = "=" * 60
print(separator, "模型部署:从研究到生产", separator, sep="\n")

# Box-drawing table contrasting the training and inference phases.
comparison = """
╔══════════════════╦══════════════════════════════════════════════════════════════╗
║     阶段         ║                    特点                                      ║
╠══════════════════╬══════════════════════════════════════════════════════════════╣
║ 训练 (Training)  ║ • 需要反向传播、梯度计算                                      ║
║                  ║ • 需要大量内存(梯度、优化器状态)                            ║
║                  ║ • 支持多种精度(FP32/FP16)                                   ║
║                  ║ • 可接受较慢速度                                             ║
╠══════════════════╬══════════════════════════════════════════════════════════════╣
║ 推理 (Inference) ║ • 只需前向传播                                               ║
║                  ║ • 内存占用小                                                 ║
║                  ║ • 可用更低精度(INT8/INT4)                                   ║
║                  ║ • 要求低延迟、高吞吐                                          ║
╚══════════════════╩══════════════════════════════════════════════════════════════╝
"""

print(comparison)

# 训练 vs 推理内存对比
def memory_comparison():
    """Show a grouped bar chart of training vs. inference memory usage.

    The per-category figures are hard-coded illustrative values in MB.
    Every bar is annotated with its value and the figure is displayed with
    ``plt.show()``. Returns nothing.
    """
    categories = ['模型参数', '梯度', '优化器状态', '激活值', '总计']
    # (legend label, MB per category, bar colour, side of the tick: -1 left / +1 right)
    series = [
        ('训练', [100, 100, 200, 150, 550], 'lightcoral', -1),
        ('推理', [100, 0, 0, 50, 150], 'lightgreen', 1),
    ]

    width = 0.35
    x = np.arange(len(categories))

    fig, ax = plt.subplots(figsize=(10, 6))

    # Draw both bar groups first so the annotations can be added afterwards.
    drawn = []
    for label, values, colour, side in series:
        bars = ax.bar(x + side * width / 2, values, width, label=label, color=colour)
        drawn.append((bars, values))

    ax.set_ylabel('内存占用 (MB)')
    ax.set_title('训练 vs 推理内存对比')
    ax.set_xticks(x)
    ax.set_xticklabels(categories)
    ax.legend()

    # Value label centred 5 units above the top of each bar.
    for bars, values in drawn:
        for bar, val in zip(bars, values):
            centre = bar.get_x() + bar.get_width() / 2
            ax.text(centre, bar.get_height() + 5, f'{val}MB',
                    ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.show()

memory_comparison()

# List the four core deployment challenges as a numbered list.
print("\n💡 部署的核心挑战:")
challenges = (
    "延迟 (Latency): 单次推理耗时",
    "吞吐量 (Throughput): 每秒处理请求数",
    "内存占用: 模型大小和运行时内存",
    "硬件限制: CPU/GPU/边缘设备",
)
for number, challenge in enumerate(challenges, start=1):
    print(f"   {number}. {challenge}")

二、模型导出格式

2.1 PyTorch模型导出

python 复制代码
class _ExportDemoMLP(nn.Module):
    """Tiny MLP (10 -> 20 -> 2) exported by ``pytorch_export_demo``.

    It lives at module level on purpose: ``torch.save(model, ...)`` pickles
    the model by reference to its class, and pickle cannot resolve a class
    that is defined inside a function body.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 20)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(20, 2)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        return self.fc2(x)


def _benchmark_ms(module, input_tensor, n_runs, warmup):
    """Return the mean latency in milliseconds of ``module(input_tensor)``."""
    # Inference timing: disable autograd bookkeeping for every variant.
    with torch.no_grad():
        for _ in range(warmup):
            module(input_tensor)

        # perf_counter is monotonic and high-resolution; time.time() is neither.
        start = time.perf_counter()
        for _ in range(n_runs):
            module(input_tensor)
        elapsed = time.perf_counter() - start
    return elapsed / n_runs * 1000


def pytorch_export_demo(output_dir=".", n_runs=1000, warmup=100):
    """Export a small model in four PyTorch formats and time each variant.

    Files written into ``output_dir``: ``model_full.pt`` (pickled module),
    ``model_state.pt`` (state_dict), ``model_traced.pt`` (TorchScript trace)
    and ``model_scripted.pt`` (TorchScript script). The mean forward latency
    of the eager, traced and scripted models is then printed.

    Args:
        output_dir: Existing directory that receives the exported files.
        n_runs: Number of timed forward passes per model (must be >= 1).
        warmup: Number of untimed warm-up passes per model.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
    """
    # Kept local so this block does not depend on a file-level import.
    import os

    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    print("\n" + "=" * 60)
    print("PyTorch模型导出")
    print("=" * 60)

    model = _ExportDemoMLP()
    model.eval()  # inference mode for export and benchmarking

    def target(filename):
        return os.path.join(output_dir, filename)

    # 1. Whole pickled module (not recommended: ties the file to the class definition)
    torch.save(model, target('model_full.pt'))
    print("1. 保存完整模型: model_full.pt")

    # 2. state_dict only (recommended)
    torch.save(model.state_dict(), target('model_state.pt'))
    print("2. 保存state_dict: model_state.pt")

    # 3. TorchScript via tracing with an example input
    example_input = torch.randn(1, 10)
    traced_model = torch.jit.trace(model, example_input)
    traced_model.save(target('model_traced.pt'))
    print("3. TorchScript追踪: model_traced.pt")

    # 4. TorchScript via scripting
    scripted_model = torch.jit.script(model)
    scripted_model.save(target('model_scripted.pt'))
    print("4. TorchScript脚本: model_scripted.pt")

    print("\n📊 推理速度对比:")

    variants = {
        'PyTorch': model,
        'TorchScript (trace)': traced_model,
        'TorchScript (script)': scripted_model
    }

    for name, module in variants.items():
        avg_time = _benchmark_ms(module, example_input, n_runs, warmup)
        print(f"   {name}: {avg_time:.3f} ms")

pytorch_export_demo()

2.2 ONNX导出

python 复制代码
def _validate_onnx_file(path):
    """Run the ONNX checker on ``path`` and print the outcome.

    Validation is optional: a missing ``onnx`` package is reported as a
    skipped check rather than as an export failure.
    """
    try:
        import onnx
    except ImportError:
        print("⚠️ 未安装onnx包,跳过模型验证")
        return

    try:
        onnx.checker.check_model(onnx.load(path))
    except Exception as e:  # demo boundary: report the checker error and continue
        print(f"ONNX模型验证失败: {e}")
    else:
        print("✅ ONNX模型验证通过")


def onnx_export_demo():
    """Export a small CNN to ``model.onnx`` and validate the result.

    Prints a short introduction to ONNX and a reference export snippet, then
    performs the export with the same arguments the snippet shows (including
    a dynamic batch axis). Export and validation problems are reported
    separately so a failed check is not mistaken for a failed export.
    Finally lists common ONNX inference backends. Returns nothing.
    """
    print("\n" + "=" * 60)
    print("ONNX模型导出")
    print("=" * 60)

    print("\n📐 ONNX (Open Neural Network Exchange)")
    print("   优点:")
    print("   - 跨框架兼容(PyTorch → ONNX → TensorFlow)")
    print("   - 支持多种推理后端")
    print("   - 可进行图优化")

    # Conv -> BN -> ReLU -> global average pool -> linear classifier
    class SimpleModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv = nn.Conv2d(3, 16, 3, padding=1)
            self.bn = nn.BatchNorm2d(16)
            self.relu = nn.ReLU()
            self.pool = nn.AdaptiveAvgPool2d(1)
            self.fc = nn.Linear(16, 10)

        def forward(self, x):
            x = self.relu(self.bn(self.conv(x)))
            x = self.pool(x)
            x = x.view(x.size(0), -1)
            return self.fc(x)

    model = SimpleModel()
    model.eval()

    example_input = torch.randn(1, 3, 32, 32)

    print("\n📐 ONNX导出代码:")
    print("""
    import torch.onnx
    
    torch.onnx.export(
        model,                       # 模型
        example_input,               # 示例输入
        "model.onnx",                # 输出文件
        input_names=['input'],       # 输入名称
        output_names=['output'],     # 输出名称
        dynamic_axes={               # 动态轴(可变batch)
            'input': {0: 'batch_size'},
            'output': {0: 'batch_size'}
        },
        opset_version=11
    )
    """)

    # Perform the export exactly as the printed snippet describes; without
    # dynamic_axes the exported graph would be fixed to batch size 1.
    try:
        torch.onnx.export(
            model, example_input, "model.onnx",
            input_names=['input'], output_names=['output'],
            dynamic_axes={
                'input': {0: 'batch_size'},
                'output': {0: 'batch_size'},
            },
            opset_version=11, verbose=False,
        )
    except Exception as e:  # demo boundary: report the export error and continue
        print(f"导出失败: {e}")
    else:
        print("\n✅ ONNX模型已导出: model.onnx")
        _validate_onnx_file("model.onnx")

    print("\n💡 ONNX推理后端:")
    print("   - ONNX Runtime (CPU/GPU)")
    print("   - TensorRT (NVIDIA GPU)")
    print("   - OpenVINO (Intel)")
    print("   - TVM")

onnx_export_demo()

三、模型量化

3.1 量化原理

python 复制代码
def quantization_principle():
    """Explain affine quantization and plot size / accuracy trade-offs.

    Prints the quantize / dequantize formulas, shows two bar charts (relative
    model size per precision, derived from the bit width, and illustrative
    accuracy loss), then prints the quantization types and a PyTorch usage
    snippet. Returns nothing.
    """
    print("\n" + "=" * 60)
    print("模型量化原理")
    print("=" * 60)

    # Bits per value for each precision; relative model size follows from this.
    bit_widths = {
        'FP32': 32,
        'FP16': 16,
        'INT8': 8,
        'INT4': 4
    }

    # Affine quantization: divide by scale to quantize, multiply to dequantize.
    print("\n📐 量化公式:")
    print("   scale = (max - min) / (q_max - q_min)")
    print("   zero_point = q_min - round(min / scale)")
    print("   q = clamp(round(x / scale) + zero_point, q_min, q_max)")
    print("   x ≈ scale * (q - zero_point)")

    precisions_list = list(bit_widths)
    # Size relative to FP32, in percent: 100, 50, 25, 12.5
    model_sizes = [bits / 32 * 100 for bits in bit_widths.values()]
    acc_loss = [0, 0.1, 0.5, 1.5]  # illustrative values, not measurements

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Keep the BarContainer returned by bar(): Axes has no ``bars`` attribute.
    size_bars = axes[0].bar(precisions_list, model_sizes, color='lightblue')
    axes[0].set_ylabel('相对模型大小 (%)')
    axes[0].set_title('不同精度模型大小对比')
    axes[0].grid(True, alpha=0.3, axis='y')

    for bar, size in zip(size_bars, model_sizes):
        # ':g' renders 100.0 as '100' and keeps '12.5'
        axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                    f'{size:g}%', ha='center', va='bottom')

    loss_bars = axes[1].bar(precisions_list, acc_loss, color='lightcoral')
    axes[1].set_ylabel('精度损失 (%)')
    axes[1].set_title('量化精度损失')
    axes[1].grid(True, alpha=0.3, axis='y')

    for bar, loss in zip(loss_bars, acc_loss):
        axes[1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.05,
                    f'{loss}%', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    print("\n📊 量化类型:")
    print("   1. 训练后量化 (PTQ): 不需要重新训练,直接量化")
    print("   2. 量化感知训练 (QAT): 训练中模拟量化,精度更高")

    print("\n📐 PyTorch量化:")
    print("""
    # 训练后动态量化(权重INT8,激活FP32)
    # 注意: 动态量化不支持Conv2d,卷积层请使用静态量化
    model = torch.quantization.quantize_dynamic(
        model, {nn.Linear}, dtype=torch.qint8
    )
    
    # 训练后静态量化(权重和激活都是INT8)
    model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
    torch.quantization.prepare(model, inplace=True)
    # 校准...
    torch.quantization.convert(model, inplace=True)
    
    # 量化感知训练
    model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
    torch.quantization.prepare_qat(model, inplace=True)
    # 训练...
    torch.quantization.convert(model, inplace=True)
    """)

quantization_principle()

3.2 量化实战

python 复制代码
def quantization_practice():
    """Estimate model size at several precisions and plot the effect.

    Builds a small CNN, counts its parameters, prints the storage needed at
    FP32 / FP16 / INT8 (parameter count times bytes per value), and shows two
    bar charts: model size and an illustrative relative inference speed.
    No actual quantization is performed. Returns nothing.
    """
    print("\n" + "=" * 60)
    print("量化实战")
    print("=" * 60)

    # Small classification CNN; fc1 expects 9216 flattened features.
    class SimpleCNN(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv1 = nn.Conv2d(1, 32, 3, 1)
            self.conv2 = nn.Conv2d(32, 64, 3, 1)
            self.dropout1 = nn.Dropout(0.25)
            self.dropout2 = nn.Dropout(0.5)
            self.fc1 = nn.Linear(9216, 128)
            self.fc2 = nn.Linear(128, 10)

        def forward(self, x):
            x = self.conv1(x)
            x = nn.functional.relu(x)
            x = self.conv2(x)
            x = nn.functional.relu(x)
            x = nn.functional.max_pool2d(x, 2)
            x = self.dropout1(x)
            x = torch.flatten(x, 1)
            x = self.fc1(x)
            x = nn.functional.relu(x)
            x = self.dropout2(x)
            x = self.fc2(x)
            return x

    model = SimpleCNN()
    model.eval()

    print("\n模拟量化效果:")

    # Storage per parameter, in bytes, for each precision.
    bytes_per_param = {'FP32': 4, 'FP16': 2, 'INT8': 1}
    param_count = sum(p.numel() for p in model.parameters())
    models = list(bytes_per_param)
    sizes = [param_count * nbytes / 1024 / 1024 for nbytes in bytes_per_param.values()]  # MB

    print(f"参数量: {param_count:,}")
    for precision, size in zip(models, sizes):
        print(f"{precision}模型大小: {size:.2f} MB")

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    colors = ['lightcoral', 'lightblue', 'lightgreen']

    # Keep the BarContainer returned by bar(): Axes has no ``bars`` attribute.
    size_bars = axes[0].bar(models, sizes, color=colors)
    axes[0].set_ylabel('模型大小 (MB)')
    axes[0].set_title('不同精度模型大小对比')
    for bar, size in zip(size_bars, sizes):
        axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
                    f'{size:.1f}MB', ha='center', va='bottom')

    # Relative speeds are illustrative constants, not measurements.
    speeds = [1.0, 1.8, 3.5]
    speed_bars = axes[1].bar(models, speeds, color=colors)
    axes[1].set_ylabel('相对推理速度')
    axes[1].set_title('量化加速效果')
    for bar, speed in zip(speed_bars, speeds):
        axes[1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                    f'{speed}x', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    print("\n💡 量化建议:")
    print("   - 精度要求高 → FP16或INT8")
    print("   - 边缘设备 → INT8/INT4")
    print("   - 服务器部署 → FP16")
    print("   - 需要最大加速 → INT4")

quantization_practice()

四、TensorRT加速

4.1 TensorRT简介

python 复制代码
def tensorrt_demo():
    """Introduce TensorRT: optimizations, latency chart and usage snippet.

    Prints the list of TensorRT optimization techniques, shows a grouped bar
    chart of illustrative (hard-coded) latencies per model and precision, and
    prints a usage walkthrough. The printed Python API snippet uses an
    explicit-batch network and ``build_serialized_network``, because the ONNX
    parser needs an explicit-batch network and ``build_engine`` is no longer
    available in current TensorRT releases. Returns nothing.
    """
    print("\n" + "=" * 60)
    print("TensorRT:NVIDIA推理加速器")
    print("=" * 60)

    print("\n📐 TensorRT优化技术:")
    optimizations = [
        "1. 层融合 (Layer Fusion): 合并相邻操作",
        "2. 精度校准 (Precision Calibration): INT8/FP16优化",
        "3. 内核自动调优 (Kernel Auto-tuning): 选择最佳CUDA内核",
        "4. 张量格式优化 (Tensor Layout): 内存访问优化",
        "5. 动态内存管理 (Dynamic Memory): 减少内存占用"
    ]

    for opt in optimizations:
        print(f"   {opt}")

    # Illustrative latencies in ms (not measurements), one list per precision.
    models = ['ResNet50', 'BERT', 'YOLOv5', 'GPT-2']
    # (legend label, latencies, colour, offset in bar widths from the tick)
    series = [
        ('FP32', [15, 25, 20, 40], 'lightcoral', -1),
        ('FP16', [8, 12, 10, 20], 'lightblue', 0),
        ('INT8', [5, 7, 6, 12], 'lightgreen', 1),
    ]

    x = np.arange(len(models))
    width = 0.25

    fig, ax = plt.subplots(figsize=(12, 6))

    for label, latencies, colour, offset in series:
        ax.bar(x + offset * width, latencies, width, label=label, color=colour)

    ax.set_xlabel('模型')
    ax.set_ylabel('延迟 (ms)')
    ax.set_title('TensorRT加速效果')
    ax.set_xticks(x)
    ax.set_xticklabels(models)
    ax.legend()

    # Value label 1 ms above each bar, centred on the bar.
    for _, latencies, _, offset in series:
        for i, latency in enumerate(latencies):
            ax.text(i + offset * width, latency + 1, f'{latency}ms',
                    ha='center', va='bottom', fontsize=8)

    plt.tight_layout()
    plt.show()

    print("\n📐 TensorRT使用流程:")
    print("""
    # 1. 导出ONNX
    torch.onnx.export(model, dummy_input, "model.onnx")
    
    # 2. 使用trtexec转换
    # trtexec --onnx=model.onnx --saveEngine=model.plan --fp16
    
    # 3. Python API
    import tensorrt as trt
    
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    # ONNX解析器需要显式batch网络
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)
    if not parser.parse_from_file("model.onnx"):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError("ONNX解析失败")
    
    config = builder.create_builder_config()
    config.set_flag(trt.BuilderFlag.FP16)
    # build_engine 已被移除,改用 build_serialized_network
    serialized_engine = builder.build_serialized_network(network, config)
    with open("model.plan", "wb") as f:
        f.write(serialized_engine)
    """)

    print("\n💡 TensorRT适用场景:")
    print("   - NVIDIA GPU环境")
    print("   - 对延迟要求高的场景")
    print("   - 批量推理优化")

tensorrt_demo()

五、部署方案对比

5.1 部署方案总结

python 复制代码
def deployment_comparison():
    """Print a table of deployment options and draw a selection guide.

    The table is written to stdout; the decision tree is rendered as
    monospace text on an axis-less Matplotlib figure and shown with
    ``plt.show()``. Returns nothing.
    """
    heading_rule = "=" * 60
    print("\n" + heading_rule)
    print("部署方案对比")
    print(heading_rule)

    # One row per deployment option with its main characteristics.
    table_text = """
    ╔══════════════════╦══════════════════════════════════════════════════════════════╗
    ║     方案         ║                    特点                                      ║
    ╠══════════════════╬══════════════════════════════════════════════════════════════╣
    ║ PyTorch Native   ║ 简单、灵活、调试方便,速度一般                               ║
    ║ TorchScript      ║ 生产就绪、C++部署、图优化                                    ║
    ║ ONNX + ONNX RT   ║ 跨框架、跨平台、CPU/GPU都支持                                ║
    ║ TensorRT         ║ NVIDIA GPU上最快、延迟最低                                  ║
    ║ OpenVINO         ║ Intel CPU/GPU优化、边缘设备友好                             ║
    ║ TFLite           ║ 移动端、嵌入式、Android/iOS                                 ║
    ║ Core ML          ║ Apple生态(iOS/macOS)                                      ║
    ╚══════════════════╩══════════════════════════════════════════════════════════════╝
    """
    print(table_text)

    # ASCII-art decision tree, keyed on the target deployment environment.
    tree_text = """
    🎯 部署方案选择决策树:
    
                         开始
                          │
                          ▼
                    在什么环境部署?
                          │
            ┌─────────────┼─────────────┐
            ▼             ▼             ▼
        NVIDIA GPU    Intel CPU      移动端
            │             │             │
            ▼             ▼             ▼
      需要最低延迟?   需要跨平台?    iOS/Android?
            │             │             │
       ┌────┴────┐    ┌───┴───┐    ┌───┴───┐
       ▼         ▼    ▼       ▼    ▼       ▼
      是        否   是      否   是      否
       │         │    │       │    │       │
       ▼         ▼    ▼       ▼    ▼       ▼
    TensorRT  ONNX RT  ONNX  原生  Core ML TFLite
              +TRT
    """

    # Blank canvas: the tree is placed as centred text in axes coordinates.
    figure, canvas = plt.subplots(figsize=(12, 8))
    canvas.axis('off')
    canvas.text(
        0.5, 0.5, tree_text,
        ha='center', va='center', fontsize=11,
        transform=canvas.transAxes, fontfamily='monospace',
    )
    canvas.set_title('部署方案选择指南', fontsize=14)

    plt.tight_layout()
    plt.show()

deployment_comparison()

六、完整部署流程示例

6.1 端到端部署

python 复制代码
def complete_deployment():
    """Print an end-to-end deployment walkthrough.

    Output consists of a heading, the six numbered pipeline stages, a FastAPI
    serving example and a Dockerfile example. Everything goes to stdout;
    nothing is executed or returned.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("完整部署流程示例")
    print(rule)

    # (stage title, short description); numbering is added when printing.
    stages = (
        ("训练模型", "在PyTorch中训练并验证"),
        ("导出模型", "转换为ONNX/TorchScript"),
        ("优化模型", "量化、剪枝、融合"),
        ("部署服务", "使用FastAPI/Flask封装API"),
        ("性能测试", "压测延迟和吞吐量"),
        ("监控运维", "日志、指标、告警"),
    )

    print("\n📋 部署流程:")
    for index, (title, description) in enumerate(stages, start=1):
        print(f"   {index}. {title}: {description}")

    # Example inference service, printed verbatim.
    api_example = """
    from fastapi import FastAPI, HTTPException
    from pydantic import BaseModel
    import torch
    import numpy as np
    
    app = FastAPI()
    
    # 加载模型
    model = torch.jit.load('model_traced.pt')
    model.eval()
    
    class PredictRequest(BaseModel):
        data: list
    
    class PredictResponse(BaseModel):
        prediction: int
        confidence: float
    
    @app.post("/predict", response_model=PredictResponse)
    async def predict(request: PredictRequest):
        try:
            # 转换输入
            input_tensor = torch.tensor(request.data).float()
            
            # 推理
            with torch.no_grad():
                output = model(input_tensor)
                probs = torch.softmax(output, dim=1)
                pred = torch.argmax(probs, dim=1)
                conf = probs[0, pred].item()
            
            return PredictResponse(
                prediction=pred.item(),
                confidence=conf
            )
        except Exception as e:
            raise HTTPException(status_code=400, detail=str(e))
    
    # 运行: uvicorn main:app --host 0.0.0.0 --port 8000
    """
    print("\n📐 API服务示例 (FastAPI):")
    print(api_example)

    # Example container image definition, printed verbatim.
    docker_example = """
    # Dockerfile
    FROM pytorch/pytorch:1.13-cuda11.6-cudnn8-runtime
    
    WORKDIR /app
    COPY requirements.txt .
    RUN pip install -r requirements.txt
    COPY . .
    
    CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
    
    # 构建和运行
    # docker build -t model-api .
    # docker run -p 8000:8000 --gpus all model-api
    """
    print("\n📐 Docker部署:")
    print(docker_example)

complete_deployment()

七、总结

| 技术 | 用途 | 加速比 | 精度损失 |
|------|------|--------|----------|
| TorchScript | PyTorch生产部署 | 1.2-1.5x | 0% |
| ONNX | 跨框架部署 | 1.3-1.8x | 0% |
| FP16 | 半精度推理 | 2x | <0.1% |
| INT8 | 整数量化 | 3-4x | 0.5-1% |
| TensorRT | NVIDIA加速 | 5-10x | <0.5% |

部署流程总结:

复制代码
训练 → 导出 → 优化 → 打包 → 部署 → 监控
  ↓      ↓      ↓      ↓      ↓      ↓
PyTorch ONNX  量化  Docker API  日志

最佳实践:

  1. 训练时考虑部署需求
  2. 使用ONNX作为中间格式
  3. 根据硬件选择合适的优化
  4. 进行充分的性能测试
  5. 建立监控和告警机制
相关推荐
redreamSo16 分钟前
大模型是不是到顶了?瓶颈到底在哪
人工智能·openai
Oo92020 分钟前
Tool Use 背后的技术逻辑
人工智能
姗姗来迟了21 分钟前
Vue3封装AI流式对话组件踩坑实录
人工智能
码上天下1 小时前
用Pinia管理AI多会话状态
人工智能
用户054324329702 小时前
Next.js接大模型流式SSE实操踩坑
人工智能
Assby2 小时前
从 Function Calling 到 MCP:理解 Agent 工具调用的底层通信机制
人工智能·后端
小星AI2 小时前
Claude Code 从入门到精通,一步到位
人工智能
后端小肥肠2 小时前
Codex + Obsidian 做人生副本视频:输入主题文案,直通剪映草稿
人工智能·aigc·agent
百度Geek说3 小时前
全链路研发智能体 ——从"体感能用"到"实际可用"的工程实践
人工智能
甲维斯4 小时前
500块的豆包,能帮我搞定这个么?!
人工智能