03-深度学习基础:模型部署与量化

模型部署与量化:从训练到生产

一、为什么需要模型部署?

1.1 训练 vs 推理的区别

python
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import time
import warnings
warnings.filterwarnings('ignore')

# Section banner: from research to production.
print("=" * 60, "模型部署:从研究到生产", "=" * 60, sep="\n")

# 训练 vs 推理对比 — side-by-side characteristics table.
comparison = """
╔══════════════════╦══════════════════════════════════════════════════════════════╗
║     阶段         ║                    特点                                      ║
╠══════════════════╬══════════════════════════════════════════════════════════════╣
║ 训练 (Training)  ║ • 需要反向传播、梯度计算                                      ║
║                  ║ • 需要大量内存(梯度、优化器状态)                            ║
║                  ║ • 支持多种精度(FP32/FP16)                                   ║
║                  ║ • 可接受较慢速度                                             ║
╠══════════════════╬══════════════════════════════════════════════════════════════╣
║ 推理 (Inference) ║ • 只需前向传播                                               ║
║                  ║ • 内存占用小                                                 ║
║                  ║ • 可用更低精度(INT8/INT4)                                   ║
║                  ║ • 要求低延迟、高吞吐                                          ║
╚══════════════════╩══════════════════════════════════════════════════════════════╝
"""

print(comparison)

# 训练 vs 推理内存对比
def memory_comparison():
    """Plot a grouped bar chart comparing training vs. inference memory use.

    The numbers are illustrative (MB): training carries gradients,
    optimizer state and large activations on top of the parameters.
    """
    labels = ['模型参数', '梯度', '优化器状态', '激活值', '总计']
    mem_train = [100, 100, 200, 150, 550]
    mem_infer = [100, 0, 0, 50, 150]

    positions = np.arange(len(labels))
    bar_w = 0.35

    fig, ax = plt.subplots(figsize=(10, 6))
    train_bars = ax.bar(positions - bar_w / 2, mem_train, bar_w,
                        label='训练', color='lightcoral')
    infer_bars = ax.bar(positions + bar_w / 2, mem_infer, bar_w,
                        label='推理', color='lightgreen')

    ax.set_ylabel('内存占用 (MB)')
    ax.set_title('训练 vs 推理内存对比')
    ax.set_xticks(positions)
    ax.set_xticklabels(labels)
    ax.legend()

    # Annotate every bar of both series with its value.
    for container, values in ((train_bars, mem_train), (infer_bars, mem_infer)):
        for rect, val in zip(container, values):
            ax.text(rect.get_x() + rect.get_width() / 2, rect.get_height() + 5,
                    f'{val}MB', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.show()

memory_comparison()

print("\n💡 部署的核心挑战:")
print("   1. 延迟 (Latency): 单次推理耗时")
print("   2. 吞吐量 (Throughput): 每秒处理请求数")
print("   3. 内存占用: 模型大小和运行时内存")
print("   4. 硬件限制: CPU/GPU/边缘设备")

二、模型导出格式

2.1 PyTorch模型导出

python
def pytorch_export_demo():
    """PyTorch模型导出演示"""
    
    print("\n" + "=" * 60)
    print("PyTorch模型导出")
    print("=" * 60)
    
    # 创建一个简单模型
    class SimpleModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc1 = nn.Linear(10, 20)
            self.relu = nn.ReLU()
            self.fc2 = nn.Linear(20, 2)
        
        def forward(self, x):
            x = self.relu(self.fc1(x))
            return self.fc2(x)
    
    model = SimpleModel()
    model.eval()  # 切换到推理模式
    
    # 1. 保存完整模型(不推荐)
    torch.save(model, 'model_full.pt')
    print("1. 保存完整模型: model_full.pt")
    
    # 2. 保存state_dict(推荐)
    torch.save(model.state_dict(), 'model_state.pt')
    print("2. 保存state_dict: model_state.pt")
    
    # 3. TorchScript (JIT)
    example_input = torch.randn(1, 10)
    traced_model = torch.jit.trace(model, example_input)
    traced_model.save('model_traced.pt')
    print("3. TorchScript追踪: model_traced.pt")
    
    # 4. TorchScript (脚本化)
    scripted_model = torch.jit.script(model)
    scripted_model.save('model_scripted.pt')
    print("4. TorchScript脚本: model_scripted.pt")
    
    # 性能对比
    print("\n📊 推理速度对比:")
    
    def benchmark(model, input_tensor, n_runs=1000):
        # 预热
        for _ in range(100):
            _ = model(input_tensor)
        
        start = time.time()
        for _ in range(n_runs):
            _ = model(input_tensor)
        elapsed = time.time() - start
        return elapsed / n_runs * 1000  # 毫秒
    
    models = {
        'PyTorch': model,
        'TorchScript (trace)': traced_model,
        'TorchScript (script)': scripted_model
    }
    
    for name, m in models.items():
        avg_time = benchmark(m, example_input)
        print(f"   {name}: {avg_time:.3f} ms")

pytorch_export_demo()

2.2 ONNX导出

python
def onnx_export_demo():
    """Walk through exporting a small CNN to the ONNX format.

    Prints the canonical export recipe, performs an actual export of a
    toy conv classifier to model.onnx, and validates the result with the
    onnx package when it is installed. Failures are reported, not raised.
    """

    print("\n" + "=" * 60)
    print("ONNX模型导出")
    print("=" * 60)

    print("\n📐 ONNX (Open Neural Network Exchange)")
    print("   优点:")
    print("   - 跨框架兼容(PyTorch → ONNX → TensorFlow)")
    print("   - 支持多种推理后端")
    print("   - 可进行图优化")

    # Tiny conv classifier used as the export subject.
    class SimpleModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv = nn.Conv2d(3, 16, 3, padding=1)
            self.bn = nn.BatchNorm2d(16)
            self.relu = nn.ReLU()
            self.pool = nn.AdaptiveAvgPool2d(1)
            self.fc = nn.Linear(16, 10)

        def forward(self, x):
            features = self.relu(self.bn(self.conv(x)))
            pooled = self.pool(features)
            flat = pooled.view(pooled.size(0), -1)
            return self.fc(flat)

    model = SimpleModel()
    model.eval()

    # Fixed-shape dummy input driving the export.
    dummy = torch.randn(1, 3, 32, 32)

    print("\n📐 ONNX导出代码:")
    print("""
    import torch.onnx
    
    torch.onnx.export(
        model,                       # 模型
        example_input,               # 示例输入
        "model.onnx",                # 输出文件
        input_names=['input'],       # 输入名称
        output_names=['output'],     # 输出名称
        dynamic_axes={               # 动态轴(可变batch)
            'input': {0: 'batch_size'},
            'output': {0: 'batch_size'}
        },
        opset_version=11
    )
    """)

    # Perform the export for real; any failure is printed below.
    try:
        export_options = dict(
            input_names=['input'],
            output_names=['output'],
            opset_version=11,
            verbose=False,
        )
        torch.onnx.export(model, dummy, "model.onnx", **export_options)
        print("\n✅ ONNX模型已导出: model.onnx")

        # Round-trip validation (requires the onnx package).
        import onnx
        onnx.checker.check_model(onnx.load("model.onnx"))
        print("✅ ONNX模型验证通过")

    except Exception as e:
        print(f"导出失败: {e}")

    print("\n💡 ONNX推理后端:")
    print("   - ONNX Runtime (CPU/GPU)")
    print("   - TensorRT (NVIDIA GPU)")
    print("   - OpenVINO (Intel)")
    print("   - TVM")

onnx_export_demo()

三、模型量化

3.1 量化原理

python
def quantization_principle():
    """量化原理讲解

    Prints the affine-quantization formulas, plots (a) relative model
    size and (b) typical accuracy loss for FP32/FP16/INT8/INT4, then
    summarizes PTQ vs. QAT and the PyTorch quantization APIs.
    """

    print("\n" + "=" * 60)
    print("模型量化原理")
    print("=" * 60)

    # 量化公式 (affine quantization).
    # FIX: the original three lines were mutually inconsistent —
    # quantization divides by scale, dequantization multiplies by it.
    print("\n📐 量化公式:")
    print("   q = round(x / scale) + zero_point")
    print("   scale = (max - min) / (q_max - q_min)")
    print("   x = scale * (q - zero_point)")

    # 精度与模型大小对比 (relative to FP32 = 100%; acc_loss in %).
    precisions_list = ['FP32', 'FP16', 'INT8', 'INT4']
    model_sizes = [100, 50, 25, 12.5]
    acc_loss = [0, 0.1, 0.5, 1.5]

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # FIX: keep the BarContainer returned by bar(); Axes has no `.bars`
    # attribute, so the original `axes[0].bars` raised AttributeError.
    size_bars = axes[0].bar(precisions_list, model_sizes, color='lightblue')
    axes[0].set_ylabel('相对模型大小 (%)')
    axes[0].set_title('不同精度模型大小对比')
    axes[0].grid(True, alpha=0.3, axis='y')

    for bar, size in zip(size_bars, model_sizes):
        axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                    f'{size}%', ha='center', va='bottom')

    loss_bars = axes[1].bar(precisions_list, acc_loss, color='lightcoral')
    axes[1].set_ylabel('精度损失 (%)')
    axes[1].set_title('量化精度损失')
    axes[1].grid(True, alpha=0.3, axis='y')

    for bar, loss in zip(loss_bars, acc_loss):
        axes[1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.05,
                    f'{loss}%', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    print("\n📊 量化类型:")
    print("   1. 训练后量化 (PTQ): 不需要重新训练,直接量化")
    print("   2. 量化感知训练 (QAT): 训练中模拟量化,精度更高")

    print("\n📐 PyTorch量化:")
    print("""
    # 训练后动态量化(权重INT8,激活FP32)
    model = torch.quantization.quantize_dynamic(
        model, {nn.Linear, nn.Conv2d}, dtype=torch.qint8
    )
    
    # 训练后静态量化(权重和激活都是INT8)
    model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
    torch.quantization.prepare(model, inplace=True)
    # 校准...
    torch.quantization.convert(model, inplace=True)
    
    # 量化感知训练
    model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
    torch.quantization.prepare_qat(model, inplace=True)
    # 训练...
    torch.quantization.convert(model, inplace=True)
    """)

quantization_principle()

3.2 量化实战

python
def quantization_practice():
    """量化实战演示

    Builds a small MNIST-style CNN, reports its parameter count and the
    theoretical on-disk size at FP32/FP16/INT8, then plots size vs.
    indicative inference speedup for each precision.
    """

    print("\n" + "=" * 60)
    print("量化实战")
    print("=" * 60)

    # LeNet-style CNN; fc1's 9216 inputs match a flattened 1x28x28 input
    # after two valid 3x3 convs and a 2x2 max-pool.
    class SimpleCNN(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv1 = nn.Conv2d(1, 32, 3, 1)
            self.conv2 = nn.Conv2d(32, 64, 3, 1)
            self.dropout1 = nn.Dropout(0.25)
            self.dropout2 = nn.Dropout(0.5)
            self.fc1 = nn.Linear(9216, 128)
            self.fc2 = nn.Linear(128, 10)

        def forward(self, x):
            x = self.conv1(x)
            x = nn.functional.relu(x)
            x = self.conv2(x)
            x = nn.functional.relu(x)
            x = nn.functional.max_pool2d(x, 2)
            x = self.dropout1(x)
            x = torch.flatten(x, 1)
            x = self.fc1(x)
            x = nn.functional.relu(x)
            x = self.dropout2(x)
            x = self.fc2(x)
            return x

    model = SimpleCNN()
    model.eval()

    # 模拟训练后量化
    print("\n模拟量化效果:")

    # Theoretical model size = parameter count x bytes-per-value.
    param_count = sum(p.numel() for p in model.parameters())
    fp32_size = param_count * 4 / 1024 / 1024  # 4 bytes/param -> MB
    fp16_size = param_count * 2 / 1024 / 1024
    int8_size = param_count * 1 / 1024 / 1024

    print(f"参数量: {param_count:,}")
    print(f"FP32模型大小: {fp32_size:.2f} MB")
    print(f"FP16模型大小: {fp16_size:.2f} MB")
    print(f"INT8模型大小: {int8_size:.2f} MB")

    # 量化效果可视化
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # 模型大小对比
    models = ['FP32', 'FP16', 'INT8']
    sizes = [fp32_size, fp16_size, int8_size]
    colors = ['lightcoral', 'lightblue', 'lightgreen']

    # FIX: capture the BarContainer returned by bar(); the original
    # iterated `axes[0].bars`, which does not exist (AttributeError).
    size_bars = axes[0].bar(models, sizes, color=colors)
    axes[0].set_ylabel('模型大小 (MB)')
    axes[0].set_title('不同精度模型大小对比')
    for bar, size in zip(size_bars, sizes):
        axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
                    f'{size:.1f}MB', ha='center', va='bottom')

    # 推理速度对比 — indicative relative speedups, not measurements.
    speeds = [1.0, 1.8, 3.5]
    speed_bars = axes[1].bar(models, speeds, color=colors)
    axes[1].set_ylabel('相对推理速度')
    axes[1].set_title('量化加速效果')
    for bar, speed in zip(speed_bars, speeds):
        axes[1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                    f'{speed}x', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    print("\n💡 量化建议:")
    print("   - 精度要求高 → FP16或INT8")
    print("   - 边缘设备 → INT8/INT4")
    print("   - 服务器部署 → FP16")
    print("   - 需要最大加速 → INT4")

quantization_practice()

四、TensorRT加速

4.1 TensorRT简介

python
def tensorrt_demo():
    """TensorRT加速介绍

    Lists TensorRT's main optimization techniques, plots illustrative
    FP32/FP16/INT8 latencies for a few well-known models, and prints
    the typical conversion workflow.
    """

    print("\n" + "=" * 60)
    print("TensorRT:NVIDIA推理加速器")
    print("=" * 60)

    print("\n📐 TensorRT优化技术:")
    techniques = (
        "1. 层融合 (Layer Fusion): 合并相邻操作",
        "2. 精度校准 (Precision Calibration): INT8/FP16优化",
        "3. 内核自动调优 (Kernel Auto-tuning): 选择最佳CUDA内核",
        "4. 张量格式优化 (Tensor Layout): 内存访问优化",
        "5. 动态内存管理 (Dynamic Memory): 减少内存占用",
    )
    for item in techniques:
        print(f"   {item}")

    # Illustrative per-precision latencies (ms) — not measurements.
    models = ['ResNet50', 'BERT', 'YOLOv5', 'GPT-2']
    latency = {
        'FP32': [15, 25, 20, 40],
        'FP16': [8, 12, 10, 20],
        'INT8': [5, 7, 6, 12],
    }
    order = ('FP32', 'FP16', 'INT8')

    positions = np.arange(len(models))
    width = 0.25
    offsets = {'FP32': -width, 'FP16': 0.0, 'INT8': width}
    palette = {'FP32': 'lightcoral', 'FP16': 'lightblue', 'INT8': 'lightgreen'}

    fig, ax = plt.subplots(figsize=(12, 6))

    for prec in order:
        ax.bar(positions + offsets[prec], latency[prec], width,
               label=prec, color=palette[prec])

    ax.set_xlabel('模型')
    ax.set_ylabel('延迟 (ms)')
    ax.set_title('TensorRT加速效果')
    ax.set_xticks(positions)
    ax.set_xticklabels(models)
    ax.legend()

    # Annotate each bar with its latency value.
    for i in range(len(models)):
        for prec in order:
            value = latency[prec][i]
            ax.text(i + offsets[prec], value + 1, f'{value}ms',
                    ha='center', va='bottom', fontsize=8)

    plt.tight_layout()
    plt.show()

    print("\n📐 TensorRT使用流程:")
    print("""
    # 1. 导出ONNX
    torch.onnx.export(model, dummy_input, "model.onnx")
    
    # 2. 使用trtexec转换
    # trtexec --onnx=model.onnx --saveEngine=model.plan --fp16
    
    # 3. Python API
    import tensorrt as trt
    
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network()
    parser = trt.OnnxParser(network, logger)
    parser.parse_from_file("model.onnx")
    
    config = builder.create_builder_config()
    config.set_flag(trt.BuilderFlag.FP16)
    engine = builder.build_engine(network, config)
    """)

    print("\n💡 TensorRT适用场景:")
    print("   - NVIDIA GPU环境")
    print("   - 对延迟要求高的场景")
    print("   - 批量推理优化")

tensorrt_demo()

五、部署方案对比

5.1 部署方案总结

python
def deployment_comparison():
    """部署方案对比

    Prints a feature table of common serving stacks and renders a
    plain-text decision tree for choosing among them.
    """

    print("\n" + "=" * 60)
    print("部署方案对比")
    print("=" * 60)

    # Feature summary of the common deployment options.
    table_text = """
    ╔══════════════════╦══════════════════════════════════════════════════════════════╗
    ║     方案         ║                    特点                                      ║
    ╠══════════════════╬══════════════════════════════════════════════════════════════╣
    ║ PyTorch Native   ║ 简单、灵活、调试方便,速度一般                               ║
    ║ TorchScript      ║ 生产就绪、C++部署、图优化                                    ║
    ║ ONNX + ONNX RT   ║ 跨框架、跨平台、CPU/GPU都支持                                ║
    ║ TensorRT         ║ NVIDIA GPU上最快、延迟最低                                  ║
    ║ OpenVINO         ║ Intel CPU/GPU优化、边缘设备友好                             ║
    ║ TFLite           ║ 移动端、嵌入式、Android/iOS                                 ║
    ║ Core ML          ║ Apple生态(iOS/macOS)                                      ║
    ╚══════════════════╩══════════════════════════════════════════════════════════════╝
    """
    print(table_text)

    # Draw the decision tree as monospace text on a blank canvas.
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.axis('off')

    tree_text = """
    🎯 部署方案选择决策树:
    
                         开始
                          │
                          ▼
                    在什么环境部署?
                          │
            ┌─────────────┼─────────────┐
            ▼             ▼             ▼
        NVIDIA GPU    Intel CPU      移动端
            │             │             │
            ▼             ▼             ▼
      需要最低延迟?   需要跨平台?    iOS/Android?
            │             │             │
       ┌────┴────┐    ┌───┴───┐    ┌───┴───┐
       ▼         ▼    ▼       ▼    ▼       ▼
      是        否   是      否   是      否
       │         │    │       │    │       │
       ▼         ▼    ▼       ▼    ▼       ▼
    TensorRT  ONNX RT  ONNX  原生  Core ML TFLite
              +TRT
    """
    ax.text(0.5, 0.5, tree_text, ha='center', va='center', fontsize=11,
            transform=ax.transAxes, fontfamily='monospace')
    ax.set_title('部署方案选择指南', fontsize=14)

    plt.tight_layout()
    plt.show()

deployment_comparison()

六、完整部署流程示例

6.1 端到端部署

python
def complete_deployment():
    """完整部署流程

    Prints the end-to-end deployment pipeline and example snippets for
    serving a model behind FastAPI and packaging it with Docker.
    Output only — nothing is executed or written.
    """

    print("\n" + "=" * 60)
    print("完整部署流程示例")
    print("=" * 60)

    # Pipeline stages: (label, description) pairs printed in order.
    pipeline = (
        ("1. 训练模型", "在PyTorch中训练并验证"),
        ("2. 导出模型", "转换为ONNX/TorchScript"),
        ("3. 优化模型", "量化、剪枝、融合"),
        ("4. 部署服务", "使用FastAPI/Flask封装API"),
        ("5. 性能测试", "压测延迟和吞吐量"),
        ("6. 监控运维", "日志、指标、告警"),
    )

    print("\n📋 部署流程:")
    for label, detail in pipeline:
        print(f"   {label}: {detail}")

    # Example inference API built on FastAPI.
    print("\n📐 API服务示例 (FastAPI):")
    print("""
    from fastapi import FastAPI, HTTPException
    from pydantic import BaseModel
    import torch
    import numpy as np
    
    app = FastAPI()
    
    # 加载模型
    model = torch.jit.load('model_traced.pt')
    model.eval()
    
    class PredictRequest(BaseModel):
        data: list
    
    class PredictResponse(BaseModel):
        prediction: int
        confidence: float
    
    @app.post("/predict", response_model=PredictResponse)
    async def predict(request: PredictRequest):
        try:
            # 转换输入
            input_tensor = torch.tensor(request.data).float()
            
            # 推理
            with torch.no_grad():
                output = model(input_tensor)
                probs = torch.softmax(output, dim=1)
                pred = torch.argmax(probs, dim=1)
                conf = probs[0, pred].item()
            
            return PredictResponse(
                prediction=pred.item(),
                confidence=conf
            )
        except Exception as e:
            raise HTTPException(status_code=400, detail=str(e))
    
    # 运行: uvicorn main:app --host 0.0.0.0 --port 8000
    """)

    # Containerizing the service.
    print("\n📐 Docker部署:")
    print("""
    # Dockerfile
    FROM pytorch/pytorch:1.13-cuda11.6-cudnn8-runtime
    
    WORKDIR /app
    COPY requirements.txt .
    RUN pip install -r requirements.txt
    COPY . .
    
    CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
    
    # 构建和运行
    # docker build -t model-api .
    # docker run -p 8000:8000 --gpus all model-api
    """)

complete_deployment()

七、总结

| 技术 | 用途 | 加速比 | 精度损失 |
| --- | --- | --- | --- |
| TorchScript | PyTorch生产部署 | 1.2-1.5x | 0% |
| ONNX | 跨框架部署 | 1.3-1.8x | 0% |
| FP16 | 半精度推理 | 2x | <0.1% |
| INT8 | 整数量化 | 3-4x | 0.5-1% |
| TensorRT | NVIDIA加速 | 5-10x | <0.5% |

部署流程总结:

复制代码
训练 → 导出 → 优化 → 打包 → 部署 → 监控
  ↓      ↓      ↓      ↓      ↓      ↓
PyTorch ONNX  量化  Docker API  日志

最佳实践:

  1. 训练时考虑部署需求
  2. 使用ONNX作为中间格式
  3. 根据硬件选择合适的优化
  4. 进行充分的性能测试
  5. 建立监控和告警机制
相关推荐
疯狂成瘾者2 小时前
通用GPU后台解析与边缘计算方案对比分析
人工智能·边缘计算
.ZGR.2 小时前
【全栈实战】搭建属于你的AI图像生成平台:从Java Swing 到 Web 应用
java·人工智能·node.js
j_xxx404_2 小时前
【AI大模型入门(三)】大模型API接入、Ollama本地部署、SDK接入
人工智能·安全·ai
阿杰学AI2 小时前
AI核心知识133—大语言模型之 AI Coding(简洁且通俗易懂版)
人工智能·ai·语言模型·自然语言处理·ai编程·ai coding
朱阿朱2 小时前
机器学习数学基础
人工智能·机器学习·概率论·高数
中电金信2 小时前
中电金信:赋能精准决策,两大场景解锁金融营销新范式
大数据·人工智能
liangdabiao2 小时前
定制的乐高马赛克像素画生成器-微信小程序版本-AI 风格优化-一键完成所有工作
人工智能·微信小程序·小程序
醉卧考场君莫笑2 小时前
NLP(jieba库实现分词以及代码实现)
人工智能·自然语言处理