一、为什么需要预热
1.1 首次推理延迟来源
复制代码
首次推理慢的原因:
1. 模型加载: 从磁盘读取模型文件
2. 显存分配: 分配 HBM/DDR 内存
3. 算子编译: 首次使用时编译算子内核
4. 缓存冷启动: L2 Cache、TLB 为空
5. 数据未预热: CPU Cache 未命中
预热后的推理:
模型已在显存 → 算子已编译 → 缓存已填充 → 延迟大幅降低
1.2 延迟对比
复制代码
首次推理 vs 预热后推理:
模型加载: 2000ms → 0ms (预加载)
显存分配: 500ms → 0ms (预分配)
算子编译: 1000ms → 0ms (缓存)
推理执行: 100ms → 80ms (缓存命中)
─────────────────────────────────
总计: 3600ms → 80ms
加速比: 45x
二、基础预热方法
2.1 模型预加载
python
复制代码
import queue
import threading
import time

import torch
# NOTE(review): the Ascend PyTorch adapter is normally brought in with
# `import torch_npu`; confirm that `import torch.npu` resolves in the target
# environment.
import torch.npu
class ModelPreloader:
    """Load models ahead of time and warm them up with one dummy inference.

    Each model is loaded once, moved to ``device``, switched to eval mode and
    run on a dummy input, so that first-use costs are paid before real
    requests arrive.

    Args:
        device: Target device handed to ``Module.to`` / ``Tensor.to``.
            Defaults to ``'npu'``.
    """

    def __init__(self, device='npu'):
        self.device = device
        # model_id -> {'model', 'loaded_at', 'load_time'}
        self.preloaded_models = {}
        self.lock = threading.Lock()

    def preload(self, model_id, model_path, dummy_input=None):
        """Load ``model_path``, move it to the device and run one warm-up pass.

        A model id that is already registered is left untouched.

        Args:
            model_id: Key under which the model is stored.
            model_path: File path passed to ``torch.load``.
            dummy_input: Optional tensor used for the warm-up forward pass.
                When omitted, a random ``(1, 3, 224, 224)`` tensor is used,
                which only suits image models.
        """
        with self.lock:
            if model_id in self.preloaded_models:
                print(f"模型已预加载: {model_id}")
                return

            # perf_counter is monotonic, so the measured duration cannot be
            # skewed by wall-clock adjustments.
            start_time = time.perf_counter()

            model = self._load_model(model_path)
            model = model.to(self.device)
            # torch.no_grad() does not stop BatchNorm from updating its
            # running statistics or Dropout from dropping activations; eval
            # mode keeps the dummy pass from altering the model.
            model.eval()

            if dummy_input is None:
                dummy_input = self._get_dummy_input(model)
            else:
                dummy_input = dummy_input.to(self.device)

            # One throw-away forward pass to trigger first-use work.
            with torch.no_grad():
                _ = model(dummy_input)

            elapsed = time.perf_counter() - start_time
            self.preloaded_models[model_id] = {
                'model': model,
                'loaded_at': time.time(),
                'load_time': elapsed,
            }
            print(f"模型预加载完成: {model_id}, 耗时={elapsed:.2f}s")

    def get_model(self, model_id):
        """Return the preloaded model registered under ``model_id``.

        Raises:
            ValueError: If ``model_id`` has not been preloaded.
        """
        with self.lock:
            if model_id not in self.preloaded_models:
                raise ValueError(f"模型未预加载: {model_id}")
            return self.preloaded_models[model_id]['model']

    def _load_model(self, model_path):
        """Deserialize a full model object from ``model_path``."""
        # SECURITY: weights_only=False unpickles arbitrary objects, which can
        # execute code. It is required here because a whole module (not just
        # a state dict) is expected; only load files from trusted sources.
        return torch.load(model_path, weights_only=False)

    def _get_dummy_input(self, model):
        """Return the default warm-up input (a random image-shaped batch)."""
        # ``model`` is currently unused; the shape is a fixed default.
        return torch.randn(1, 3, 224, 224).to(self.device)
# Usage example
preloader = ModelPreloader()
# Preload the models (each call loads the file and runs one dummy inference).
# NOTE(review): _load_model uses torch.load; confirm that ".om" files are the
# intended input format for it.
preloader.preload("resnet50", "resnet50.om")
preloader.preload("bert", "bert.om")
# Fetch a preloaded model and run inference with it.
# NOTE(review): input_data is not defined in this snippet; presumably it is
# supplied by the surrounding application.
model = preloader.get_model("resnet50")
output = model(input_data)
2.2 显存预分配
python
复制代码
class MemoryPreallocator:
    """Allocate named device buffers up front and hand them out by id."""

    def __init__(self):
        # block_id -> {'tensor', 'size' (bytes), 'allocated_at'}
        self.allocated_blocks = {}

    def preallocate(self, block_id, size, dtype=torch.float32, device='npu'):
        """Allocate an uninitialized tensor and register it under ``block_id``.

        An id that is already registered is left untouched.

        Args:
            block_id: Key under which the buffer is stored.
            size: Shape passed to ``torch.empty``.
            dtype: Element type of the buffer.
            device: Device to allocate on. Defaults to ``'npu'``.
        """
        if block_id in self.allocated_blocks:
            print(f"显存块已存在: {block_id}")
            return

        block = torch.empty(
            size,
            dtype=dtype,
            device=device,
            requires_grad=False,
        )
        size_bytes = block.numel() * block.element_size()
        self.allocated_blocks[block_id] = {
            'tensor': block,
            'size': size_bytes,
            'allocated_at': time.time(),
        }
        size_mb = size_bytes / 1024**2
        print(f"显存预分配完成: {block_id}, 大小={size_mb:.1f}MB")

    def get_block(self, block_id):
        """Return the tensor registered under ``block_id``.

        Raises:
            ValueError: If no buffer with that id has been allocated.
        """
        if block_id not in self.allocated_blocks:
            raise ValueError(f"显存块未分配: {block_id}")
        return self.allocated_blocks[block_id]['tensor']

    def free(self, block_id):
        """Drop this object's reference to the buffer; unknown ids are ignored."""
        # Only the registry entry is removed. Tensors still referenced by
        # callers stay alive.
        if block_id in self.allocated_blocks:
            del self.allocated_blocks[block_id]
            print(f"显存块已释放: {block_id}")

    def get_stats(self):
        """Return the block count, total size in MB and the list of block ids."""
        total_size = sum(
            entry['size'] for entry in self.allocated_blocks.values()
        )
        return {
            'num_blocks': len(self.allocated_blocks),
            'total_size_mb': total_size / 1024**2,
            'blocks': list(self.allocated_blocks.keys()),
        }
# Usage example
preallocator = MemoryPreallocator()
# Preallocate the buffers (dtype defaults to torch.float32).
preallocator.preallocate("input_buffer", (1, 3, 224, 224))
preallocator.preallocate("output_buffer", (1, 1000))
# Reuse a preallocated buffer by copying fresh data into it in place.
# NOTE(review): input_data is not defined in this snippet; presumably it is a
# tensor whose shape is compatible with the buffer.
input_buffer = preallocator.get_block("input_buffer")
input_buffer.copy_(input_data)
三、算子编译缓存
3.1 启用算子缓存
bash
复制代码
# Environment variable configuration
# NOTE(review): verify these variable names against the CANN environment
# variable reference before relying on them. Presumably the operator compile
# cache is controlled by ASCEND_CACHE_PATH / ASCEND_MAX_OP_CACHE_SIZE (TODO
# confirm).
export ASCEND_OPP_CACHE_PATH=/tmp/ascend_opp_cache
export ASCEND_OPP_CACHE_SIZE=1024 # MB
# First run (compiles the operators and populates the cache)
./run_inference
# Subsequent runs (reuse the cache)
./run_inference # expected to be much faster
3.2 手动预热算子
python
复制代码
class OperatorWarmer:
    """Run individual operators once so their first-use cost is paid early.

    Args:
        device: Device on which operators and test inputs are placed.
            Defaults to ``'npu'``.
    """

    # op name -> (module factory, shape of a compatible test input).
    # Each operator gets an input matching its own configuration: a 64-channel
    # BatchNorm2d or a Linear(1000, 100) cannot consume an image-shaped
    # (1, 3, 224, 224) tensor.
    _OP_SPECS = {
        'Conv2d': (lambda: torch.nn.Conv2d(3, 64, 3, padding=1), (1, 3, 224, 224)),
        'BatchNorm': (lambda: torch.nn.BatchNorm2d(64), (1, 64, 224, 224)),
        'ReLU': (lambda: torch.nn.ReLU(), (1, 3, 224, 224)),
        'MaxPool': (lambda: torch.nn.MaxPool2d(2), (1, 3, 224, 224)),
        'Linear': (lambda: torch.nn.Linear(1000, 100), (1, 1000)),
    }

    def __init__(self, device='npu'):
        self.device = device
        # Names of operators that have already been warmed.
        self.warmed_ops = set()

    def warm_operator(self, op_name, *args, **kwargs):
        """Execute ``op_name`` once on a test input; no-op if already warmed.

        Args:
            op_name: Key of the operator to warm (see ``_OP_SPECS``).
            *args: Accepted for interface compatibility; not used.
            **kwargs: Forwarded to the operator call.

        Raises:
            ValueError: If ``op_name`` is not a known operator.
        """
        if op_name in self.warmed_ops:
            return

        start_time = time.perf_counter()
        test_inputs = self._create_test_inputs(op_name)
        op_func = self._get_operator(op_name)
        # Warm-up is inference-only, so skip autograd bookkeeping.
        with torch.no_grad():
            _ = op_func(*test_inputs, **kwargs)
        elapsed = time.perf_counter() - start_time

        self.warmed_ops.add(op_name)
        print(f"算子预热完成: {op_name}, 耗时={elapsed:.3f}s")

    def warm_all_operators(self, model):
        """Warm every operator reported by ``_extract_operators(model)``."""
        ops = self._extract_operators(model)
        for op_name in ops:
            self.warm_operator(op_name)
        print(f"共预热 {len(ops)} 个算子")

    def _extract_operators(self, model):
        """Return the operator names to warm for ``model``."""
        # Placeholder: a real implementation would inspect the model graph.
        # ``model`` is currently ignored and a fixed list is returned.
        return ['Conv2d', 'BatchNorm', 'ReLU', 'MaxPool', 'Linear']

    def _spec(self, op_name):
        """Look up the (factory, input shape) pair for ``op_name``."""
        try:
            return self._OP_SPECS[op_name]
        except KeyError:
            raise ValueError(f"不支持的算子: {op_name}") from None

    def _create_test_inputs(self, op_name):
        """Return a list with one random tensor shaped for ``op_name``."""
        _, shape = self._spec(op_name)
        return [torch.randn(*shape).to(self.device)]

    def _get_operator(self, op_name):
        """Build only the requested operator, on the device, in eval mode."""
        factory, _ = self._spec(op_name)
        return factory().to(self.device).eval()
# Usage example
# NOTE(review): `model` is not defined in this snippet; presumably it is a
# model obtained earlier (e.g. from a preloader).
warmer = OperatorWarmer()
warmer.warm_all_operators(model)
四、异步预热策略
4.1 后台预热线程
python
复制代码
class AsyncPreloader:
    """Preload and warm models on a background thread.

    Tasks are queued with :meth:`submit` and processed one at a time by a
    daemon worker. :meth:`get_model` blocks until the requested model is
    ready, has failed, or the timeout expires.

    Args:
        device: Device models and dummy inputs are moved to.
            Defaults to ``'npu'``.
    """

    def __init__(self, device='npu'):
        self.device = device
        self.preload_queue = queue.Queue()
        self.preloaded = {}   # model_id -> warmed model
        self.failed = {}      # model_id -> exception raised while loading
        self.lock = threading.Lock()
        # Condition sharing ``self.lock`` so waiters are woken on completion
        # instead of polling.
        self._ready = threading.Condition(self.lock)
        # Start the worker last, once every attribute it touches exists.
        self.worker = threading.Thread(target=self._preload_worker, daemon=True)
        self.worker.start()

    def submit(self, model_id, model_path, dummy_input=None):
        """Queue a model for background loading.

        Args:
            model_id: Key under which the model becomes available.
            model_path: File path passed to ``torch.load``.
            dummy_input: Optional warm-up tensor. Defaults to a random
                ``(1, 3, 224, 224)`` batch, which only suits image models.
        """
        with self.lock:
            # A resubmission is a retry: forget any earlier failure.
            self.failed.pop(model_id, None)
        self.preload_queue.put((model_id, model_path, dummy_input))
        print(f"预加载任务已提交: {model_id}")

    def _preload_worker(self):
        """Worker loop: load queued models and publish the result or error."""
        while True:
            model_id, model_path, dummy_input = self.preload_queue.get()
            try:
                model = self._load_and_warm(model_path, dummy_input)
            except Exception as exc:
                # Thread boundary: one bad task must not kill the worker.
                # The error is recorded and re-raised to the caller of
                # get_model().
                with self._ready:
                    self.failed[model_id] = exc
                    self._ready.notify_all()
                print(f"预加载失败: {model_id}, 错误={exc}")
            else:
                with self._ready:
                    self.preloaded[model_id] = model
                    self._ready.notify_all()
                print(f"预加载完成: {model_id}")
            finally:
                self.preload_queue.task_done()

    def _load_and_warm(self, model_path, dummy_input=None):
        """Load a model, move it to the device and run one dummy inference."""
        # SECURITY: weights_only=False unpickles arbitrary objects; only load
        # files from trusted sources. It is needed because a whole module is
        # expected, not a state dict.
        model = torch.load(model_path, weights_only=False).to(self.device)
        # no_grad alone does not freeze BatchNorm statistics or disable
        # Dropout; eval mode does.
        model.eval()

        if dummy_input is None:
            dummy_input = torch.randn(1, 3, 224, 224)
        dummy_input = dummy_input.to(self.device)

        with torch.no_grad():
            _ = model(dummy_input)
        return model

    def get_model(self, model_id, timeout=30):
        """Return the model for ``model_id``, waiting until it is ready.

        Args:
            model_id: Key used in :meth:`submit`.
            timeout: Maximum number of seconds to wait.

        Raises:
            RuntimeError: If loading failed; the original exception is
                attached as ``__cause__``.
            TimeoutError: If the model is not ready within ``timeout``.
        """
        deadline = time.monotonic() + timeout
        with self._ready:
            while True:
                if model_id in self.preloaded:
                    return self.preloaded[model_id]
                if model_id in self.failed:
                    raise RuntimeError(
                        f"预加载失败: {model_id}"
                    ) from self.failed[model_id]
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    raise TimeoutError(f"预加载超时: {model_id}")
                self._ready.wait(remaining)
# Usage example
preloader = AsyncPreloader()
# Submit preload tasks; they are processed by the background worker thread.
# NOTE(review): _load_and_warm uses torch.load; confirm that ".om" files are
# the intended input format for it.
preloader.submit("resnet50", "resnet50.om")
preloader.submit("bert", "bert.om")
# Later: fetch the model (get_model waits until it is available or times out).
model = preloader.get_model("resnet50")
4.2 启动时批量预热
python
复制代码
class StartupWarmer:
    """Run every warm-up step once at service start.

    Args:
        config: Mapping with optional keys ``'models'`` (list of dicts with
            ``'id'`` and ``'path'``), ``'buffers'`` (list of dicts with
            ``'id'`` and ``'shape'``) and ``'operators'`` (list of operator
            names).
    """

    def __init__(self, config):
        self.config = config
        self.preloader = AsyncPreloader()
        # Held on the instance so preallocated buffers stay reachable after
        # warmup() returns; as a local they were dropped at function exit.
        self.preallocator = MemoryPreallocator()
        self.operator_warmer = OperatorWarmer()

    def warmup(self):
        """Preload models, preallocate buffers and warm operators.

        Model loading is submitted first so it runs in the background while
        buffers and operators are handled; the method then waits for every
        model to become available.
        """
        start_time = time.perf_counter()
        print("开始启动预热...")

        # Missing sections are treated as empty, like 'operators'.
        models = self.config.get('models', [])

        # 1. Kick off background loading of all models.
        for model_config in models:
            self.preloader.submit(model_config['id'], model_config['path'])

        # 2. Preallocate device buffers.
        for buffer_config in self.config.get('buffers', []):
            self.preallocator.preallocate(
                buffer_config['id'],
                buffer_config['shape'],
            )

        # 3. Warm the listed operators.
        for op_name in self.config.get('operators', []):
            self.operator_warmer.warm_operator(op_name)

        # 4. Block until every submitted model is ready.
        for model_config in models:
            self.preloader.get_model(model_config['id'])

        elapsed = time.perf_counter() - start_time
        print(f"预热完成,耗时={elapsed:.2f}s")
# Usage example
config = {
    # Models to preload: 'id' is the lookup key, 'path' the file to load.
    'models': [
        {'id': 'resnet50', 'path': 'resnet50.om'},
        {'id': 'bert', 'path': 'bert.om'},
    ],
    # Buffers to preallocate: 'id' is the lookup key, 'shape' the tensor shape.
    'buffers': [
        {'id': 'input', 'shape': (1, 3, 224, 224)},
        {'id': 'output', 'shape': (1, 1000)},
    ],
    # Operator names to warm individually.
    'operators': ['Conv2d', 'BatchNorm', 'ReLU']
}
warmer = StartupWarmer(config)
warmer.warmup()
五、常见问题
| 问题 | 原因 | 解决方案 |
| --- | --- | --- |
| 预热时间太长 | 模型太大 | 使用异步预热、减少预热模型数量 |
| 预热后效果不明显 | 未触发算子编译 | 确保执行一次完整推理 |
| 显存不足 | 预分配太多 | 优化预分配策略 |
| 预热失败 | 模型路径错误 | 检查模型路径和文件完整性 |
| 服务启动慢 | 同步预热 | 使用异步预热 |
相关仓库