【LLM实操系列08】性能优化:让LLM应用快10倍
性能瓶颈速查
找出你的瓶颈
import time
import psutil
import GPUtil
class PerformanceProfiler:
    """Diagnose common resource bottlenecks of an LLM application.

    ``profile_llm_app`` calls ``self._test_api_latency()``, which is not
    defined in this class as shown; a subclass (or mixin) has to provide it
    and return the measured API latency in milliseconds.
    """

    def profile_llm_app(self, cpu_sample_interval=0.5):
        """Return human-readable descriptions of the detected bottlenecks.

        Args:
            cpu_sample_interval: Seconds over which CPU usage is sampled.
                ``psutil.cpu_percent()`` without an interval returns a
                meaningless ``0.0`` on its first call, so the CPU check
                would never fire; sampling over a short window gives a real
                reading at the cost of blocking for that long.

        Returns:
            A list of strings, empty when no check crosses its threshold.
        """
        bottlenecks = []

        # 1. GPU memory utilisation (only the first GPU is inspected).
        gpus = GPUtil.getGPUs()
        if gpus and gpus[0].memoryUtil < 0.5:
            bottlenecks.append("GPU显存未充分利用")

        # 2. CPU saturation, measured over the sampling window.
        if psutil.cpu_percent(interval=cpu_sample_interval) > 80:
            bottlenecks.append("CPU成为瓶颈")

        # 3. System memory pressure.
        if psutil.virtual_memory().percent > 85:
            bottlenecks.append("内存不足")

        # 4. Network latency of the API call, compared against 500 ms.
        api_latency = self._test_api_latency()
        if api_latency > 500:
            bottlenecks.append(f"API延迟高:{api_latency}ms")

        return bottlenecks
# Quick reference: common bottleneck -> recommended remedy.
BOTTLENECK_SOLUTIONS = {
    # slow inference
    "推理慢": "使用vLLM或TensorRT",
    # not enough GPU memory
    "显存不足": "量化或使用更小模型",
    # high API cost
    "API成本高": "缓存+本地模型",
    # low concurrency
    "并发低": "批处理+异步",
    # slow first response
    "首次响应慢": "模型预热+持久化",
}
推理加速:vLLM
# Install: pip install vllm

# ❌ Slow: plain Transformers (~10 tokens/s)
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("llama-7b")
model = AutoModelForCausalLM.from_pretrained("llama-7b")
# Tokenize a prompt first so that `input_ids` is defined before generate().
input_ids = tokenizer("Hello", return_tensors="pt").input_ids
# max_new_tokens counts generated tokens only (max_length would also count
# the prompt), which matches vLLM's max_tokens below.
output = model.generate(input_ids, max_new_tokens=100)

# ✅ Fast: vLLM (100+ tokens/s)
from vllm import LLM, SamplingParams

llm = LLM(model="llama-7b", tensor_parallel_size=1)
sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=100,
)

# Batched inference: all prompts are submitted in a single call.
prompts = ["Hello", "What is", "How to"]
outputs = llm.generate(prompts, sampling_params)
# Roughly a 10x speed-up according to the figures quoted above.
vLLM服务部署
# 1. Start the vLLM server (shell command, shown here as a comment)
# python -m vllm.entrypoints.openai.api_server \
# --model llama-7b \
# --port 8000 \
# --gpu-memory-utilization 0.9
# 2. Client call (the server exposes an OpenAI-compatible endpoint)
# NOTE(review): `openai.api_base` / `openai.ChatCompletion` look like the
# pre-1.0 openai SDK interface — confirm the installed SDK version.
# `response` is not consumed anywhere in this snippet.
import openai
openai.api_base = "http://localhost:8000/v1"
response = openai.ChatCompletion.create(
model="llama-7b",
messages=[{"role": "user", "content": "Hello"}],
stream=True # streaming is supported
)
# 3. Advanced configuration: build an engine directly from EngineArgs
from vllm import EngineArgs, LLMEngine
engine_args = EngineArgs(
model="llama-7b",
tokenizer="llama-7b",
tokenizer_mode="auto",
trust_remote_code=True,
dtype="float16", # half precision
max_model_len=2048,
gpu_memory_utilization=0.9, # GPU utilisation target
max_num_batched_tokens=8192, # batch size (in tokens)
max_num_seqs=256, # maximum concurrent sequences
)
engine = LLMEngine.from_engine_args(engine_args)
量化:内存优化
1. 动态量化(最简单)
import torch
from transformers import AutoModelForCausalLM
from transformers import BitsAndBytesConfig

# 8-bit quantization (bitsandbytes): roughly 50% less memory.
# The flag goes through BitsAndBytesConfig, exactly like the 4-bit case
# below; passing load_in_8bit straight to from_pretrained is deprecated.
int8_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    "llama-7b",
    quantization_config=int8_config,
    device_map="auto",
)

# 4-bit quantization (QLoRA-style NF4): roughly 75% less memory.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "llama-7b",
    quantization_config=bnb_config,
    # Same placement policy as the 8-bit load above.
    device_map="auto",
)
2. GGUF量化(CPU推理)
# 转换为GGUF格式
python convert.py models/llama-7b --outtype f16
./quantize model.gguf model_q4_k_m.gguf q4_k_m
# Load the quantized GGUF file produced above via llama.cpp
from llama_cpp import Llama
llm = Llama(
model_path="model_q4_k_m.gguf",
n_ctx=2048,
n_threads=8, # number of CPU threads
n_gpu_layers=35 # number of layers accelerated on the GPU
)
3. 量化效果对比
def compare_quantization():
    """Return the quantization comparison table as a nested dict.

    The outer key is the scheme name; each inner dict carries the
    ``size``, ``speed`` and ``accuracy`` figures as display strings.
    """
    columns = ("size", "speed", "accuracy")
    rows = (
        ("FP16", ("14GB", "100%", "100%")),
        ("INT8", ("7GB", "95%", "99%")),
        ("INT4", ("3.5GB", "90%", "95%")),
        ("GGUF-Q4", ("4GB", "85%", "94%")),
    )
    return {scheme: dict(zip(columns, figures)) for scheme, figures in rows}
缓存策略
1. KV Cache优化
from transformers import Cache
class OptimizedInference:
    """Greedy decoding that reuses the model's key/value cache."""

    def __init__(self, model):
        """Store the model that ``generate_with_cache`` will call.

        Args:
            model: Callable accepting ``(input_ids, past_key_values=...,
                use_cache=True)`` and returning an object with ``logits``
                and ``past_key_values`` attributes.
        """
        self.model = model
        # NOTE(review): this attribute is never read by
        # generate_with_cache, which threads the cache returned by the
        # model instead.
        self.cache = Cache()

    def generate_with_cache(self, input_ids, max_new_tokens=100):
        """Append ``max_new_tokens`` greedily chosen tokens to ``input_ids``.

        The full prompt is fed only on the first step. Every later step
        feeds just the newly generated token together with the returned
        ``past_key_values``; re-feeding the whole sequence alongside the
        cache would make the model treat the already-cached tokens as new
        input.

        Args:
            input_ids: Integer tensor indexed as (batch, sequence).
            max_new_tokens: Number of decoding steps to run.

        Returns:
            ``input_ids`` with the generated tokens concatenated on the
            last dimension.
        """
        past_key_values = None
        step_input = input_ids  # first step: the whole prompt
        for _ in range(max_new_tokens):
            with torch.no_grad():
                outputs = self.model(
                    step_input,
                    past_key_values=past_key_values,
                    use_cache=True,
                )
            # Carry the cache forward to the next step.
            past_key_values = outputs.past_key_values
            # Greedy pick from the logits of the last position.
            next_token = outputs.logits[:, -1, :].argmax(dim=-1)
            # Later steps: only the new token, the rest lives in the cache.
            step_input = next_token.unsqueeze(-1)
            input_ids = torch.cat([input_ids, step_input], dim=-1)
        return input_ids
2. 结果缓存
import hashlib
from functools import lru_cache
import redis
import json
class LLMCache:
    """Two-level (in-process dict + Redis) cache for LLM generations."""

    def __init__(self, redis_host="localhost", llm=None):
        """Create the cache.

        Args:
            redis_host: Host name passed to ``redis.Redis``.
            llm: Object exposing ``generate(prompt, **params)``. The
                original code read ``self.llm`` without ever assigning it;
                it can be supplied here or assigned to ``self.llm`` later.
        """
        self.redis_client = redis.Redis(host=redis_host)
        self.local_cache = {}
        self.llm = llm

    def get_cache_key(self, prompt: str, params: dict) -> str:
        """Return a deterministic key for ``prompt`` plus ``params``.

        ``params`` is serialised with sorted keys so that keyword order
        does not change the key. MD5 is used only as a fingerprint here.
        """
        content = f"{prompt}{json.dumps(params, sort_keys=True)}"
        return hashlib.md5(content.encode()).hexdigest()

    def cached_generate(self, prompt: str, **params):
        """Return the generation for ``prompt``, using the caches first.

        Lookup order: local dict, then Redis, then the LLM itself. A fresh
        result is written to Redis with a one-hour TTL and to the local
        dict.

        Raises:
            RuntimeError: On a cache miss when no ``llm`` was configured.
        """
        cache_key = self.get_cache_key(prompt, params)

        # 1. Local in-process cache.
        if cache_key in self.local_cache:
            return self.local_cache[cache_key]

        # 2. Redis cache; `get` returns None on a miss.
        cached = self.redis_client.get(cache_key)
        if cached is not None:
            result = json.loads(cached)
            self.local_cache[cache_key] = result
            return result

        # 3. Generate a new result.
        if self.llm is None:
            raise RuntimeError(
                "LLMCache has no llm configured; pass llm=... to the "
                "constructor or assign cache.llm before generating"
            )
        result = self.llm.generate(prompt, **params)

        # 4. Store it in both cache levels (Redis TTL: 1 hour).
        self.redis_client.setex(cache_key, 3600, json.dumps(result))
        self.local_cache[cache_key] = result
        return result
3. Embedding缓存
class EmbeddingCache:
    """In-memory cache of text embeddings keyed by the text itself."""

    def __init__(self, model=None):
        """Create an empty cache.

        Args:
            model: Object exposing ``encode``. The original code read
                ``self.model`` without ever assigning it; it can be
                supplied here or assigned to ``self.model`` later.
        """
        self.cache = {}
        self.model = model

    def _encode(self, payload):
        """Encode ``payload`` with the configured model, or fail clearly."""
        if self.model is None:
            raise RuntimeError(
                "EmbeddingCache has no model configured; pass model=... to "
                "the constructor or assign cache.model before encoding"
            )
        return self.model.encode(payload)

    def get_embedding(self, text: str):
        """Return the embedding of ``text``, encoding it on first use.

        The dict is the only cache: the former ``lru_cache`` decorator
        duplicated it and, being applied to a method, kept every instance
        alive for the lifetime of the decorator's cache.
        """
        if text not in self.cache:
            self.cache[text] = self._encode(text)
        return self.cache[text]

    def batch_encode(self, texts: list):
        """Return embeddings for ``texts`` in order, encoding misses in bulk.

        Texts already cached are served from the cache; the remaining
        distinct texts are sent to the model in a single ``encode`` call,
        so a text repeated within ``texts`` is encoded only once.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order.
        missing = [t for t in dict.fromkeys(texts) if t not in self.cache]
        if missing:
            vectors = self._encode(missing)
            for text, vector in zip(missing, vectors):
                self.cache[text] = vector
        return [self.cache[text] for text in texts]
并发优化
1. 异步处理
import asyncio
from typing import List
import aiohttp
class AsyncLLM:
    """Asynchronous chat-completions client with bounded concurrency."""

    def __init__(self, api_key: str, max_concurrent=10, model="gpt-3.5-turbo"):
        """Configure the client.

        Args:
            api_key: Bearer token sent in the ``Authorization`` header.
            max_concurrent: Maximum number of requests in flight at once.
            model: Model name placed in each request body.
        """
        self.api_key = api_key
        self.model = model
        self.semaphore = asyncio.Semaphore(max_concurrent)

    async def generate_single(self, session, prompt):
        """Send one prompt and return the text of the first choice.

        Args:
            session: An open ``aiohttp.ClientSession`` (or compatible).
            prompt: User message content.

        Raises:
            aiohttp.ClientResponseError: On a non-2xx response. Without
                this check an error payload surfaced as an opaque
                ``KeyError`` on ``"choices"``.
        """
        async with self.semaphore:
            headers = {"Authorization": f"Bearer {self.api_key}"}
            data = {
                "model": self.model,
                "messages": [{"role": "user", "content": prompt}],
            }
            async with session.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                json=data,
            ) as response:
                response.raise_for_status()
                result = await response.json()
                return result["choices"][0]["message"]["content"]

    async def batch_generate(self, prompts: List[str]):
        """Run all prompts concurrently and return answers in prompt order."""
        async with aiohttp.ClientSession() as session:
            tasks = [self.generate_single(session, prompt) for prompt in prompts]
            return await asyncio.gather(*tasks)
# Usage example
async def main():
    """Send three demo prompts concurrently through ``AsyncLLM``."""
    client = AsyncLLM(api_key="sk-xxx")
    demo_prompts = ["Hello", "What is AI?", "How to code?"]
    answers = await client.batch_generate(demo_prompts)
    # The 3 requests run concurrently, so the total time is about
    # max(time of a single request).
2. 批处理优化
class BatchProcessor:
    """Group individual prompts into batches before calling the model."""

    def __init__(self, model, batch_size=8, timeout=0.1):
        """Configure the batcher.

        Args:
            model: Object exposing ``generate(prompts, max_length=...)``
                that returns one result per prompt, in order.
            batch_size: Queue length that triggers an immediate batch.
            timeout: Seconds a partial batch may wait before it is
                processed anyway.
        """
        self.model = model
        self.batch_size = batch_size
        self.timeout = timeout
        self.queue = []
        self.results = {}
        # Pending timer that flushes a partial batch; None when idle.
        self._flush_task = None

    async def add_request(self, request_id: str, prompt: str):
        """Queue ``prompt`` and wait for its result.

        A full queue is processed immediately. Otherwise a timer is armed
        so that the partial batch is processed after ``timeout`` seconds;
        without it a request that never filled a batch would wait forever.

        Returns:
            The model result that corresponds to ``prompt``.
        """
        future = asyncio.get_running_loop().create_future()
        self.queue.append({
            "id": request_id,
            "prompt": prompt,
            "future": future,
        })

        if len(self.queue) >= self.batch_size:
            await self._process_batch()
        elif self._flush_task is None or self._flush_task.done():
            self._flush_task = asyncio.create_task(self._flush_after_timeout())

        return await future

    async def _flush_after_timeout(self):
        """Process whatever is still queued once ``timeout`` has elapsed."""
        await asyncio.sleep(self.timeout)
        while self.queue:
            await self._process_batch()

    async def _process_batch(self):
        """Run the model on up to ``batch_size`` queued prompts.

        Results are delivered through each request's future. If the model
        call fails, the exception is delivered to every request of the
        batch so that no caller is left waiting.
        """
        if not self.queue:
            return

        batch = self.queue[:self.batch_size]
        self.queue = self.queue[self.batch_size:]

        prompts = [item["prompt"] for item in batch]
        try:
            results = list(self.model.generate(prompts, max_length=100))
            if len(results) != len(batch):
                raise ValueError(
                    f"model returned {len(results)} results "
                    f"for {len(batch)} prompts"
                )
        except Exception as exc:  # forwarded to the waiting callers
            for item in batch:
                if not item["future"].done():
                    item["future"].set_exception(exc)
            return

        for item, result in zip(batch, results):
            # A future may already be done if its caller was cancelled.
            if not item["future"].done():
                item["future"].set_result(result)
流式优化
Server-Sent Events (SSE)
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import json
app = FastAPI()


@app.get("/stream")
async def stream_llm(prompt: str):
    """Stream the generation for ``prompt`` as Server-Sent Events.

    Each chunk is sent as ``data: {"text": ...}`` followed by a blank
    line, and the stream ends with ``data: [DONE]``.

    NOTE(review): relies on a module-level ``llm`` exposing
    ``generate_stream``; confirm which object provides it.
    """

    def generate():
        # Deliberately a plain (sync) generator: `llm.generate_stream` is
        # consumed with a blocking `for`, which inside an async generator
        # would stall the event loop. StreamingResponse iterates sync
        # iterators in a threadpool instead.
        for chunk in llm.generate_stream(prompt):
            # SSE frame
            data = json.dumps({"text": chunk})
            yield f"data: {data}\n\n"
        # End-of-stream marker
        yield "data: [DONE]\n\n"

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
    )
# Client
async def stream_client():
    """Consume the ``/stream`` SSE endpoint and print the text as it arrives.

    Lines that do not start with ``data: `` (such as the blank separator
    lines) are skipped. The ``[DONE]`` marker ends the loop; it is not
    JSON, so passing it to ``json.loads`` raised ``JSONDecodeError`` at
    the end of every stream.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(
            "http://localhost:8000/stream",
            params={"prompt": "Tell me a story"},
        ) as response:
            async for line in response.content:
                if not line.startswith(b"data: "):
                    continue
                payload = line[6:].strip()
                if payload == b"[DONE]":
                    break
                data = json.loads(payload)
                print(data["text"], end="")
部署优化
1. 模型预热
class ModelWarmer:
    """Run a few throw-away forward passes so the first real request is fast."""

    def warmup(self, model, num_iterations=3):
        """Call ``model`` ``num_iterations`` times on a dummy input.

        The dummy batch is a ``(1, 10)`` tensor of random token ids below
        1000. It is created on the device of the model's first parameter
        when the model exposes ``parameters()``, so a GPU model is not fed
        a CPU tensor. CUDA synchronisation is skipped when CUDA is not
        available, which lets the warm-up run on CPU-only hosts too.

        Args:
            model: Callable taking the dummy input tensor.
            num_iterations: Number of forward passes to run.
        """
        try:
            device = next(model.parameters()).device
        except (AttributeError, StopIteration):
            # Not an nn.Module, or a module without parameters.
            device = None

        dummy_input = torch.randint(0, 1000, (1, 10), device=device)
        for _ in range(num_iterations):
            with torch.no_grad():
                _ = model(dummy_input)

        if torch.cuda.is_available():
            torch.cuda.synchronize()  # wait for queued GPU work to finish
        print("模型预热完成")
2. 负载均衡
class LoadBalancer:
    """Simple round-robin load balancer over a list of server base URLs."""

    def __init__(self, servers: List[str]):
        """Store the server list and start the rotation at the first entry."""
        self.servers = servers
        self.current = 0

    def get_server(self) -> str:
        """Return the next server in round-robin order.

        The stored index is wrapped before use, so the rotation stays
        valid even if ``servers`` shrank since the previous call.

        Raises:
            IndexError: If no servers are configured.
        """
        if not self.servers:
            raise IndexError("no servers configured")
        index = self.current % len(self.servers)
        server = self.servers[index]
        self.current = (index + 1) % len(self.servers)
        return server

    async def request(self, prompt: str):
        """POST ``prompt`` to the next server's ``/generate`` and return its JSON.

        Raises:
            aiohttp.ClientResponseError: On a non-2xx response, instead of
                returning the error payload as if it were a result.
        """
        # Avoid a double slash when the base URL ends with "/".
        server = self.get_server().rstrip("/")
        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"{server}/generate",
                json={"prompt": prompt},
            ) as response:
                response.raise_for_status()
                return await response.json()
监控指标
class LLMMetrics:
    """Running latency and throughput statistics for LLM requests.

    Latencies are taken to be in milliseconds, which is the unit
    ``report`` prints them with.
    """

    def __init__(self):
        """Start with zeroed metrics and no recorded latencies."""
        self.metrics = {
            "total_requests": 0,
            "total_tokens": 0,
            "avg_latency": 0,
            "p95_latency": 0,
            "throughput": 0,
        }
        self.latencies = []

    def record(self, latency: float, tokens: int):
        """Record one request and refresh the derived statistics.

        Args:
            latency: Request latency in milliseconds.
            tokens: Number of tokens produced by the request.

        Throughput is stored in tokens per second: the token total is
        divided by the latency total converted from ms to s, so the value
        matches the ``tokens/s`` label in ``report``. It is ``0.0`` while
        the accumulated latency is zero, instead of dividing by zero.
        """
        self.metrics["total_requests"] += 1
        self.metrics["total_tokens"] += tokens
        self.latencies.append(latency)

        count = len(self.latencies)
        total_latency_ms = sum(self.latencies)
        self.metrics["avg_latency"] = total_latency_ms / count
        # int(count * 0.95) is always < count, so the index is in range.
        self.metrics["p95_latency"] = sorted(self.latencies)[int(count * 0.95)]
        if total_latency_ms > 0:
            self.metrics["throughput"] = (
                self.metrics["total_tokens"] * 1000 / total_latency_ms
            )
        else:
            self.metrics["throughput"] = 0.0

    def report(self):
        """Return the multi-line, human-readable performance report."""
        return (
            "\n"
            "性能报告:\n"
            f"- 总请求数:{self.metrics['total_requests']}\n"
            f"- 平均延迟:{self.metrics['avg_latency']:.2f}ms\n"
            f"- P95延迟:{self.metrics['p95_latency']:.2f}ms\n"
            f"- 吞吐量:{self.metrics['throughput']:.1f} tokens/s\n"
        )
优化检查清单
| 优化项 | 提升效果 | 实施难度 | 优先级 |
| --- | --- | --- | --- |
| vLLM推理 | 5-10x | 简单 | 高 |
| 量化 | 2-4x内存 | 简单 | 高 |
| 结果缓存 | 10-100x | 简单 | 高 |
| 批处理 | 2-5x | 中等 | 中 |
| 流式输出 | 体验提升 | 简单 | 高 |
| KV缓存 | 1.5-2x | 中等 | 中 |
| 异步并发 | 3-10x | 中等 | 高 |