
[LLM Hands-On Series 08] Performance Optimization: Make Your LLM Application 10x Faster

Quick Bottleneck Diagnosis

Find Your Bottleneck

python
import time
import psutil
import GPUtil
import requests

class PerformanceProfiler:
    """Performance profiler"""

    def profile_llm_app(self):
        """Diagnose LLM application bottlenecks"""
        bottlenecks = []

        # 1. GPU utilization
        gpus = GPUtil.getGPUs()
        if gpus and gpus[0].memoryUtil < 0.5:
            bottlenecks.append("GPU memory underutilized")

        # 2. CPU bottleneck
        if psutil.cpu_percent() > 80:
            bottlenecks.append("CPU is the bottleneck")

        # 3. Memory
        if psutil.virtual_memory().percent > 85:
            bottlenecks.append("Running low on RAM")

        # 4. Network latency (API calls)
        api_latency = self._test_api_latency()
        if api_latency > 500:  # ms
            bottlenecks.append(f"High API latency: {api_latency:.0f} ms")

        return bottlenecks

    def _test_api_latency(self) -> float:
        """Round-trip latency (ms) to your model API; the URL below is a placeholder."""
        start = time.time()
        try:
            requests.get("http://localhost:8000/health", timeout=5)
        except requests.RequestException:
            pass
        return (time.time() - start) * 1000

# Common bottlenecks and their fixes
BOTTLENECK_SOLUTIONS = {
    "Slow inference": "Use vLLM or TensorRT",
    "Out of GPU memory": "Quantize or use a smaller model",
    "High API cost": "Caching + a local model",
    "Low concurrency": "Batching + async",
    "Slow first response": "Model warmup + persistence"
}
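
To turn a diagnosis into an action plan, you can print what the profiler found next to the general fix table. A minimal sketch, assuming the PerformanceProfiler and BOTTLENECK_SOLUTIONS defined above (the health-check URL inside the profiler is a placeholder):

python
# Run the profiler once, then show the general playbook of fixes.
profiler = PerformanceProfiler()
issues = profiler.profile_llm_app()

print("Detected bottlenecks:", issues or "none")
print("Common fixes:")
for problem, fix in BOTTLENECK_SOLUTIONS.items():
    print(f"  - {problem}: {fix}")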

Inference Acceleration: vLLM

vLLM vs. Vanilla Transformers

python
# Install: pip install vllm

# ❌ Slow: plain Transformers (~10 tokens/s)
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("llama-7b")
model = AutoModelForCausalLM.from_pretrained("llama-7b")
input_ids = tokenizer("Hello", return_tensors="pt").input_ids
output = model.generate(input_ids, max_length=100)

# ✅ Fast: vLLM (100+ tokens/s)
from vllm import LLM, SamplingParams

llm = LLM(model="llama-7b", tensor_parallel_size=1)
sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=100
)

# Batched inference
prompts = ["Hello", "What is", "How to"]
outputs = llm.generate(prompts, sampling_params)

# Roughly a 10x speedup!
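
To check that claim on your own hardware, time a batched run and count the generated tokens. A minimal sketch, assuming the llm, sampling_params, and prompts objects defined above and vLLM's RequestOutput/CompletionOutput result structure:

python
import time

# Time one batched generation pass and report aggregate throughput.
start = time.time()
outputs = llm.generate(prompts, sampling_params)
elapsed = time.time() - start

total_tokens = sum(len(o.outputs[0].token_ids) for o in outputs)
print(f"{total_tokens} tokens in {elapsed:.2f}s -> {total_tokens / elapsed:.1f} tokens/s")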

Deploying vLLM as a Service

python
# 1. Start the vLLM server
# python -m vllm.entrypoints.openai.api_server \
#     --model llama-7b \
#     --port 8000 \
#     --gpu-memory-utilization 0.9

# 2. Client call (OpenAI-compatible API)
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # vLLM ignores the key
response = client.chat.completions.create(
    model="llama-7b",
    messages=[{"role": "user", "content": "Hello"}],
    stream=True  # streaming is supported
)

# 3. Advanced configuration
from vllm import EngineArgs, LLMEngine

engine_args = EngineArgs(
    model="llama-7b",
    tokenizer="llama-7b",
    tokenizer_mode="auto",
    trust_remote_code=True,
    dtype="float16",  # 半精度
    max_model_len=2048,
    gpu_memory_utilization=0.9,  # GPU利用率
    max_num_batched_tokens=8192,  # 批处理大小
    max_num_seqs=256,  # 最大并发
)

engine = LLMEngine.from_engine_args(engine_args)

Quantization: Memory Optimization

1. Dynamic Quantization (simplest)

python
import torch
from transformers import AutoModelForCausalLM

# 8-bit quantization (bitsandbytes)
model = AutoModelForCausalLM.from_pretrained(
    "llama-7b",
    load_in_8bit=True,  # ~50% less memory
    device_map="auto"
)

# 4-bit quantization (QLoRA)
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # ~75% less memory
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(
    "llama-7b",
    quantization_config=bnb_config
)
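
To see how much a given setting actually saves on your machine, Transformers models expose get_memory_footprint(). A quick check against whichever model object you loaded above:

python
# Print the in-memory size of the loaded model, so FP16 / 8-bit / 4-bit
# can be compared directly on your own hardware.
footprint_gb = model.get_memory_footprint() / 1024**3
print(f"Model memory footprint: {footprint_gb:.2f} GB")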

2. GGUF Quantization (CPU inference)

bash
# Convert to GGUF format
python convert.py models/llama-7b --outtype f16
./quantize model.gguf model_q4_k_m.gguf q4_k_m

python
# Load it with llama.cpp (via the llama-cpp-python bindings)
from llama_cpp import Llama

llm = Llama(
    model_path="model_q4_k_m.gguf",
    n_ctx=2048,
    n_threads=8,      # CPU threads
    n_gpu_layers=35   # number of layers offloaded to the GPU
)
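
Once loaded, the model object is called like a function and returns an OpenAI-style completion dict; a minimal sketch, assuming the llm object created above:

python
# Run a short completion on CPU (plus any GPU-offloaded layers).
result = llm(
    "Q: What does 4-bit quantization trade away? A:",
    max_tokens=64,
    stop=["Q:"],  # stop before the model starts a new question
)
print(result["choices"][0]["text"])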

3. Quantization Comparison

python
def compare_quantization():
    """Rough comparison of quantization options (ballpark figures for a 7B model)"""
    methods = {
        "FP16": {"size": "14GB", "speed": "100%", "accuracy": "100%"},
        "INT8": {"size": "7GB", "speed": "95%", "accuracy": "99%"},
        "INT4": {"size": "3.5GB", "speed": "90%", "accuracy": "95%"},
        "GGUF-Q4": {"size": "4GB", "speed": "85%", "accuracy": "94%"},
    }
    return methods

Caching Strategies

1. KV Cache Optimization

python
import torch

class OptimizedInference:
    """Inference that reuses the KV cache between decoding steps"""

    def __init__(self, model):
        self.model = model

    def generate_with_cache(self, input_ids, max_new_tokens=100):
        """Greedy generation with an explicit KV cache"""
        past_key_values = None
        generated = input_ids
        next_input = input_ids  # first step: feed the whole prompt

        for _ in range(max_new_tokens):
            with torch.no_grad():
                outputs = self.model(
                    next_input,
                    past_key_values=past_key_values,
                    use_cache=True
                )

            # Update the cache
            past_key_values = outputs.past_key_values

            # Pick the next token (greedy decoding)
            next_token = outputs.logits[:, -1, :].argmax(dim=-1, keepdim=True)
            generated = torch.cat([generated, next_token], dim=-1)

            # After the first step, feed only the new token;
            # the rest of the history lives in the cache
            next_input = next_token

        return generated

2. Result Caching

python
import hashlib
import json
import redis

class LLMCache:
    """Cache for LLM results"""

    def __init__(self, llm, redis_host="localhost"):
        self.llm = llm  # the underlying client/model used on a cache miss
        self.redis_client = redis.Redis(host=redis_host)
        self.local_cache = {}

    def get_cache_key(self, prompt: str, params: dict) -> str:
        """Build a cache key from the prompt and generation parameters"""
        content = f"{prompt}{json.dumps(params, sort_keys=True)}"
        return hashlib.md5(content.encode()).hexdigest()

    def cached_generate(self, prompt: str, **params):
        """Generate with a two-level cache (in-process dict + Redis)"""
        cache_key = self.get_cache_key(prompt, params)

        # 1. In-process cache
        if cache_key in self.local_cache:
            return self.local_cache[cache_key]

        # 2. Redis cache
        cached = self.redis_client.get(cache_key)
        if cached:
            result = json.loads(cached)
            self.local_cache[cache_key] = result
            return result

        # 3. Cache miss: call the model
        result = self.llm.generate(prompt, **params)

        # 4. Store in both caches
        self.redis_client.setex(
            cache_key,
            3600,  # TTL: 1 hour
            json.dumps(result)
        )
        self.local_cache[cache_key] = result

        return result
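
Usage is a drop-in wrapper around whatever client you already have. A minimal sketch, assuming a reachable local Redis and a stand-in client (EchoLLM below is a placeholder for your real model or API wrapper):

python
class EchoLLM:
    """Stand-in client (assumption); replace with your real model/client."""
    def generate(self, prompt, **params):
        return f"[echo] {prompt}"

cache = LLMCache(llm=EchoLLM(), redis_host="localhost")

first = cache.cached_generate("Summarize RAG in one sentence", temperature=0.0)
second = cache.cached_generate("Summarize RAG in one sentence", temperature=0.0)  # served from cache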

3. Embedding Cache

python
class EmbeddingCache:
    """Cache for text embeddings"""

    def __init__(self, model):
        self.model = model  # any encoder exposing an encode() method
        self.cache = {}

    def get_embedding(self, text: str):
        """Return a cached embedding, computing it on first use"""
        if text not in self.cache:
            self.cache[text] = self.model.encode(text)
        return self.cache[text]

    def batch_encode(self, texts: list):
        """Batched encoding that only computes uncached texts"""
        uncached = []
        results = []

        for text in texts:
            if text in self.cache:
                results.append(self.cache[text])
            else:
                uncached.append(text)
                results.append(None)

        # Encode the uncached texts in one batch
        if uncached:
            new_embeddings = self.model.encode(uncached)

            j = 0
            for i, result in enumerate(results):
                if result is None:
                    results[i] = new_embeddings[j]
                    self.cache[texts[i]] = new_embeddings[j]
                    j += 1

        return results
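
A minimal usage sketch, assuming sentence-transformers is installed and its all-MiniLM-L6-v2 model is used as the encoder (any object with an encode() method works):

python
from sentence_transformers import SentenceTransformer

# Cached vectors are reused across calls; only new texts hit the encoder.
encoder = SentenceTransformer("all-MiniLM-L6-v2")
emb_cache = EmbeddingCache(model=encoder)

docs = ["vLLM speeds up inference", "Quantization shrinks models"]
vectors = emb_cache.batch_encode(docs)                   # both computed
vectors = emb_cache.batch_encode(docs + ["KV cache"])    # only "KV cache" computed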

Concurrency Optimization

1. Asynchronous Processing

python
import asyncio
from typing import List
import aiohttp

class AsyncLLM:
    """异步LLM客户端"""

    def __init__(self, api_key: str, max_concurrent=10):
        self.api_key = api_key
        self.semaphore = asyncio.Semaphore(max_concurrent)

    async def generate_single(self, session, prompt):
        """单个异步请求"""
        async with self.semaphore:
            headers = {"Authorization": f"Bearer {self.api_key}"}
            data = {
                "model": "gpt-3.5-turbo",
                "messages": [{"role": "user", "content": prompt}]
            }

            async with session.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                json=data
            ) as response:
                result = await response.json()
                return result["choices"][0]["message"]["content"]

    async def batch_generate(self, prompts: List[str]):
        """批量异步生成"""
        async with aiohttp.ClientSession() as session:
            tasks = [
                self.generate_single(session, prompt)
                for prompt in prompts
            ]
            return await asyncio.gather(*tasks)

# Usage
async def main():
    llm = AsyncLLM(api_key="sk-xxx")
    prompts = ["Hello", "What is AI?", "How to code?"]
    results = await llm.batch_generate(prompts)
    # 3 concurrent requests: total wall time ≈ the slowest single request

asyncio.run(main())

2. Batch Processing

python
import asyncio

class BatchProcessor:
    """Groups individual requests into batched model calls"""

    def __init__(self, model, batch_size=8, timeout=0.1):
        self.model = model
        self.batch_size = batch_size
        self.timeout = timeout  # max seconds a request waits for a full batch
        self.queue = []

    async def add_request(self, request_id: str, prompt: str):
        """Queue a request and wait for its result"""
        future = asyncio.Future()
        self.queue.append({
            "id": request_id,
            "prompt": prompt,
            "future": future
        })

        # Flush immediately once a full batch is queued...
        if len(self.queue) >= self.batch_size:
            await self._process_batch()
        else:
            # ...otherwise flush after the timeout so partial batches don't stall
            asyncio.get_running_loop().call_later(
                self.timeout,
                lambda: asyncio.create_task(self._process_batch())
            )

        return await future

    async def _process_batch(self):
        """Run one batch through the model and resolve its futures"""
        if not self.queue:
            return

        batch = self.queue[:self.batch_size]
        self.queue = self.queue[self.batch_size:]

        # Batched inference
        prompts = [item["prompt"] for item in batch]
        results = self.model.generate(prompts, max_length=100)

        # Hand each caller its result
        for item, result in zip(batch, results):
            if not item["future"].done():
                item["future"].set_result(result)

Streaming Optimization

Server-Sent Events (SSE)

python
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import aiohttp
import json

app = FastAPI()

@app.get("/stream")
async def stream_llm(prompt: str):
    """Streaming API endpoint"""

    async def generate():
        # Stream chunks as they are produced
        # (llm.generate_stream stands for your model's streaming interface)
        for chunk in llm.generate_stream(prompt):
            # SSE frame format
            data = json.dumps({"text": chunk})
            yield f"data: {data}\n\n"

        # End-of-stream marker
        yield "data: [DONE]\n\n"

    return StreamingResponse(
        generate(),
        media_type="text/event-stream"
    )

# Client
async def stream_client():
    async with aiohttp.ClientSession() as session:
        async with session.get(
            "http://localhost:8000/stream",
            params={"prompt": "Tell me a story"}
        ) as response:
            async for line in response.content:
                if line.startswith(b"data: "):
                    payload = line[len(b"data: "):].strip()
                    if payload == b"[DONE]":
                        break
                    data = json.loads(payload)
                    print(data["text"], end="", flush=True)

Deployment Optimization

1. Model Warmup

python
import torch

class ModelWarmer:
    """Model warmup helper"""

    def warmup(self, model, num_iterations=3):
        """Run a few dummy forward passes so the first real request isn't slow"""
        device = next(model.parameters()).device
        dummy_input = torch.randint(0, 1000, (1, 10), device=device)

        for _ in range(num_iterations):
            with torch.no_grad():
                _ = model(dummy_input)

        if torch.cuda.is_available():
            torch.cuda.synchronize()  # wait for queued GPU work to finish
        print("Model warmup complete")

2. Load Balancing

python
from typing import List

import aiohttp

class LoadBalancer:
    """Simple round-robin load balancer"""

    def __init__(self, servers: List[str]):
        self.servers = servers
        self.current = 0

    def get_server(self) -> str:
        """Pick the next server in round-robin order"""
        server = self.servers[self.current]
        self.current = (self.current + 1) % len(self.servers)
        return server

    async def request(self, prompt: str):
        """Send a request via the balancer"""
        server = self.get_server()
        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"{server}/generate",
                json={"prompt": prompt}
            ) as response:
                return await response.json()
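
A minimal usage sketch, assuming two backends that expose the /generate endpoint used above (the host addresses are placeholders):

python
import asyncio

async def fan_out():
    # Requests alternate between the two backends and run concurrently.
    balancer = LoadBalancer(["http://10.0.0.1:8000", "http://10.0.0.2:8000"])  # placeholder hosts
    prompts = ["Hello", "What is AI?", "Explain the KV cache"]
    return await asyncio.gather(*[balancer.request(p) for p in prompts])

# results = asyncio.run(fan_out())  # needs the backends to be reachable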

Monitoring Metrics

python
class LLMMetrics:
    """Performance metrics tracker"""

    def __init__(self):
        self.metrics = {
            "total_requests": 0,
            "total_tokens": 0,
            "avg_latency": 0,
            "p95_latency": 0,
            "throughput": 0
        }
        self.latencies = []  # per-request latency, in ms

    def record(self, latency: float, tokens: int):
        """Record one request (latency in ms)"""
        self.metrics["total_requests"] += 1
        self.metrics["total_tokens"] += tokens
        self.latencies.append(latency)

        # Update running statistics
        self.metrics["avg_latency"] = sum(self.latencies) / len(self.latencies)
        self.metrics["p95_latency"] = sorted(self.latencies)[int(len(self.latencies) * 0.95)]
        # Latencies are in ms, so convert to seconds for tokens/s
        self.metrics["throughput"] = self.metrics["total_tokens"] / (sum(self.latencies) / 1000)

    def report(self):
        """Build a human-readable report"""
        return f"""
Performance report:
- Total requests: {self.metrics['total_requests']}
- Average latency: {self.metrics['avg_latency']:.2f} ms
- P95 latency: {self.metrics['p95_latency']:.2f} ms
- Throughput: {self.metrics['throughput']:.1f} tokens/s
"""

Optimization Checklist

Optimization         Gain                      Difficulty
vLLM inference       5-10x speed               Easy
Quantization         2-4x memory savings       Easy
Result caching       10-100x                   Easy
Batching             2-5x                      Medium
Streaming output     Better perceived latency  Easy
KV cache             1.5-2x                    Medium
Async concurrency    3-10x                     Medium