06 · 从开发到生产——生成优化、监控、安全与成本
检索做完了,但 RAG 的价值在于"生成"。最后一篇核心文章,把系统推上生产。
1. 上下文拼接策略
检索结果怎么"喂"给 LLM,直接决定了回答质量。
1.1 Prompt 模板设计
typescript
// generation/prompt.ts
// Prompt template for the answer-generation step of the RAG pipeline.
// The `{documents}` and `{query}` placeholders are stored verbatim: nothing in
// this block substitutes them, so the consumer is expected to fill them in.
// `.trim()` strips the leading/trailing newlines introduced by the template
// literal's layout.
const RAG_PROMPT = `
你是一个专业的知识库助手。请严格根据以下提供的文档内容回答问题。
## 规则
1. 只使用提供的文档内容回答,不要编造
2. 如果文档中没有相关信息,诚实地说"文档中没有找到相关信息"
3. 引用时注明文档来源和章节
4. 如果信息不完整,说明需要补充哪些信息
## 相关文档
{documents}
## 用户问题
{query}
## 回答
`.trim();
1.2 Token 预算分配
typescript
// generation/budget.ts
/**
 * Splits a model's context window into fixed shares for each part of a request.
 *
 * The split is static: 10% system prompt, 50% retrieved documents,
 * 10% conversation history and 30% reserved for the model's output.
 */
class TokenBudget {
  /**
   * @param modelLimit - Total context window of the target model, in tokens.
   */
  constructor(private modelLimit: number = 8192) {}

  /**
   * Computes the per-section token allowances.
   *
   * @param query - Accepted as part of the signature; the current fixed split
   *   does not read it.
   * @returns Token allowances, each rounded down to a whole token.
   */
  allocate(query: string): {
    systemTokens: number;
    documentTokens: number;
    historyTokens: number;
    outputTokens: number;
  } {
    // Rounding down guarantees the four shares never add up to more than the limit.
    const share = (fraction: number): number => Math.floor(this.modelLimit * fraction);

    const systemTokens = share(0.1);
    const documentTokens = share(0.5);
    const historyTokens = share(0.1);
    const outputTokens = share(0.3);

    return { systemTokens, documentTokens, historyTokens, outputTokens };
  }
}
1.3 动态上下文窗口
typescript
// generation/context.ts
/**
 * Matches CJK radicals/punctuation, kana, CJK ideographs, Hangul syllables,
 * compatibility ideographs and full-width forms.
 */
const CJK_CHAR_PATTERN = /[\u2E80-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF\uFF00-\uFFEF]/g;

/**
 * Rough token estimate that does not need a tokenizer.
 *
 * Each CJK character is counted as one token, while the remaining characters
 * use the usual "about four characters per token" rule of thumb for
 * Latin-script text. A plain `length / 4` under-counts Chinese documents
 * several times over and lets the context overflow its budget.
 */
function estimateTokenCount(text: string): number {
  const cjkChars = (text.match(CJK_CHAR_PATTERN) || []).length;
  const otherChars = text.length - cjkChars;
  return cjkChars + Math.ceil(otherChars / 4);
}

/**
 * Concatenates retrieved documents into one context string that fits a token budget.
 *
 * Documents are taken in the given order. Each one is rendered as a source
 * header, an optional section heading and the content. Assembly stops at the
 * first document that would exceed the budget, so the order of `docs` is kept
 * and later documents are never pulled ahead of a skipped one.
 *
 * @param docs - Retrieved documents; each needs `pageContent`, `metadata.source`
 *   and optionally `metadata.sectionTitle`.
 * @param tokenBudget - Maximum estimated tokens the returned context may use.
 * @returns The assembled context, or an empty string when nothing fits.
 */
function buildContext(docs: Document[], tokenBudget: number): string {
  let context = "";
  let tokenCount = 0;
  for (const doc of docs) {
    let segment = `\n--- 来源: ${doc.metadata.source} ---\n`;
    if (doc.metadata.sectionTitle) {
      segment += `## ${doc.metadata.sectionTitle}\n`;
    }
    segment += doc.pageContent + "\n";

    // Charge the whole rendered segment (headers included): that is what is sent to the model.
    const segmentTokens = estimateTokenCount(segment);
    if (tokenCount + segmentTokens > tokenBudget) break;

    context += segment;
    tokenCount += segmentTokens;
  }
  return context;
}
2. 流式生成 + 溯源引用
typescript
// generation/stream.ts
import { ChatOpenAI } from "@langchain/openai";
import { StringOutputParser } from "@langchain/core/output_parsers";
import { ChatPromptTemplate } from "@langchain/core/prompts";
/**
 * Streams an answer grounded in the retrieved documents, then emits the sources.
 *
 * Yields one `{ type: "token", content }` event per chunk produced by the
 * chain, followed by a single `{ type: "sources", sources }` event that lists
 * every entry of `docs` with a 100-character excerpt.
 *
 * @param query - The user's question.
 * @param docs - Retrieved documents used to build the context and the source list.
 */
async function* streamRAGResponse(query: string, docs: Document[]) {
  const template = ChatPromptTemplate.fromMessages([
    ["system", "根据以下文档回答问题,引用时标注 [来源:xxx]"],
    ["user", "文档:\n{documents}\n\n问题:{query}"]
  ]);
  const model = new ChatOpenAI({ model: "gpt-4o", streaming: true, temperature: 0.1 });
  const pipeline = template.pipe(model).pipe(new StringOutputParser());

  // The context is capped at an estimated 3000 tokens before it is handed to the chain.
  const promptInput = {
    documents: buildContext(docs, 3000),
    query,
  };
  const pieces = await pipeline.stream(promptInput);

  // Forward each generated piece as soon as it arrives.
  for await (const piece of pieces) {
    yield { type: "token", content: piece };
  }

  // Once generation has finished, report where the answer could have come from.
  const toSource = (doc: Document) => ({
    title: doc.metadata.source,
    excerpt: `${doc.pageContent.substring(0, 100)}...`,
  });
  yield {
    type: "sources",
    sources: docs.map(toSource),
  };
}
3. 幻觉检测
typescript
// generation/hallucination.ts
/**
 * Scores an answer for hallucination using two checks.
 *
 * 1. Citation check: every `[来源:xxx]` marker in the answer must name a source
 *    that is part of the retrieved documents.
 * 2. NLI check: the result of `checkEntailment` supplies the entailment score
 *    and any contradictions between the answer and the documents.
 *
 * @param answer - The generated answer text.
 * @param sourceDocs - Documents that were retrieved for the answer.
 * @returns `score` is the entailment score reported by `checkEntailment`;
 *   `issues` lists unknown citations first, then contradictions.
 */
async function detectHallucination(
  answer: string,
  sourceDocs: Document[]
): Promise<{ score: number; issues: string[] }> {
  // The brackets must be escaped: unescaped, the pattern is a character class
  // that matches single characters instead of whole citations. Both the ASCII
  // and the full-width colon are accepted; group 1 is the cited source name.
  const citationPattern = /\[来源[:\uFF1A]\s*(.*?)\s*\]/g;

  const missingSources: string[] = [];
  for (const match of answer.matchAll(citationPattern)) {
    const sourceName = match[1];
    // An empty name cannot be verified, so it is reported as well
    // (`includes("")` would otherwise accept it for any document).
    const isKnown =
      sourceName !== "" &&
      sourceDocs.some(d => d.metadata.source.includes(sourceName));
    if (!isKnown) missingSources.push(match[0]);
  }

  const nliResult = await checkEntailment(answer, sourceDocs);

  return {
    score: nliResult.entailmentScore,
    issues: [
      ...missingSources.map(s => `引用来源"${s}"不在检索结果中`),
      ...nliResult.contradictions.map(c => `回答与文档矛盾:${c}`),
    ],
  };
}
4. 监控体系
typescript
// monitoring/dashboard.ts
/** Snapshot of the metrics tracked for a RAG service. */
interface RAGMetrics {
  /** Retrieval-stage metrics. */
  retrieval: {
    avgLatency: number;
    p99Latency: number;
    /** Share of queries whose retrieval returned nothing. */
    emptyRate: number;
    avgResultsCount: number;
  };
  /** Generation-stage metrics. */
  generation: {
    avgLatency: number;
    avgTokens: number;
    /** Share of answers containing content that is not in the documents. */
    hallucinationRate: number;
    /** Share of answers that refuse with "no relevant information in the documents". */
    refusalRate: number;
  };
  /** Business-level metrics. */
  business: {
    qps: number;
    /** Thumbs-up / thumbs-down ratio. */
    userSatisfaction: number;
    avgConversationLength: number;
  };
  /** Cost metrics. */
  cost: {
    embeddingCost: number;
    llmCost: number;
    totalCost: number;
    /**
     * Total cost of the previous day, used as the baseline of the cost-growth
     * alert. Optional so existing producers of `RAGMetrics` stay valid; the
     * alert is skipped when it is absent.
     */
    yesterdayTotal?: number;
  };
}

/**
 * Collects RAG metrics, pushes them to the monitoring backend and evaluates alert rules.
 *
 * NOTE(review): `collectRetrievalMetrics`, `collectGenerationMetrics`,
 * `collectBusinessMetrics`, `collectCostMetrics`, `pushToPrometheus` and
 * `alert` are called here but not defined in the visible class — confirm where
 * they are provided.
 */
class RAGMonitor {
  /**
   * Gathers the four metric groups concurrently, pushes the snapshot and returns it.
   * Alert rules are not evaluated here; call `checkAlerts` with the result.
   */
  async collect(): Promise<RAGMetrics> {
    const [retrieval, generation, business, cost] = await Promise.all([
      this.collectRetrievalMetrics(),
      this.collectGenerationMetrics(),
      this.collectBusinessMetrics(),
      this.collectCostMetrics(),
    ]);
    // Export to Prometheus / the in-house monitoring system.
    await this.pushToPrometheus({ retrieval, generation, business, cost });
    return { retrieval, generation, business, cost };
  }

  /**
   * Evaluates the alert rules against a metrics snapshot and raises one alert
   * per violated rule, in a fixed order: empty-result rate, hallucination
   * rate, retrieval P99 latency, day-over-day cost growth.
   *
   * @param metrics - Snapshot to evaluate.
   */
  checkAlerts(metrics: RAGMetrics): void {
    if (metrics.retrieval.emptyRate > 0.05) {
      this.alert("检索空结果率超过 5%");
    }
    if (metrics.generation.hallucinationRate > 0.1) {
      this.alert("幻觉率超过 10%");
    }
    if (metrics.retrieval.p99Latency > 2000) {
      this.alert("检索 P99 延迟超过 2s");
    }
    // Without a baseline there is nothing to compare against, so no alert is raised.
    const { totalCost, yesterdayTotal } = metrics.cost;
    if (yesterdayTotal !== undefined && totalCost > yesterdayTotal * 1.5) {
      this.alert("日成本较昨日增长超 50%");
    }
  }
}
5. 安全防护
5.1 Prompt Injection 防御
typescript
// security/injection.ts
/**
 * Heuristic patterns for common prompt-injection phrasings: "ignore the
 * previous instructions", role reassignment, references to the prompt itself,
 * "forget your training/identity" and "from now on".
 */
const INJECTION_PATTERNS = [
  /忽略(上述|之前|以上|所有|系统)?(指令|规则|提示)/i,
  /你(现在是|扮演|作为)/,
  /(系统|原始)?prompt/i,
  /忘记.*(训练|设定|身份)/i,
  /从现在开始/,
];

/**
 * Reports whether the input matches any known prompt-injection pattern.
 *
 * @param input - Raw user input.
 * @returns `true` as soon as one pattern matches, otherwise `false`.
 */
function detectInjection(input: string): boolean {
  for (const suspicious of INJECTION_PATTERNS) {
    if (suspicious.test(input)) {
      return true;
    }
  }
  return false;
}
/**
 * Entry point that screens the query before any retrieval or generation happens.
 *
 * @param query - Raw user query.
 * @returns A fixed refusal message when the query looks like a prompt
 *   injection; otherwise the answer produced by `ragPipeline.query`.
 */
async function safeRAG(query: string): Promise<string> {
  const looksLikeInjection = detectInjection(query);
  return looksLikeInjection
    ? "检测到异常输入,请重新提问。"
    : ragPipeline.query(query);
}
5.2 敏感信息防护
typescript
// security/sensitive.ts
/**
 * Masks personally identifiable information in document text before it is embedded.
 */
class SensitiveDataFilter {
  // Every pattern carries the `g` flag: without it `String.prototype.replace`
  // masks only the first occurrence and leaves later ones in the embedded text.
  private patterns = {
    // 18-character ID number: 17 digits followed by a digit or X/x.
    idCard: /\d{17}[\dXx]/g,
    // 11-digit mobile number starting with 13-19.
    phone: /1[3-9]\d{9}/g,
    // The dot before the TLD is escaped so it matches a literal ".".
    email: /[\w.-]+@[\w.-]+\.\w+/g,
  };

  /**
   * Replaces every ID number, mobile number and e-mail address with `***`.
   *
   * ID numbers are masked first so the phone pattern cannot match part of one.
   *
   * @param doc - Raw document text.
   * @returns The text with all matches replaced; unchanged when nothing matches.
   */
  maskDocumentBeforeEmbedding(doc: string): string {
    return doc
      .replace(this.patterns.idCard, "***")
      .replace(this.patterns.phone, "***")
      .replace(this.patterns.email, "***");
  }
}
6. 成本优化
6.1 语义缓存
typescript
// cost/cache.ts
/**
 * In-memory cache that reuses an answer when a new query is semantically close
 * to one seen before, judged by cosine similarity of the query embeddings.
 */
class SemanticCache {
  private cache = new Map<string, CacheEntry>();
  private embedder = new OpenAIEmbeddings({ model: "text-embedding-3-small" });
  // Minimum cosine similarity for a cached answer to be reused.
  private threshold = 0.95;

  /**
   * Looks up an answer for a query that is similar enough to a cached one.
   *
   * Expired entries met during the scan are evicted. The first live entry
   * whose similarity reaches the threshold wins.
   *
   * @param query - Incoming user query.
   * @returns The cached answer, or `null` when no live entry is close enough.
   */
  async get(query: string): Promise<string | null> {
    const queryVec = await this.embedder.embedQuery(query);
    for (const [key, entry] of this.cache) {
      // `ttl` is in seconds, `timestamp` in milliseconds.
      if (Date.now() - entry.timestamp > entry.ttl * 1000) {
        this.cache.delete(key);
        continue;
      }
      const similarity = this.cosine(queryVec, entry.embedding);
      if (similarity >= this.threshold) return entry.answer;
    }
    return null;
  }

  /**
   * Stores an answer under the query text together with the query embedding.
   *
   * @param query - Query text; also used as the cache key.
   * @param answer - Answer to cache.
   * @param ttl - Time to live in seconds (default one hour).
   */
  async set(query: string, answer: string, ttl = 3600) {
    const vec = await this.embedder.embedQuery(query);
    this.cache.set(query, { answer, embedding: vec, timestamp: Date.now(), ttl });
  }

  /**
   * Cosine similarity of two vectors.
   *
   * Returns 0 for empty vectors, vectors of different length or a zero-length
   * vector, so such inputs can never produce a cache hit.
   */
  private cosine(a: number[], b: number[]): number {
    if (a.length === 0 || a.length !== b.length) return 0;
    let dot = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
      dot += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    const denominator = Math.sqrt(normA) * Math.sqrt(normB);
    return denominator === 0 ? 0 : dot / denominator;
  }
}
6.2 模型降级策略
typescript
// cost/degradation.ts
/**
 * Picks a model for a task based on its complexity and the remaining budget.
 */
class CostAwareRouter {
  /**
   * @param budgetRemaining - Budget still available. Defaults to unlimited, so
   *   a router created without arguments never takes the budget-exhausted
   *   branch. It is a public, mutable property so the caller can update it as
   *   money is spent.
   */
  constructor(public budgetRemaining: number = Number.POSITIVE_INFINITY) {}

  /**
   * Chooses the model for a task.
   *
   * @param task - Task to route; only `task.complexity` is read.
   * @returns The chosen model name and the `cost` figure associated with it.
   */
  route(task: Task): ModelChoice {
    // Budget exhausted -> cheapest model, regardless of complexity.
    if (this.budgetRemaining <= 0) return { model: "deepseek-chat", cost: 0.0014 };
    // Simple question -> small model.
    if (task.complexity === "simple") return { model: "gpt-4o-mini", cost: 0.0015 };
    // Standard question -> mid-size model.
    if (task.complexity === "medium") return { model: "deepseek-chat", cost: 0.0014 };
    // Complex reasoning -> large model.
    return { model: "gpt-4o", cost: 0.0025 };
  }
}
6.3 节省清单
| 优化项 | 预估节省 | 实施难度 |
|---|---|---|
| 语义缓存 | 30-50% | 中 |
| 模型降级(简单问题用小模型) | 20-30% | 中 |
| Prompt 压缩 | 10-20% | 低 |
| 本地 Embedding 替换 API | 90% (Embedding 成本) | 中 |
| 减少 Re-rank 候选数(50→20) | 60% (Re-rank 成本) | 低 |
7. 增量更新
typescript
// production/incremental.ts
/**
 * Keeps the vector store in sync when a source document changes.
 *
 * NOTE(review): `ingestAndChunk` is called but not defined in the visible
 * class, and `vectorStore` comes from outside this block — confirm both.
 */
class IncrementalUpdater {
  /**
   * Replaces all vectors of a changed document: removes every stored chunk
   * whose `source` metadata equals `filePath`, then re-ingests the file.
   *
   * @param filePath - Path of the changed document; also the `source` filter value.
   */
  async onDocumentChange(filePath: string): Promise<void> {
    // A single lookup with k = 1 finds at most one chunk and leaves the rest of
    // the old document behind, so stale chunks are fetched and deleted in batches.
    const batchSize = 100;
    const removedIds = new Set<string>();

    for (;;) {
      // 1. Find a batch of old chunks for this file.
      // NOTE(review): assumes the store accepts an empty query together with a
      // metadata filter — confirm for the concrete vector store.
      const staleChunks = await vectorStore.similaritySearch("", batchSize, {
        source: filePath,
      });

      const staleIds: string[] = [];
      for (const chunk of staleChunks) {
        const id = chunk.metadata.chunkId;
        // Skip chunks without an id and ids already sent for deletion. The
        // latter ends the loop even if the store still returns deleted chunks.
        if (id != null && !removedIds.has(id)) staleIds.push(id);
      }
      if (staleIds.length === 0) break;

      // 2. Delete the old data.
      await vectorStore.delete({ ids: staleIds });
      for (const id of staleIds) removedIds.add(id);
    }

    // 3. Re-parse, chunk, embed and write the new version.
    const newChunks = await this.ingestAndChunk(filePath);
    await vectorStore.addDocuments(newChunks);
  }
}
上一篇:05 · 检索增强:混合检索、Re-rank 与 Query 优化 下一篇(进阶):A · 多模态 RAG:图片与表格检索