基于RAG的农产品GEO溯源智能问答系统实现
背景
传统的农产品溯源系统主要提供"扫码查信息"的功能,用户体验相对被动。当买家想要了解"这批黄瓜采摘前有没有打药?"或者"这个地块历年的农残情况如何?"这类问题时,需要逐条翻阅农事记录,效率低下。
本文介绍如何使用RAG(检索增强生成)技术,构建一个能够理解自然语言查询的智能溯源问答系统,实现"问什么答什么"的溯源体验。
系统架构
┌────────────────────────────────────────────────────────────────────┐
│ 用户查询入口 │
│ "这批西红柿采摘前7天内用过什么农药?" │
└────────────────────────────────────────────────────────────────────┘
│
▼
┌────────────────────────────────────────────────────────────────────┐
│ Query理解与改写层 │
│ • 意图识别(查询类型:农事/检测/产地) │
│ • 实体抽取(作物、时间、操作类型) │
│ • 查询改写(自然语言→结构化条件) │
└────────────────────────────────────────────────────────────────────┘
│
▼
┌────────────────────────────────────────────────────────────────────┐
│ 混合检索层 │
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
│ │ 向量检索 │ │ 知识图谱 │ │ 元数据过滤 │ │
│ │ (语义相似) │ │ (关系查询) │ │ (精确匹配) │ │
│ └──────────────┘ └──────────────┘ └──────────────┘ │
└────────────────────────────────────────────────────────────────────┘
│
▼
┌────────────────────────────────────────────────────────────────────┐
│ 上下文整合与排序 │
│ • 多路召回结果合并 │
│ • 相关性打分与重排序 │
│ • 上下文窗口管理 │
└────────────────────────────────────────────────────────────────────┘
│
▼
┌────────────────────────────────────────────────────────────────────┐
│ LLM生成层 │
│ • 基于检索结果生成答案 │
│ • 引用溯源(标注信息来源) │
│ • 多轮对话上下文管理 │
└────────────────────────────────────────────────────────────────────┘
核心模块实现
一、数据向量化与索引
首先,需要将农事记录、检测报告等文本数据转化为向量表示。
python
from dataclasses import dataclass
from typing import List, Optional
from datetime import datetime
import json
@dataclass
class FarmDocument:
    """A single traceability record: a farm operation, a test report, or plot info."""
    # Unique document id, e.g. "OP-2024-001"
    doc_id: str
    # Id of the plot this record belongs to
    plot_id: str
    # Harvest batch id; None when the record is not batch-specific
    batch_id: Optional[str]
    doc_type: str  # operation / report / plot_info
    # Free-text body (used directly as the embedding text for plot_info docs)
    content: str
    # Type-specific payload: operation_type, attributes, operator_name, ...
    metadata: dict
    # Timestamp of the underlying farm event
    created_at: datetime
class DocumentVectorizer:
    """Encodes FarmDocument records into dense vectors for semantic retrieval."""

    def __init__(self, embedding_model):
        # embedding_model must expose encode(text) returning an array with .tolist()
        self.model = embedding_model
        self.vector_dim = 768  # embedding dimensionality of the chosen model

    def vectorize(self, document: FarmDocument) -> List[float]:
        """Build the document's text representation and encode it to a float list."""
        text = self._build_text(document)
        embedding = self.model.encode(text)
        return embedding.tolist()

    def _build_text(self, doc: FarmDocument) -> str:
        """Choose the textual rendering used for embedding, by document type."""
        if doc.doc_type == 'operation':
            # Farm-operation record: render via a sentence template
            return self._format_operation_text(doc)
        elif doc.doc_type == 'report':
            # Test report
            return self._format_report_text(doc)
        else:
            # Plot information: embed the raw content
            return doc.content

    def _format_report_text(self, doc: FarmDocument) -> str:
        """Render a test report for embedding.

        BUG FIX: this method was referenced by _build_text but never defined,
        so every 'report' document raised AttributeError. Report documents
        carry their text in `content`, so that is what gets embedded.
        """
        return doc.content

    def _format_operation_text(self, doc: FarmDocument) -> str:
        """Render an operation record as a natural-language sentence.

        Unknown operation types fall back to the raw document content; records
        whose attributes are missing a template field also fall back instead
        of raising, so one malformed record cannot break the indexing run.
        """
        attrs = doc.metadata.get('attributes', {})
        op_type = doc.metadata.get('operation_type', '')
        templates = {
            'fertilize': "地块{plot_id}于{date}进行施肥操作,使用{fertilizer_name},"
                         "用量{amount}公斤/亩,施用方式为{method}。操作人:{operator}。",
            'pesticide': "地块{plot_id}于{date}进行喷药操作,使用{pesticide_name},"
                         "稀释倍数{dilution}倍,喷施面积{area}亩。操作人:{operator}。",
            'harvest': "地块{plot_id}于{date}进行采摘,采摘作物为{crop},"
                       "采摘量{quantity}公斤。采摘人:{operator}。"
        }
        template = templates.get(op_type, "{content}")
        # BUG FIX: the "{content}" fallback previously raised KeyError because
        # no `content` argument was ever supplied to format().
        fields = dict(attrs)
        fields.update(
            plot_id=doc.plot_id,
            date=doc.created_at.strftime('%Y年%m月%d日'),
            operator=doc.metadata.get('operator_name', '未知'),
            content=doc.content,
        )
        try:
            return template.format(**fields)
        except KeyError:
            # Template field missing from metadata: degrade to the raw content.
            return doc.content
# Usage example
from sentence_transformers import SentenceTransformer

# Load a Chinese text-embedding model (produces 768-dim vectors).
model = SentenceTransformer('shibing624/text2vec-base-chinese')
vectorizer = DocumentVectorizer(model)

# Vectorize one pesticide-application record.
doc = FarmDocument(
    doc_id="OP-2024-001",
    plot_id="PLOT-A12",
    batch_id="BATCH-2024-0415",
    doc_type='operation',
    content="",
    metadata={
        'operation_type': 'pesticide',
        'attributes': {
            'pesticide_name': '吡虫啉',
            'dilution': 2000,
            'area': 5.2
        },
        'operator_name': '张建国'
    },
    created_at=datetime(2024, 4, 10)
)
vector = vectorizer.vectorize(doc)
二、向量检索服务
使用FAISS或Milvus进行向量相似度检索。
python
import faiss
import numpy as np
from typing import List, Tuple
class VectorStore:
    """In-memory FAISS flat index; cosine similarity via normalized inner product."""

    def __init__(self, dim: int = 768):
        self.dim = dim
        self.index = faiss.IndexFlatIP(dim)  # inner product == cosine after L2-normalization
        self.doc_mapping = {}  # FAISS row id -> FarmDocument
        self.next_id = 0       # next row id to assign

    def add_documents(self, documents: List[Tuple[FarmDocument, List[float]]]):
        """Add (document, embedding) pairs to the index.

        BUG FIX: an empty batch is now a no-op — previously it produced a
        zero-length float array that FAISS rejects.
        """
        if not documents:
            return
        vectors = []
        for doc, vector in documents:
            vectors.append(vector)
            self.doc_mapping[self.next_id] = doc
            self.next_id += 1
        # Normalize in place so inner-product search behaves as cosine similarity.
        vectors = np.array(vectors, dtype=np.float32)
        faiss.normalize_L2(vectors)
        self.index.add(vectors)

    def search(self, query_vector: List[float], k: int = 10) -> List[Tuple[FarmDocument, float]]:
        """Return up to *k* (document, similarity) pairs, best first."""
        query = np.array([query_vector], dtype=np.float32)
        faiss.normalize_L2(query)
        scores, indices = self.index.search(query, k)
        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx >= 0:  # FAISS pads with -1 when fewer than k hits exist
                results.append((self.doc_mapping[idx], float(score)))
        return results
# Usage example (illustrative: doc1/vector1 etc. come from the vectorizer above)
store = VectorStore(dim=768)
# Bulk-insert document vectors
docs_with_vectors = [(doc1, vector1), (doc2, vector2), ...]
store.add_documents(docs_with_vectors)
# Query: embed the question with the SAME model used for documents, then search
query_text = "这块地最近打过什么药?"
query_vector = model.encode(query_text).tolist()
results = store.search(query_vector, k=5)
三、知识图谱构建
除了向量检索,还可以构建知识图谱进行关系查询。
python
from typing import Dict, List, Set
from collections import defaultdict
class FarmKnowledgeGraph:
    """Lightweight in-memory graph of plots, farmers and farm operations."""

    def __init__(self):
        # entity_id -> {'type': ..., 'properties': ...}
        self.entities = {}
        # (source_id, relation) -> list of {'target', 'properties'} edges
        self.relations = defaultdict(list)
        # (target_id, relation) -> list of {'source', 'properties'} edges
        self.reverse_relations = defaultdict(list)

    def add_entity(self, entity_id: str, entity_type: str, properties: dict):
        """Register (or overwrite) an entity node."""
        self.entities[entity_id] = {'type': entity_type, 'properties': properties}

    def add_relation(self, source: str, relation: str, target: str, properties: dict = None):
        """Add a directed edge, indexed in both directions for fast lookups."""
        self.relations[(source, relation)].append(
            {'target': target, 'properties': properties or {}}
        )
        self.reverse_relations[(target, relation)].append(
            {'source': source, 'properties': properties or {}}
        )

    def query_related(self, entity_id: str, relation: str) -> List[dict]:
        """Outgoing edges of *entity_id* via *relation* (empty list when none)."""
        return self.relations.get((entity_id, relation), [])

    def query_reverse(self, entity_id: str, relation: str) -> List[dict]:
        """Incoming edges of *entity_id* via *relation* (empty list when none)."""
        return self.reverse_relations.get((entity_id, relation), [])

    def get_entity_chain(self, start_id: str, relation_chain: List[str]) -> List[str]:
        """Walk *relation_chain* hop by hop; return the ids of the final frontier."""
        frontier = [start_id]
        for hop in relation_chain:
            frontier = [
                edge['target']
                for node in frontier
                for edge in self.query_related(node, hop)
            ]
        return frontier
# Build the knowledge graph
kg = FarmKnowledgeGraph()
# Plot entity
kg.add_entity("PLOT-A12", "Plot", {
    "name": "A12号地块",
    "area": 5.2,
    "crop": "西红柿",
    "location": "河南省开封市祥符区"
})
# Farmer entity
kg.add_entity("FARMER-001", "Farmer", {
    "name": "张建国",
    "phone": "138****1234"
})
# Farm-operation entity
kg.add_entity("OP-2024-001", "Operation", {
    "type": "pesticide",
    "date": "2024-04-10",
    "pesticide": "吡虫啉"
})
# Relations
kg.add_relation("PLOT-A12", "managed_by", "FARMER-001")
kg.add_relation("PLOT-A12", "has_operation", "OP-2024-001")
kg.add_relation("OP-2024-001", "performed_by", "FARMER-001")
# Query example: all operations recorded on a plot
operations = kg.query_related("PLOT-A12", "has_operation")
# Returns: [{'target': 'OP-2024-001', 'properties': {}}]
# Chain query: plot -> operations -> operators
operators = kg.get_entity_chain("PLOT-A12", ["has_operation", "performed_by"])
# Returns: ["FARMER-001"]
四、混合检索与相似度匹配
结合向量检索、知识图谱和元数据过滤,实现精准召回。
python
from dataclasses import dataclass
from typing import List, Optional
from enum import Enum
class QueryIntent(Enum):
    """High-level categories a user's traceability question can fall into."""
    OPERATION_QUERY = "operation"  # farm-operation queries (spraying, fertilizing, ...)
    REPORT_QUERY = "report"        # lab / residue test-report queries
    PLOT_QUERY = "plot"            # plot-information queries (area, location, ...)
    COMPARISON = "comparison"      # comparison queries
    HISTORY = "history"            # historical-trend queries
@dataclass
class ParsedQuery:
    """Structured form of a natural-language traceability question."""
    # Classified query category
    intent: QueryIntent
    # Extracted entities (plot_id, batch_id, operation_type, crop, ...)
    entities: dict
    # (start, end) datetime window; may be None when no window could be resolved
    time_range: tuple
    # Exact-match filter conditions for the metadata store
    filters: dict
    # The raw user query text (used for vector search)
    original_query: str
class HybridRetriever:
    """Fuses metadata filtering, vector search and KG lookups into one recall list."""

    def __init__(self, vector_store, knowledge_graph, metadata_store, embedder):
        self.vector_store = vector_store
        self.kg = knowledge_graph
        self.meta_store = metadata_store
        self.embedder = embedder

    def retrieve(self, parsed_query: ParsedQuery, top_k: int = 10) -> List[dict]:
        """Run all recall channels, merge, rerank, and return the best *top_k* hits."""
        merged = self._merge_results(
            meta_results=self._metadata_filter(parsed_query),      # exact-match recall
            vector_results=self._vector_search(parsed_query),      # semantic recall
            kg_results=self._kg_query(parsed_query),               # relational recall
        )
        return self._rerank(merged, parsed_query)[:top_k]

    def _metadata_filter(self, query: ParsedQuery) -> List[dict]:
        """Exact-match recall against the metadata store."""
        conditions = dict(query.filters)
        if query.time_range:
            conditions['start_date'], conditions['end_date'] = query.time_range
        for key in ('plot_id', 'batch_id'):
            if query.entities.get(key):
                conditions[key] = query.entities[key]
        return self.meta_store.query(conditions)

    def _vector_search(self, query: ParsedQuery) -> List[dict]:
        """Semantic recall: embed the raw query and take the 20 nearest documents."""
        embedded = self.embedder.encode(query.original_query).tolist()
        hits = []
        for doc, score in self.vector_store.search(embedded, k=20):
            hits.append({'doc': doc, 'score': score, 'source': 'vector'})
        return hits

    def _kg_query(self, query: ParsedQuery) -> List[dict]:
        """Relational recall: operations linked to the query's plot in the KG."""
        plot_id = query.entities.get('plot_id')
        if not plot_id:
            return []
        hits = []
        for edge in self.kg.query_related(plot_id, 'has_operation'):
            node = self.kg.entities.get(edge['target'])
            if not node:
                continue
            synthetic_doc = FarmDocument(
                doc_id=edge['target'],
                plot_id=plot_id,
                batch_id=None,
                doc_type='operation',
                content=str(node['properties']),
                metadata=node['properties'],
                # NOTE(review): KG nodes carry no timestamp here, so "now" is used
                created_at=datetime.now()
            )
            hits.append({'doc': synthetic_doc, 'score': 1.0, 'source': 'knowledge_graph'})
        return hits

    def _merge_results(self, **result_sets) -> List[dict]:
        """Deduplicate multi-channel hits by doc_id; repeat hits boost the score."""
        by_id = {}
        for channel, hits in result_sets.items():
            for hit in hits:
                key = hit['doc'].doc_id
                existing = by_id.get(key)
                if existing is None:
                    hit['sources'] = [channel]
                    by_id[key] = hit
                else:
                    # Documents recalled by several channels get a weighted bonus.
                    existing['score'] += hit['score'] * 0.5
                    existing['sources'].append(channel)
        return sorted(by_id.values(), key=lambda h: h['score'], reverse=True)

    def _rerank(self, results: List[dict], query: ParsedQuery) -> List[dict]:
        """final_score = base + multi-channel bonus + time bonus + entity-match bonus."""
        for hit in results:
            hit['final_score'] = (
                hit['score']
                + len(hit.get('sources', [])) * 0.1
                + self._compute_time_relevance(hit['doc'], query.time_range)
                + self._compute_entity_match(hit['doc'], query.entities)
            )
        return sorted(results, key=lambda h: h['final_score'], reverse=True)

    def _compute_time_relevance(self, doc: FarmDocument, time_range: tuple) -> float:
        """0.2 when the document timestamp falls inside the query window, else 0."""
        if not time_range or not doc.created_at:
            return 0
        start, end = time_range
        return 0.2 if start <= doc.created_at <= end else 0

    def _compute_entity_match(self, doc: FarmDocument, entities: dict) -> float:
        """Bonus for matching the query's plot (+0.3) and operation type (+0.2)."""
        bonus = 0
        if entities.get('plot_id') and doc.plot_id == entities['plot_id']:
            bonus += 0.3
        if entities.get('operation_type') and doc.metadata.get('operation_type') == entities['operation_type']:
            bonus += 0.2
        return bonus
五、Query理解与改写
python
import re
from datetime import datetime, timedelta
class QueryParser:
"""查询解析器"""
def __init__(self, ner_model=None):
self.ner_model = ner_model
self.time_patterns = [
(r'最近(\d+)天', self._parse_recent_days),
(r'(\d+)天前', self._parse_days_ago),
(r'(\d+)月(\d+)日', self._parse_specific_date),
(r'采摘前(\d+)天', self._parse_before_harvest),
(r'(\d+)月(\d+)日到(\d+)月(\d+)日', self._parse_date_range),
]
self.operation_keywords = {
'施肥': 'fertilize',
'打药': 'pesticide',
'喷药': 'pesticide',
'用药': 'pesticide',
'灌溉': 'irrigate',
'浇水': 'irrigate',
'除草': 'weed',
'采摘': 'harvest',
'收割': 'harvest',
}
def parse(self, query: str, context: dict = None) -> ParsedQuery:
"""
解析查询
"""
# 意图识别
intent = self._classify_intent(query)
# 实体抽取
entities = self._extract_entities(query, context or {})
# 时间范围解析
time_range = self._parse_time(query, context)
# 过滤条件提取
filters = self._extract_filters(query)
return ParsedQuery(
intent=intent,
entities=entities,
time_range=time_range,
filters=filters,
original_query=query
)
def _classify_intent(self, query: str) -> QueryIntent:
"""意图分类"""
if any(kw in query for kw in ['检测', '农残', '质检', '报告']):
return QueryIntent.REPORT_QUERY
elif any(kw in query for kw in ['地块', '面积', '位置', '坐标']):
return QueryIntent.PLOT_QUERY
elif any(kw in query for kw in ['对比', '比较', '区别', '差异']):
return QueryIntent.COMPARISON
elif any(kw in query for kw in ['历史', '往年', '趋势', '变化']):
return QueryIntent.HISTORY
else:
return QueryIntent.OPERATION_QUERY
def _extract_entities(self, query: str, context: dict) -> dict:
"""实体抽取"""
entities = {}
# 从上下文获取地块/批次信息
if context.get('plot_id'):
entities['plot_id'] = context['plot_id']
if context.get('batch_id'):
entities['batch_id'] = context['batch_id']
# 从查询中提取操作类型
for keyword, op_type in self.operation_keywords.items():
if keyword in query:
entities['operation_type'] = op_type
break
# 从查询中提取作物名称
crops = ['西红柿', '黄瓜', '茄子', '辣椒', '大蒜', '大葱', '白菜']
for crop in crops:
if crop in query:
entities['crop'] = crop
break
return entities
def _parse_time(self, query: str, context: dict) -> tuple:
"""时间范围解析"""
for pattern, parser in self.time_patterns:
match = re.search(pattern, query)
if match:
return parser(match, context)
# 默认返回最近30天
end = datetime.now()
start = end - timedelta(days=30)
return (start, end)
def _parse_recent_days(self, match, context) -> tuple:
days = int(match.group(1))
end = datetime.now()
start = end - timedelta(days=days)
return (start, end)
def _parse_days_ago(self, match, context) -> tuple:
days = int(match.group(1))
end = datetime.now()
start = end - timedelta(days=days)
return (start, end)
def _parse_before_harvest(self, match, context) -> tuple:
"""解析采摘前N天"""
days = int(match.group(1))
# 从上下文获取采摘日期
harvest_date = context.get('harvest_date')
if harvest_date:
end = harvest_date
start = end - timedelta(days=days)
return (start, end)
return None
def _extract_filters(self, query: str) -> dict:
"""提取过滤条件"""
filters = {}
# 农药名称
pesticides = ['吡虫啉', '多菌灵', '阿维菌素', '氯虫苯甲酰胺']
for pesticide in pesticides:
if pesticide in query:
filters['pesticide_name'] = pesticide
break
# 肥料名称
fertilizers = ['尿素', '复合肥', '有机肥', '磷酸二铵']
for fertilizer in fertilizers:
if fertilizer in query:
filters['fertilizer_name'] = fertilizer
break
return filters
# Usage example
parser = QueryParser()
query = "这批西红柿采摘前7天内打过什么药?"
context = {
    'plot_id': 'PLOT-A12',
    'batch_id': 'BATCH-2024-0415',
    'harvest_date': datetime(2024, 4, 15)
}
parsed = parser.parse(query, context)
# Expected result:
# ParsedQuery(
#     intent=QueryIntent.OPERATION_QUERY,
#     entities={'plot_id': 'PLOT-A12', 'batch_id': 'BATCH-2024-0415', 'operation_type': 'pesticide', 'crop': '西红柿'},
#     time_range=(datetime(2024, 4, 8), datetime(2024, 4, 15)),
#     filters={},
#     original_query="这批西红柿采摘前7天内打过什么药?"
# )
六、答案生成与溯源
python
from typing import List
class AnswerGenerator:
    """Builds a grounded LLM prompt from retrieved documents and cites sources."""

    def __init__(self, llm_client):
        self.llm = llm_client

    def generate(self, query: str, retrieved_docs: List[dict],
                 parsed_query: ParsedQuery) -> str:
        """Produce an answer for *query*, with a provenance footer appended."""
        context = self._build_context(retrieved_docs)
        prompt = self._build_prompt(query, context, parsed_query)
        raw_answer = self.llm.generate(prompt)
        return self._add_citations(raw_answer, retrieved_docs)

    def _build_context(self, retrieved_docs: List[dict]) -> str:
        """Render at most five retrieved documents as numbered context snippets."""
        snippets = [
            f"[文档{idx + 1}] {self._format_doc(hit['doc'])}"
            for idx, hit in enumerate(retrieved_docs[:5])
        ]
        return "\n\n".join(snippets)

    def _format_doc(self, doc: FarmDocument) -> str:
        """One-line textual rendering of a document, dispatched on its type."""
        if doc.doc_type == 'report':
            return f"检测报告:{doc.content}"
        if doc.doc_type != 'operation':
            return doc.content
        attrs = doc.metadata.get('attributes', {})
        op_type = doc.metadata.get('operation_type', '')
        type_labels = {
            'fertilize': '施肥',
            'pesticide': '喷药',
            'irrigate': '灌溉',
            'harvest': '采摘'
        }
        return (
            f"{doc.created_at.strftime('%Y-%m-%d')},地块{doc.plot_id}"
            f"进行了{type_labels.get(op_type, op_type)}操作。"
            f"详细信息:{json.dumps(attrs, ensure_ascii=False)}"
        )

    def _build_prompt(self, query: str, context: str, parsed_query: ParsedQuery) -> str:
        """Assemble the grounded-QA prompt sent to the LLM."""
        return f"""你是一个农产品溯源助手。请根据以下检索到的信息,回答用户的问题。
用户问题:{query}
检索到的相关信息:
{context}
请回答用户的问题。要求:
1. 直接回答问题,不要重复问题本身
2. 如果检索信息中有相关内容,请准确引用
3. 如果检索信息不足,请明确说明缺少哪些信息
4. 回答中使用具体的数据和时间
5. 语言简洁,不要过度展开
答案:"""

    def _add_citations(self, answer: str, retrieved_docs: List[dict]) -> str:
        """Append a provenance footer listing up to three distinct plots."""
        footer = "\n\n---\n📋 信息来源:\n"
        cited_plots = set()
        for hit in retrieved_docs[:3]:
            doc = hit['doc']
            if doc.plot_id in cited_plots:
                continue
            footer += f"• 地块 {doc.plot_id},记录时间 {doc.created_at.strftime('%Y-%m-%d')}\n"
            cited_plots.add(doc.plot_id)
        return answer + footer
# Complete RAG question-answering service
class TraceQAService:
    """End-to-end traceability QA pipeline: parse -> retrieve -> generate."""

    def __init__(self, query_parser, retriever, answer_generator):
        self.parser = query_parser
        self.retriever = retriever
        self.generator = answer_generator

    def answer(self, query: str, context: dict = None) -> str:
        """Answer a natural-language traceability question."""
        # Step 1: understand the query (intent, entities, time window)
        parsed_query = self.parser.parse(query, context)
        # Step 2: hybrid recall over vectors, KG and metadata
        hits = self.retriever.retrieve(parsed_query, top_k=10)
        # Step 3: grounded answer generation with source citations
        return self.generator.generate(query, hits, parsed_query)
# Usage example (parser/retriever/generator are instances built as shown above)
qa_service = TraceQAService(parser, retriever, generator)
query = "这批西红柿采摘前7天内打过什么药?"
context = {
    'plot_id': 'PLOT-A12',
    'batch_id': 'BATCH-2024-0415',
    'harvest_date': datetime(2024, 4, 15)
}
answer = qa_service.answer(query, context)
# Example output:
# "根据记录,地块PLOT-A12在2024年4月10日进行了喷药操作,使用吡虫啉,稀释倍数2000倍,喷施面积5.2亩。采摘日期为4月15日,距离喷药时间5天,符合农药安全间隔期要求。
#
# ---
# 📋 信息来源:
# • 地块 PLOT-A12,记录时间 2024-04-10"
性能优化
1. 向量索引优化
python
# Use a FAISS IVF index to speed up large-scale retrieval.
class OptimizedVectorStore:
    """IVF-partitioned FAISS index (cosine similarity via normalized inner product)."""

    def __init__(self, dim: int = 768, nlist: int = 100):
        self.dim = dim
        self.nlist = nlist  # number of IVF partitions (clusters)
        self.quantizer = faiss.IndexFlatIP(dim)  # coarse quantizer over centroids
        self.index = faiss.IndexIVFFlat(self.quantizer, dim, nlist)
        self.is_trained = False  # IVF indexes must be trained before adding vectors

    def train(self, vectors: np.ndarray):
        """Train the IVF clustering on sample vectors.

        NOTE(review): faiss.normalize_L2 modifies *vectors* in place, so the
        caller's array is mutated — confirm callers do not reuse raw vectors.
        """
        faiss.normalize_L2(vectors)
        self.index.train(vectors)
        self.is_trained = True

    def add(self, vectors: np.ndarray):
        """Add vectors, lazily training the index on the first batch.

        Re-normalizing after train() is harmless: normalizing an already
        L2-normalized row is a no-op.
        """
        if not self.is_trained:
            self.train(vectors)
        faiss.normalize_L2(vectors)
        self.index.add(vectors)

    def search(self, query: np.ndarray, k: int = 10, nprobe: int = 10):
        """Top-k search probing *nprobe* of the nlist clusters (recall/speed knob)."""
        faiss.normalize_L2(query)
        self.index.nprobe = nprobe
        return self.index.search(query, k)
2. 缓存策略
python
import hashlib
import time
from functools import lru_cache
class CachedQAService:
    """Caching wrapper around the QA service (in-memory dict; use Redis in production)."""

    def __init__(self, qa_service, cache_ttl: int = 3600):
        self.qa_service = qa_service
        self.cache_ttl = cache_ttl  # seconds an entry stays valid
        self.cache = {}  # cache_key -> (answer, insert_timestamp)

    def answer(self, query: str, context: dict = None) -> str:
        """Return a cached answer when fresh, otherwise delegate and cache.

        BUG FIX: the original called time.time() without `time` ever being
        imported, so every call raised NameError. Expired entries are now
        also evicted so the dict does not accumulate stale data.
        """
        cache_key = self._make_cache_key(query, context)
        cached = self.cache.get(cache_key)
        if cached is not None:
            cached_result, timestamp = cached
            if time.time() - timestamp < self.cache_ttl:
                return cached_result
            del self.cache[cache_key]  # expired: drop before recomputing
        answer = self.qa_service.answer(query, context)
        self.cache[cache_key] = (answer, time.time())
        return answer

    def _make_cache_key(self, query: str, context: dict) -> str:
        """Stable key: query text + canonical (sorted-keys) JSON of the context."""
        content = f"{query}|{json.dumps(context or {}, sort_keys=True)}"
        return hashlib.md5(content.encode()).hexdigest()
总结
本文介绍了一个基于RAG的农产品GEO溯源智能问答系统,核心特点:
- 向量检索:将农事记录转化为语义向量,支持自然语言查询
- 知识图谱:构建地块-操作-种植户的关系网络,支持关系查询
- 混合检索:结合向量检索、知识图谱、元数据过滤,提高召回精度
- 智能生成:基于LLM生成自然语言答案,并标注信息来源
相比传统的"扫码查信息"模式,RAG问答系统让用户能够用自然语言提问,系统自动理解意图、检索相关信息、生成准确答案,大幅提升了溯源信息获取效率。