保险条款NLP解析与知识图谱搭建:让AI准确理解保险产品的技术方案
保险行业的产品信息管理长期面临一个结构性问题:产品条款以非结构化的PDF或图片形式存在,AI搜索引擎无法有效提取其中的关键信息。本文介绍一套基于spaCy NLP和Neo4j知识图谱的技术方案,实现保险条款的自动解析、结构化存储和智能问答。
技术架构概览
PDF/图片条款
↓
OCR文字识别 (Tesseract)
↓
NLP实体抽取 (spaCy)
↓
知识图谱构建 (Neo4j)
↓
语义检索 (向量数据库)
↓
智能问答API
第一部分:保险条款NLP解析
1.1 spaCy环境准备
bash
# sentence-transformers and numpy are required by the FAQ matcher in part 3.
pip install spacy spacy-transformers neo4j python-dotenv sentence-transformers numpy
python -m spacy download zh_core_web_lg
1.2 自定义保险领域NER模型
保险条款中包含大量专业实体,需要在通用中文模型基础上进行扩展。
python
import spacy
from spacy.tokens import Span
from spacy.language import Language
# Load the pretrained Chinese spaCy pipeline; every later snippet shares this `nlp` object.
nlp = spacy.load("zh_core_web_lg")
# Insurance-domain entity labels mapped to their human-readable (Chinese) descriptions.
# Only INS_PRODUCT, INS_DISEASE, INS_AMOUNT and INS_PERIOD have rule patterns in this file;
# the remaining labels are declared here without matching rules.
INSURANCE_ENTITY_LABELS = {
    "INS_PRODUCT": "保险产品名称",
    "INS_DISEASE": "疾病名称",
    "INS_AMOUNT": "金额/保额",
    "INS_PERIOD": "期限(天/年)",
    "INS_CLAUSE": "条款编号",
    "INS_EXCLUSION": "免责情形",
    "INS_PROCESS": "流程步骤"
}
# Rule patterns for the insurance-domain labels (spaCy token-pattern syntax).
# NOTE(review): every vocabulary entry is a single-token pattern, so it only
# fires when the zh tokenizer keeps the term as one token — confirm on real
# clause text, or switch to phrase (string) patterns.
INSURANCE_PATTERNS = [
    # Insurance product names
    {"label": "INS_PRODUCT", "pattern": [{"LOWER": {"IN": ["重疾险", "医疗险", "意外险", "寿险", "年金险", "百万医疗险", "防癌险"]}}]},
    # Disease names (common critical illnesses); "急性心肌梗死" is the spelling
    # used by the sample clause in this file, so both variants are listed.
    {"label": "INS_DISEASE", "pattern": [{"LOWER": {"IN": ["恶性肿瘤", "急性心肌梗塞", "急性心肌梗死", "脑中风后遗症", "重大器官移植", "冠状动脉搭桥术", "终末期肾病"]}}]},
    # Amounts: a digit token followed by a currency unit
    {"label": "INS_AMOUNT", "pattern": [{"IS_DIGIT": True}, {"LOWER": {"IN": ["万", "元", "万元"]}}]},
    # Periods: a digit token followed by a time unit
    {"label": "INS_PERIOD", "pattern": [{"IS_DIGIT": True}, {"LOWER": {"IN": ["天", "日", "年", "个月"]}}]},
]


def _build_insurance_ruler(nlp_model):
    """Create a standalone EntityRuler loaded with INSURANCE_PATTERNS.

    The ruler is built exactly once. The previous implementation called
    ``nlp.add_pipe("entity_ruler")`` inside the component, i.e. once per
    processed document, which mutates the pipeline while it is running and
    fails on the second document because the pipe name is already taken.
    """
    from spacy.pipeline import EntityRuler

    # overwrite_ents=True: the component runs after "ner", so domain rules
    # must be allowed to replace overlapping statistical entities.
    ruler = EntityRuler(nlp_model, overwrite_ents=True)
    ruler.add_patterns(INSURANCE_PATTERNS)
    return ruler


_INSURANCE_RULER = _build_insurance_ruler(nlp)


@Language.component("insurance_entity_ruler")
def insurance_entity_ruler(doc):
    """Apply the insurance-domain rule patterns to ``doc`` and return it."""
    return _INSURANCE_RULER(doc)


# Register the component (guarded so re-running the snippet does not add it twice).
if "insurance_entity_ruler" not in nlp.pipe_names:
    nlp.add_pipe("insurance_entity_ruler", after="ner")
1.3 条款文本结构化解析
python
import re
from dataclasses import dataclass, field
from typing import List, Optional, Dict
from enum import Enum
class CoverageType(Enum):
    """Closed set of coverage categories; each value is the Chinese display label."""

    CRITICAL_ILLNESS = "重大疾病"  # critical illness
    LIGHT_ILLNESS = "轻症疾病"  # mild illness
    MEDIUM_ILLNESS = "中症疾病"  # moderate illness
    MEDICAL = "医疗保障"  # medical coverage
    ACCIDENT = "意外保障"  # accident coverage
    DEATH = "身故保障"  # death benefit
    DISABILITY = "伤残保障"  # disability benefit
@dataclass
class CoverageItem:
    """A single covered disease or liability of a product."""

    # Name of the covered disease as extracted from the clause text.
    disease_name: str
    # ICD-10 code, when known.
    disease_code: Optional[str] = None
    coverage_type: CoverageType = CoverageType.CRITICAL_ILLNESS
    # Payout ratio.
    coverage_ratio: float = 1.0
    description: str = ""
@dataclass
class ExclusionItem:
    """A single liability-exclusion clause."""

    # Ordinal of the clause within the exclusion section.
    exclusion_no: int
    # Clause wording.
    exclusion_text: str
    # Category: pre-existing condition / occupation / behaviour / other.
    exclusion_type: str = ""
@dataclass
class InsuranceProduct:
    """Fully structured representation of one insurance product."""

    product_name: str
    # Regulatory filing number.
    product_code: str = ""
    insurance_type: str = ""
    company_name: str = ""

    # Periods in days; these defaults are used when the clause text yields no value.
    waiting_period_days: int = 0
    hesitation_period_days: int = 30
    grace_period_days: int = 60

    # Collections use default_factory so every instance gets its own list.
    coverages: List[CoverageItem] = field(default_factory=list)
    exclusions: List[ExclusionItem] = field(default_factory=list)
    claim_process: List[str] = field(default_factory=list)
    premium_table: List[Dict] = field(default_factory=list)
class InsuranceClauseParser:
    """Rule-based parser that turns raw clause text into an ``InsuranceProduct``.

    All extraction is done with regular expressions. The spaCy pipeline passed
    to the constructor is stored on ``self.nlp`` but is not run by ``parse``:
    the previous implementation ran it on the whole text and discarded the
    result.

    Both ASCII and full-width punctuation ("(" / "（", ":" / "：") are accepted
    in enumerators and separators.
    """

    # Enumerated disease entries: "1. 恶性肿瘤", "第三条 ...", "(2)...".
    # The name may carry a grade suffix ("恶性肿瘤——重度") and one trailing
    # parenthetical alias. The parenthetical must stay on one line; the old
    # pattern ``([^)]+)`` swallowed everything up to the next ")".
    _DISEASE_RE = re.compile(
        r'(?:\d+[.、]|第[一二三四五六七八九十]+条|[(（]\d+[)）])\s*'
        r'([\u4e00-\u9fa5]{2,20}'
        r'(?:[ \t]*[—–\-]+[ \t]*[\u4e00-\u9fa5]{1,10})?'
        r'(?:[(（][^()（）\n]+[)）])?)'
    )
    # Exclusion entries: "1. ...", "(1)..." and Chinese-numeral "(一)...".
    _EXCLUSION_RE = re.compile(
        r'(?:\d+[.、]|[(（][\d一二三四五六七八九十百]+[)）])\s*([^\n]{10,200})'
    )
    # Claim steps. Group 1: explicit step markers ("第一步:报案", "步骤1 ..."),
    # where a short title is legitimate. Group 2: plain numbered lines, which
    # keep the original minimum length to limit false positives.
    _STEP_RE = re.compile(
        r'(?:第[一二三四五六七八九十]+步|步骤\s*\d+)\s*[:：、.]?\s*([^\n]{2,100})'
        r'|\d+[.、]\s*([^\n]{5,100})'
    )
    # (InsuranceProduct attribute, key in self.patterns)
    _PERIOD_FIELDS = (
        ("waiting_period_days", "waiting_period"),
        ("hesitation_period_days", "hesitation_period"),
        ("grace_period_days", "grace_period"),
    )

    def __init__(self, nlp_model):
        """Store the NLP model and compile the section/field patterns."""
        self.nlp = nlp_model
        self.patterns = {
            "waiting_period": self._period_pattern("等待期"),
            "hesitation_period": self._period_pattern("犹豫期"),
            "grace_period": self._period_pattern("宽限期"),
            "product_code": re.compile(r'(?:备案号|产品代码)[:：]\s*([A-Z0-9\-]+)'),
            "coverage_section": re.compile(r'(?:保障范围|保险责任|重大疾病列表)[\s\S]*?(?=责任免除|免责条款|$)'),
            "exclusion_section": re.compile(r'(?:责任免除|免责条款)[\s\S]*?(?=保险金的申请|理赔流程|$)'),
            "claim_section": re.compile(r'(?:理赔流程|保险金的申请)[\s\S]*?(?=其他事项|$)'),
        }

    @staticmethod
    def _period_pattern(label: str):
        """Build a pattern matching "<label>为N天" as well as "N日内为<label>"."""
        return re.compile(
            rf'{label}[为是:：]\s*(\d+)\s*(?:天|日)'
            rf'|(\d+)\s*(?:天|日)内?[为是]{label}'
        )

    def _find_days(self, pattern_key: str, text: str) -> Optional[int]:
        """Return the day count matched by ``self.patterns[pattern_key]``, or None."""
        found = self.patterns[pattern_key].search(text)
        if not found:
            return None
        # Exactly one alternative captured the digits.
        digits = next((g for g in found.groups() if g), None)
        return int(digits) if digits is not None else None

    def _section(self, pattern_key: str, text: str) -> str:
        """Return the text of the section matched by ``pattern_key`` ("" if absent)."""
        found = self.patterns[pattern_key].search(text)
        return found.group() if found else ""

    def parse(self, text: str) -> InsuranceProduct:
        """Parse clause text into an ``InsuranceProduct``.

        Fields that cannot be found keep the dataclass defaults.
        """
        product = InsuranceProduct(product_name=self._extract_product_name(text))

        code_match = self.patterns["product_code"].search(text)
        if code_match:
            product.product_code = code_match.group(1)

        for attr, pattern_key in self._PERIOD_FIELDS:
            days = self._find_days(pattern_key, text)
            if days is not None:
                setattr(product, attr, days)

        product.coverages = self._extract_coverages(text)
        product.exclusions = self._extract_exclusions(text)
        product.claim_process = self._extract_claim_process(text)
        return product

    def _extract_product_name(self, text: str) -> str:
        """Return the first of the leading five lines that looks like a product title."""
        for line in text.strip().split('\n')[:5]:
            if '保险' in line and len(line) < 50:
                # Drop markdown heading markers.
                return line.strip().replace('#', '').strip()
        return "未知产品"

    def _extract_coverages(self, text: str) -> List[CoverageItem]:
        """Extract the enumerated diseases of the coverage section."""
        coverages = []
        for found in self._DISEASE_RE.finditer(self._section("coverage_section", text)):
            disease_name = found.group(1).strip()
            if len(disease_name) > 2:
                coverages.append(CoverageItem(
                    disease_name=disease_name,
                    coverage_type=CoverageType.CRITICAL_ILLNESS,
                ))
        return coverages

    def _extract_exclusions(self, text: str) -> List[ExclusionItem]:
        """Extract the enumerated clauses of the exclusion section."""
        exclusions = []
        section_text = self._section("exclusion_section", text)
        # exclusion_no is the ordinal of the regex match, so entries that are
        # filtered out below leave a gap in the numbering (unchanged behaviour).
        for number, found in enumerate(self._EXCLUSION_RE.finditer(section_text), 1):
            exclusion_text = found.group(1).strip()
            if len(exclusion_text) > 10:
                exclusions.append(ExclusionItem(
                    exclusion_no=number,
                    exclusion_text=exclusion_text,
                ))
        return exclusions

    def _extract_claim_process(self, text: str) -> List[str]:
        """Extract the ordered claim-process steps of the claim section."""
        steps = []
        for found in self._STEP_RE.finditer(self._section("claim_section", text)):
            titled, numbered = found.group(1), found.group(2)
            if titled is not None:
                step = titled.strip()
                if len(step) >= 2:
                    steps.append(step)
            else:
                step = numbered.strip()
                if len(step) > 5:
                    steps.append(step)
        return steps
1.4 解析效果测试
python
# Test fixture: a mock critical-illness clause with markdown-style headings.
# It contains a filing number, a waiting-period sentence, a numbered disease
# list, a "(一)…(九)" exclusion list and a four-step claim procedure.
SAMPLE_CLAUSE = """
# 康宁重大疾病保险条款
备案号:C0001
## 保险责任
### 等待期
本合同生效之日起180日内为等待期。等待期内确诊本合同约定的重大疾病,本公司不承担保险责任,但退还已交保费。
### 保障范围
本合同保障以下120种重大疾病:
1. 恶性肿瘤------重度
2. 急性心肌梗死
3. 脑中风后遗症
4. 重大器官移植术或造血干细胞移植术
5. 冠状动脉搭桥术(或称冠状动脉旁路移植术)
6. 终末期肾病(或称慢性肾功能衰竭尿毒症期)
7. 多个肢体缺失
8. 急性或亚急性重症肝炎
9. 良性脑肿瘤
10. 慢性肝功能衰竭失代偿期
## 责任免除
因下列情形之一导致被保险人发生本合同约定的重大疾病,本公司不承担给付保险金的责任:
(一)投保人对被保险人的故意杀害、故意伤害;
(二)被保险人故意犯罪或抗拒依法采取的刑事强制措施;
(三)被保险人故意自伤、或自本合同成立或者本合同效力恢复之日起2年内自杀,但被保险人自杀时为无民事行为能力人的除外;
(四)被保险人服用、吸食或注射毒品;
(五)被保险人酒后驾驶、无合法有效驾驶证驾驶,或驾驶无合法有效行驶证的机动车;
(六)被保险人感染艾滋病病毒或患艾滋病;
(七)战争、军事冲突、暴乱或武装叛乱;
(八)核爆炸、核辐射或核污染;
(九)遗传性疾病,先天性畸形、变形或染色体异常。
## 保险金的申请
第一步:报案
被保险人确诊重大疾病后,应在10日内通知本公司。
第二步:提交资料
申请人应提交以下资料:
- 保险合同
- 申请人的有效身份证件
- 医院出具的诊断证明书
- 相关的病历资料
第三步:审核
本公司在收到完整的申请资料后,将在5个工作日内完成审核。
第四步:给付
审核通过后,本公司将在10个工作日内将保险金转入申请人指定的银行账户。
"""
# Run the parser on the sample clause and print a short report.
parser = InsuranceClauseParser(nlp)
product = parser.parse(SAMPLE_CLAUSE)

# Headline figures, one "label:value" line each.
summary_rows = (
    ("产品名称", product.product_name),
    ("备案号", product.product_code),
    ("等待期", f"{product.waiting_period_days}天"),
    ("保障疾病数量", len(product.coverages)),
    ("免责条款数量", len(product.exclusions)),
    ("理赔流程步骤", len(product.claim_process)),
)
for label, value in summary_rows:
    print(f"{label}:{value}")

# First five covered diseases.
print("\n前5个保障疾病:")
for coverage in product.coverages[:5]:
    print(" - " + coverage.disease_name)

# First three exclusions, each truncated to 50 characters.
print("\n前3条免责条款:")
for exclusion in product.exclusions[:3]:
    preview = exclusion.exclusion_text[:50]
    print(f" {exclusion.exclusion_no}. {preview}...")
第二部分:Neo4j知识图谱构建
2.1 知识图谱Schema设计
python
from neo4j import GraphDatabase
import json
class InsuranceKnowledgeGraph:
    """Thin wrapper around a Neo4j driver for storing and querying insurance products.

    Graph model written by ``create_product``::

        (:Product {code})-[:COVERS {ratio, description}]->(:Disease {name})
        (:Product {code})-[:EXCLUDES {reason}]->(:Exclusion {no, text})

    Instances can be used as context managers; the driver is closed on exit.
    """

    # (Cypher statement, message prefix printed when the server rejects it)
    _SCHEMA_STATEMENTS = (
        ("CREATE CONSTRAINT product_code IF NOT EXISTS FOR (p:Product) REQUIRE p.code IS UNIQUE", "约束已存在或创建失败"),
        ("CREATE CONSTRAINT disease_name IF NOT EXISTS FOR (d:Disease) REQUIRE d.name IS UNIQUE", "约束已存在或创建失败"),
        ("CREATE CONSTRAINT company_name IF NOT EXISTS FOR (c:Company) REQUIRE c.name IS UNIQUE", "约束已存在或创建失败"),
        ("CREATE INDEX product_type_idx IF NOT EXISTS FOR (p:Product) ON (p.type)", "索引已存在或创建失败"),
        ("CREATE INDEX disease_type_idx IF NOT EXISTS FOR (d:Disease) ON (d.category)", "索引已存在或创建失败"),
    )

    def __init__(self, uri: str, user: str, password: str):
        """Open a driver for ``uri`` using basic authentication."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def init_schema(self):
        """Create the uniqueness constraints and indexes (best effort).

        A statement rejected by the server is reported and skipped. Driver-level
        failures (e.g. the server being unreachable) are no longer swallowed.
        """
        from neo4j.exceptions import Neo4jError

        with self.driver.session() as session:
            for statement, failure_label in self._SCHEMA_STATEMENTS:
                try:
                    # consume() forces execution so errors surface inside the try.
                    session.run(statement).consume()
                except Neo4jError as e:
                    print(f"{failure_label}: {e}")

    def create_product(self, product: InsuranceProduct):
        """Upsert a product with its coverage and exclusion relationships.

        The import runs in one transaction, so a failure leaves no partial
        product behind. Nodes and relationships are merged on their identity
        only and their properties are then SET, so re-importing a product
        with changed values updates it in place. The previous code merged on
        the full property map, which tried to create a second ``Product``
        with the same code and added duplicate ``COVERS`` relationships.

        Raises:
            ValueError: if ``product.product_code`` is empty; an empty code
                would merge unrelated products into a single node.
        """
        if not product.product_code:
            raise ValueError(
                f"product_code is required to import product {product.product_name!r}"
            )

        with self.driver.session() as session:
            with session.begin_transaction() as tx:
                # Product node
                tx.run("""
                    MERGE (p:Product {code: $code})
                    SET p.name = $name,
                        p.type = $type,
                        p.waiting_period = $waiting_period,
                        p.hesitation_period = $hesitation_period,
                        p.grace_period = $grace_period
                """, {
                    "code": product.product_code,
                    "name": product.product_name,
                    "type": product.insurance_type,
                    "waiting_period": product.waiting_period_days,
                    "hesitation_period": product.hesitation_period_days,
                    "grace_period": product.grace_period_days,
                })
                # Coverage relationships
                for coverage in product.coverages:
                    tx.run("""
                        MATCH (p:Product {code: $product_code})
                        MERGE (d:Disease {name: $disease_name})
                          ON CREATE SET d.category = $category
                        MERGE (p)-[r:COVERS]->(d)
                        SET r.ratio = $ratio,
                            r.description = $description
                    """, {
                        "disease_name": coverage.disease_name,
                        "category": coverage.coverage_type.value,
                        "product_code": product.product_code,
                        "ratio": coverage.coverage_ratio,
                        "description": coverage.description,
                    })
                # Exclusion relationships (clause text is capped at 200 characters)
                for exclusion in product.exclusions:
                    tx.run("""
                        MATCH (p:Product {code: $product_code})
                        MERGE (e:Exclusion {no: $no, text: $text})
                          ON CREATE SET e.type = $type
                        MERGE (p)-[r:EXCLUDES]->(e)
                        SET r.reason = $text
                    """, {
                        "no": exclusion.exclusion_no,
                        "text": exclusion.exclusion_text[:200],
                        "type": exclusion.exclusion_type,
                        "product_code": product.product_code,
                    })
                # Commit explicitly; an exception above rolls the transaction back.
                tx.commit()
        print(f"产品 '{product.product_name}' 已导入知识图谱")

    def _fetch(self, query: str, params: dict) -> list:
        """Run a read query and return its records as plain dicts."""
        with self.driver.session() as session:
            return [dict(record) for record in session.run(query, params)]

    def query_product_coverage(self, product_code: str) -> list:
        """Return the diseases covered by a product (keys: disease, ratio, desc)."""
        # `desc` is back-quoted because DESC is a Cypher keyword.
        return self._fetch("""
            MATCH (p:Product {code: $code})-[r:COVERS]->(d:Disease)
            RETURN d.name AS disease, r.ratio AS ratio, r.description AS `desc`
            ORDER BY d.name
        """, {"code": product_code})

    def query_disease_products(self, disease_name: str) -> list:
        """Return every product covering a disease (keys: product, code, ratio)."""
        return self._fetch("""
            MATCH (p:Product)-[r:COVERS]->(d:Disease {name: $name})
            RETURN p.name AS product, p.code AS code, r.ratio AS ratio
            ORDER BY p.name
        """, {"name": disease_name})

    def query_exclusion_details(self, product_code: str) -> list:
        """Return the exclusion clauses of a product (keys: no, text, type)."""
        return self._fetch("""
            MATCH (p:Product {code: $code})-[r:EXCLUDES]->(e:Exclusion)
            RETURN e.no AS no, e.text AS text, e.type AS type
            ORDER BY e.no
        """, {"code": product_code})
2.2 知识图谱查询示例
python
import os

# Connect to the knowledge graph. Connection settings can be overridden through
# NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD; the fallbacks are the tutorial
# placeholders and must not be used in production.
kg = InsuranceKnowledgeGraph(
    uri=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "your_password"),
)
try:
    # Create constraints and indexes
    kg.init_schema()
    # Import the parsed product
    kg.create_product(product)
    # Diseases covered by the product
    coverages = kg.query_product_coverage("C0001")
    print(f"\n产品 C0001 保障 {len(coverages)} 种疾病")
    for c in coverages[:5]:
        print(f" - {c['disease']} (赔付比例: {c['ratio']})")
    # Every product covering one disease
    products = kg.query_disease_products("恶性肿瘤------重度")
    print(f"\n保障恶性肿瘤的产品数量: {len(products)}")
    # Exclusion clauses of the product
    exclusions = kg.query_exclusion_details("C0001")
    print(f"\n免责条款数量: {len(exclusions)}")
finally:
    # Always release the driver, even when a query fails.
    kg.close()
第三部分:保险FAQ语义匹配系统
3.1 FAQ向量化
python
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Tuple
class InsuranceFAQMatcher:
    """Semantic FAQ matcher based on sentence embeddings and cosine similarity."""

    def __init__(self, model_name: str = "shibing624/text2vec-base-chinese"):
        """Load the sentence-embedding model; no FAQs are loaded yet."""
        self.model = SentenceTransformer(model_name)
        self.faqs = []
        self.faq_embeddings = None

    def load_faqs(self, faqs: List[dict]):
        """Load FAQ entries and embed their questions.

        Args:
            faqs: entries shaped like
                ``{"question": "...", "answer": "...", "category": "..."}``;
                ``category`` is optional.
        """
        self.faqs = faqs
        if not faqs:
            # Nothing to embed; search() returns no results in this state.
            self.faq_embeddings = None
            print("已加载 0 条FAQ")
            return
        questions = [faq["question"] for faq in faqs]
        self.faq_embeddings = self.model.encode(questions, convert_to_numpy=True)
        print(f"已加载 {len(faqs)} 条FAQ,向量维度: {self.faq_embeddings.shape}")

    def search(self, query: str, top_k: int = 3, threshold: float = 0.75) -> List[dict]:
        """Return up to ``top_k`` FAQs whose cosine similarity is >= ``threshold``.

        Results are ordered by descending similarity. An empty list is
        returned when no FAQs are loaded or ``top_k`` is not positive; the
        previous implementation raised in the first case.
        """
        if self.faq_embeddings is None or len(self.faqs) == 0 or top_k <= 0:
            return []

        query_vec = self.model.encode([query], convert_to_numpy=True)[0]
        dots = self.faq_embeddings @ query_vec
        denom = np.linalg.norm(self.faq_embeddings, axis=1) * np.linalg.norm(query_vec)
        # A zero-norm vector gets similarity 0 instead of NaN.
        similarities = np.divide(
            dots, denom, out=np.zeros_like(dots, dtype=float), where=denom > 0
        )

        results = []
        for idx in np.argsort(similarities)[::-1][:top_k]:
            score = float(similarities[idx])
            if score >= threshold:
                faq = self.faqs[idx]
                results.append({
                    "question": faq["question"],
                    "answer": faq["answer"],
                    "category": faq.get("category", ""),
                    "score": score,
                })
        return results

    def batch_search(
        self, queries: List[str], top_k: int = 3, threshold: float = 0.75
    ) -> List[List[dict]]:
        """Run ``search`` for every query; results keep the order of ``queries``."""
        return [self.search(q, top_k, threshold) for q in queries]
3.2 FAQ数据集示例
python
# Sample FAQ entries for the matcher. Each entry has a "question" (the text that
# gets embedded), an "answer" and a "category" label.
INSURANCE_FAQ_DATASET = [
    {
        "question": "甲状腺结节能买重疾险吗",
        "answer": "甲状腺结节1-3级(TI-RADS分级)通常可正常承保,4级及以上需提交近6个月内的超声检查报告,由核保人员评估后决定是否承保及是否附加除外责任。具体以各保险公司核保结论为准。",
        "category": "核保"
    },
    {
        "question": "百万医疗险的免赔额是怎么计算的",
        "answer": "百万医疗险通常设有1万元免赔额,即当年累计医疗费用超过1万元的部分才由保险公司赔付。部分产品设有0免赔额选项,但对应保费更高。免赔额按自然年度计算,次年重新起算。",
        "category": "产品"
    },
    {
        "question": "重疾险买消费型还是返还型",
        "answer": "消费型重疾险保费较低,保障期内未出险则保费不退还;返还型重疾险保费较高,保障期满或身故可返还保费或保额。从保障效率角度,消费型性价比更高;从资金规划角度,返还型适合不愿意'白交钱'的投保人。建议优先保障额度充足,再考虑是否返还。",
        "category": "产品"
    },
    {
        "question": "保险买了两年想退保,能退多少",
        "answer": "退保时退还的是保单的现金价值,不是已交保费。现金价值在保单前几年通常远低于已交保费,以年交保费1万元为例,第2年退保可能只能拿回2000-4000元左右。具体现金价值可在保单中查看现金价值表。建议在犹豫期内(通常15天)退保可全额退还保费。",
        "category": "退保"
    },
    {
        "question": "带病投保被拒赔了怎么办",
        "answer": "带病投保被拒赔,首先需确认投保时是否如实告知。如果投保时未如实告知既往症,保险公司有权拒赔并解除合同。如果已如实告知且保险公司已承保,则拒赔可向保险公司申诉,或向银保监会消费者权益保护局投诉。保留好投保时的告知记录和核保结论是关键。",
        "category": "理赔"
    },
    {
        "question": "短期医疗险和长期医疗险怎么选",
        "answer": "短期医疗险保障期限通常为1年,需每年续保,存在停售无法续保的风险,但保费较低。长期医疗险保障期限可达10年或20年,保证续保期间内即使产品停售也可续保,但保费相对较高。建议优先选择保证续保6年以上的产品,避免因健康变化或产品停售失去保障。",
        "category": "产品"
    },
    {
        "question": "买了多份保险可以重复理赔吗",
        "answer": "分情况:重疾险、寿险、意外身故/伤残属于定额给付型,多份保单可以重复理赔。医疗险属于报销型,实际医疗费用为限,多份医疗险不能重复报销超出实际花费的部分。意外医疗也属于报销型。简单记:给付型可叠加,报销型不叠加。",
        "category": "理赔"
    },
    {
        "question": "30岁买重疾险保额50万大概多少钱",
        "answer": "以消费型重疾险为例,30岁男性,50万保额,30年缴费,保至70岁,年保费大约在3000-5000元之间。女性略低,约2500-4500元。具体保费受保障疾病数量、等待期、保险公司定价策略等因素影响。返还型重疾险保费约为消费型的2-3倍。",
        "category": "费率"
    },
]
# Build the matcher and index the sample FAQ dataset.
faq_matcher = InsuranceFAQMatcher()
faq_matcher.load_faqs(INSURANCE_FAQ_DATASET)

# Paraphrased user questions used to exercise the semantic search.
test_queries = [
    "甲状腺有问题还能买保险吗",
    "医疗险的免赔额什么意思",
    "退保能退多少钱",
    "多份保险怎么赔",
]
for query in test_queries:
    results = faq_matcher.search(query, top_k=2)
    print("\n查询: " + query)
    # One line per hit: matched question plus similarity to three decimals.
    for r in results:
        matched_question = r['question']
        similarity = format(r['score'], ".3f")
        print(f" 匹配: {matched_question} (相似度: {similarity})")
第四部分:理赔案例结构化与Schema.org标注
4.1 理赔案例数据模型
python
from datetime import date
@dataclass
class ClaimRecord:
    """Structured record of one settled claim."""

    case_id: str
    age_range: str  # age bracket of the policyholder
    gender: str
    product_name: str  # insured product
    coverage_amount: float  # sum insured, in yuan
    claim_type: str
    diagnosis: str  # diagnosis / cause of the claim
    claim_amount: float  # amount paid, in yuan
    claim_duration_days: int  # days taken to settle the claim
    claim_date: Optional[date]  # settlement date

    def to_searchable_text(self) -> str:
        """Render the record as one plain-text sentence for retrieval.

        Amounts are converted from yuan to units of 10,000 yuan (万元).
        """
        coverage_wan = self.coverage_amount / 10000
        claim_wan = self.claim_amount / 10000
        fragments = [
            f"{self.age_range}{self.gender},投保{self.product_name}",
            f"(保额{coverage_wan:.0f}万元),",
            f"{self.claim_type}出险({self.diagnosis}),",
            f"理赔金额{claim_wan:.1f}万元,",
            f"理赔时效{self.claim_duration_days}天。",
        ]
        return "".join(fragments)

    def to_schema_org(self) -> dict:
        """Build a Schema.org ``Review`` dict describing this claim.

        ``datePublished`` is the ISO date of ``claim_date``, or an empty
        string when no date is set.
        """
        if self.claim_date:
            published = self.claim_date.isoformat()
        else:
            published = ""
        reviewed_item = {
            "@type": "InsuranceProduct",
            "name": self.product_name
        }
        author = {
            "@type": "Organization",
            "name": "XX保险公司"
        }
        return {
            "@context": "https://schema.org",
            "@type": "Review",
            "itemReviewed": reviewed_item,
            "reviewBody": self.to_searchable_text(),
            "author": author,
            "datePublished": published
        }
# Sample claim records: two critical-illness claims on the same product and one
# medical claim. Amounts are in yuan.
SAMPLE_CLAIMS = [
    ClaimRecord(
        case_id="CLM-2024-001",
        age_range="35-40岁",
        gender="男",
        product_name="康宁重大疾病保险",
        coverage_amount=500000,
        claim_type="重大疾病",
        diagnosis="甲状腺乳头状癌",
        claim_amount=500000,
        claim_duration_days=7,
        claim_date=date(2024, 3, 15)
    ),
    ClaimRecord(
        case_id="CLM-2024-002",
        age_range="45-50岁",
        gender="女",
        product_name="康宁重大疾病保险",
        coverage_amount=300000,
        claim_type="重大疾病",
        diagnosis="乳腺浸润性导管癌",
        claim_amount=300000,
        claim_duration_days=5,
        claim_date=date(2024, 5, 22)
    ),
    ClaimRecord(
        case_id="CLM-2024-003",
        age_range="28-35岁",
        gender="男",
        product_name="安心百万医疗险",
        coverage_amount=2000000,
        claim_type="医疗",
        diagnosis="急性阑尾炎住院手术",
        claim_amount=28500,
        claim_duration_days=12,
        claim_date=date(2024, 7, 8)
    ),
]
4.2 产品页面Schema.org标注生成
python
def generate_product_schema(
    product: "InsuranceProduct",
    claims: "List[ClaimRecord]",
    rating_value: Optional[str] = "4.5",
    best_rating: str = "5",
) -> dict:
    """Build the JSON-LD dict for a product page.

    Args:
        product: parsed product; its name, code, type, company, coverages and
            periods are read.
        claims: claim records; only their count is used (as ``reviewCount``).
        rating_value: value emitted as ``aggregateRating.ratingValue``. The
            default reproduces the constant that used to be hard-coded; pass a
            real figure, or ``None`` to omit ``aggregateRating`` entirely.
        best_rating: value emitted as ``aggregateRating.bestRating``.

    Returns:
        A dict ready for ``json.dumps``. ``aggregateRating`` is present only
        when there is at least one claim and ``rating_value`` is not ``None``.
    """
    # NOTE(review): "InsuranceProduct" does not appear to be a type in the
    # schema.org vocabulary (the closest seems to be FinancialProduct) —
    # confirm with a structured-data validator before publishing.
    schema = {
        "@context": "https://schema.org",
        "@type": "InsuranceProduct",
        "name": product.product_name,
        "productID": product.product_code,
        "description": (
            f"{product.insurance_type},保障{len(product.coverages)}种疾病/责任,"
            f"等待期{product.waiting_period_days}天,"
            f"犹豫期{product.hesitation_period_days}天"
        ),
        "provider": {
            "@type": "Organization",
            # Placeholder name when the product carries no company.
            "name": product.company_name or "XX保险公司"
        },
        "offers": {
            "@type": "AggregateOffer",
            "availability": "https://schema.org/InStock"
        }
    }

    total_claims = len(claims)
    if total_claims > 0 and rating_value is not None:
        schema["aggregateRating"] = {
            "@type": "AggregateRating",
            "ratingValue": str(rating_value),
            "reviewCount": str(total_claims),
            "bestRating": str(best_rating)
        }
    return schema
def generate_product_page_with_schema(
    product: InsuranceProduct,
    claims: List[ClaimRecord]
) -> str:
    """Render a product page (HTML) with embedded JSON-LD structured data.

    Product, coverage, exclusion and claim text comes from parsed documents,
    so every interpolated value is HTML-escaped and the JSON-LD payload is
    made safe for embedding inside a ``<script>`` element.

    Args:
        product: parsed product to render.
        claims: claim records rendered via ``to_searchable_text()``.

    Returns:
        The complete HTML document as a string.
    """
    import json
    from html import escape

    # "</" inside a script element could terminate it early; "<\/" is an
    # equivalent JSON escape, so the parsed data is unchanged.
    schema_json = json.dumps(
        generate_product_schema(product, claims),
        ensure_ascii=False,
        indent=2
    ).replace("</", "<\\/")

    safe_name = escape(product.product_name)
    safe_code = escape(product.product_code)

    # Coverage list
    coverage_items = "\n".join(
        f'<li itemprop="itemListElement">{escape(c.disease_name)}</li>'
        for c in product.coverages
    )
    # Exclusion list (truncate first, then escape, so no entity is cut in half)
    exclusion_items = "\n".join(
        f'<li>{escape(e.exclusion_text[:100])}</li>'
        for e in product.exclusions
    )
    # Claim cases
    claim_items = "\n".join(
        f'<li>{escape(claim.to_searchable_text())}</li>'
        for claim in claims
    )

    html = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>{safe_name} - 条款详情</title>
<script type="application/ld+json">
{schema_json}
</script>
</head>
<body>
<article itemscope itemtype="https://schema.org/InsuranceProduct">
<h1 itemprop="name">{safe_name}</h1>
<p>备案号:<span itemprop="productID">{safe_code}</span></p>
<p>等待期:{product.waiting_period_days}天 |
犹豫期:{product.hesitation_period_days}天 |
宽限期:{product.grace_period_days}天</p>
<section>
<h2>保障范围</h2>
<ul>
{coverage_items}
</ul>
</section>
<section>
<h2>责任免除</h2>
<ul>
{exclusion_items}
</ul>
</section>
<section>
<h2>理赔案例</h2>
<ul>
{claim_items}
</ul>
</section>
</article>
</body>
</html>"""
    return html
第五部分:GEO(Generative Engine Optimization,生成式引擎优化)效果追踪
5.1 AI引用监控
python
import time
from datetime import datetime, timedelta
class GEOTracker:
    """Records AI-search observations in a JSON file and summarises citation rates."""

    def __init__(self, storage_path: str = "geo_tracking.json"):
        """Load any existing history from ``storage_path``."""
        self.storage_path = storage_path
        self.records = self._load_records()

    def _load_records(self) -> list:
        """Return the stored records, or an empty list if the file does not exist."""
        import os

        if os.path.exists(self.storage_path):
            with open(self.storage_path, "r", encoding="utf-8") as f:
                return json.load(f)
        return []

    def _save_records(self):
        """Persist all records.

        The data is written to a sibling temporary file and then moved over
        the target, so an interrupted write cannot leave a truncated history
        file behind.
        """
        import os

        tmp_path = f"{self.storage_path}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(self.records, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, self.storage_path)

    def track_query(self, query: str, ai_response: str, products_found: list[str]):
        """Append one observation and save the history.

        Only the length of ``ai_response`` is stored, not its text.
        """
        record = {
            "timestamp": datetime.now().isoformat(),
            "query": query,
            "products_found": products_found,
            "response_length": len(ai_response),
            "product_mentioned": len(products_found) > 0
        }
        self.records.append(record)
        self._save_records()

    def get_summary(self, days: int = 30) -> dict:
        """Summarise the records of the last ``days`` days.

        Returns:
            A dict with the query count, the number and share of queries that
            mentioned at least one product, and the ten most frequently
            mentioned products as ``(product, count)`` pairs.
        """
        from collections import Counter

        cutoff = datetime.now() - timedelta(days=days)
        recent = [
            r for r in self.records
            if datetime.fromisoformat(r["timestamp"]) > cutoff
        ]
        total_queries = len(recent)
        queries_with_product = sum(1 for r in recent if r["product_mentioned"])

        product_counts = Counter()
        for r in recent:
            product_counts.update(r.get("products_found", []))

        return {
            "period_days": days,
            "total_queries": total_queries,
            "queries_with_product": queries_with_product,
            "citation_rate": queries_with_product / total_queries if total_queries else 0,
            "top_products": product_counts.most_common(10)
        }
5.2 效果验证脚本
python
def verify_geo_effectiveness(
    product_code: str,
    test_queries: list[str],
    mention_checker=None,
) -> dict:
    """Measure how often a product is cited across a set of test queries.

    Args:
        product_code: code of the product being checked.
        test_queries: queries to evaluate.
        mention_checker: optional callable ``(product_code, query) -> bool``
            that runs the query against a real AI search API and reports
            whether the product was cited. Without it every query counts as
            "not mentioned", which is the behaviour of the original stub.

    Returns:
        A dict with the per-query outcome, the number of mentions and the
        citation rate (0 when ``test_queries`` is empty).
    """
    results = {
        "product_code": product_code,
        "total_queries": len(test_queries),
        "mentioned_count": 0,
        "queries": []
    }
    for query in test_queries:
        if mention_checker is None:
            mentioned = False
        else:
            mentioned = bool(mention_checker(product_code, query))
        results["queries"].append({
            "query": query,
            "mentioned": mentioned
        })
        if mentioned:
            results["mentioned_count"] += 1
    results["citation_rate"] = (
        results["mentioned_count"] / results["total_queries"]
        if results["total_queries"] else 0
    )
    return results
# Queries used for the verification run.
TEST_QUERIES = [
    "50岁买什么重疾险",
    "百万医疗险推荐",
    "甲状腺结节能买保险吗",
    "消费型重疾险哪个好",
    "XX重疾险保障范围",
    "重疾险等待期多少天",
    "保险理赔流程",
    "带病投保怎么处理",
]
verification = verify_geo_effectiveness("C0001", TEST_QUERIES)

# Print the report: title first, then one line per metric.
print("\nGEO效果验证报告")
report_lines = (
    f"产品: {verification['product_code']}",
    f"测试查询: {verification['total_queries']}条",
    f"被引用次数: {verification['mentioned_count']}次",
    f"引用率: {verification['citation_rate']:.1%}",
)
for report_line in report_lines:
    print(report_line)
总结
本文介绍了保险行业GEO的完整技术方案,覆盖以下四个核心环节,并辅以第五部分的GEO效果追踪脚本用于验证:
- 条款NLP解析:以spaCy自定义实体规则配合正则表达式,从非结构化条款文本中提取保障范围、免责条款、等待期等关键字段
- 知识图谱构建:使用Neo4j构建保险产品-疾病-免责条款的关系图谱,支持多维度查询
- FAQ语义匹配:使用SentenceTransformer实现FAQ的语义检索,支持自然语言查询
- Schema.org标注:生成符合Schema.org标准的结构化数据,嵌入产品页面HTML
保险行业的GEO核心挑战是产品信息的非结构化。PDF条款、图片格式的保障范围、JavaScript渲染的产品对比表格——这些内容呈现方式在浏览器中渲染得很好,但对于只能读取DOM和纯文本的AI爬虫来说,等同于空白页面。
本文方案的技术价值在于:将条款解析从人工阅读PDF的流程,转变为NLP自动提取+知识图谱关联的自动化流程。当保险产品的保障范围、免责条款、理赔案例都能被机器精确读取时,AI搜索引擎在回答保险相关问题时,才有足够的数据支撑来推荐你的产品。