保险条款NLP解析与知识图谱搭建：让AI准确理解保险产品的技术方案

保险行业的产品信息管理长期面临一个结构性问题：产品条款以非结构化的PDF或图片形式存在，AI搜索引擎无法有效提取其中的关键信息。本文介绍一套基于spaCy NLP和Neo4j知识图谱的技术方案，实现保险条款的自动解析、结构化存储和智能问答。

技术架构概览

复制代码

PDF/图片条款
    ↓
OCR文字识别 (Tesseract)
    ↓
NLP实体抽取 (spaCy)
    ↓
知识图谱构建 (Neo4j)
    ↓
语义检索 (向量数据库)
    ↓
智能问答API

第一部分：保险条款NLP解析

1.1 spaCy环境准备

bash 复制代码

# sentence-transformers and numpy are imported by the FAQ matcher in Part 3,
# so install them together with the NLP / graph dependencies.
pip install spacy spacy-transformers neo4j python-dotenv sentence-transformers numpy
python -m spacy download zh_core_web_lg

1.2 自定义保险领域NER模型

保险条款中包含大量专业实体，需要在通用中文模型基础上进行扩展。

python 复制代码

import spacy
from spacy.tokens import Span
from spacy.language import Language

# Load the Chinese spaCy pipeline
nlp = spacy.load("zh_core_web_lg")

# Insurance-domain entity labels, mapped to their Chinese display descriptions
INSURANCE_ENTITY_LABELS = {
    "INS_PRODUCT": "保险产品名称",
    "INS_DISEASE": "疾病名称",
    "INS_AMOUNT": "金额/保额",
    "INS_PERIOD": "期限（天/年）",
    "INS_CLAUSE": "条款编号",
    "INS_EXCLUSION": "免责情形",
    "INS_PROCESS": "流程步骤"
}

from spacy.pipeline import EntityRuler

# Product and disease terms are registered as phrase (string) patterns: the
# ruler tokenizes them itself, so a term can still match when the Chinese
# tokenizer splits it into several tokens. A single-token {"LOWER": ...}
# pattern would only match when the whole term is one token.
_INS_PRODUCT_TERMS = ["重疾险", "医疗险", "意外险", "寿险", "年金险", "百万医疗险", "防癌险"]
_INS_DISEASE_TERMS = [
    "恶性肿瘤", "急性心肌梗塞", "急性心肌梗死", "脑中风后遗症",
    "重大器官移植", "冠状动脉搭桥术", "终末期肾病",
]

INSURANCE_ENTITY_PATTERNS = (
    [{"label": "INS_PRODUCT", "pattern": term} for term in _INS_PRODUCT_TERMS]
    + [{"label": "INS_DISEASE", "pattern": term} for term in _INS_DISEASE_TERMS]
    + [
        # Amount: a digit token followed by a currency unit token
        {"label": "INS_AMOUNT", "pattern": [{"IS_DIGIT": True}, {"LOWER": {"IN": ["万", "元", "万元"]}}]},
        # Period: a digit token followed by a time unit token
        {"label": "INS_PERIOD", "pattern": [{"IS_DIGIT": True}, {"LOWER": {"IN": ["天", "日", "年", "个月"]}}]},
    ]
)

# Build the ruler exactly once, before the component below is added to the
# pipeline. The previous version called nlp.add_pipe("entity_ruler") inside the
# component, i.e. once per processed document: that mutates the pipeline while
# it is running and fails on the second document because the pipe name is
# already taken.
# overwrite_ents=True lets rule matches replace overlapping statistical NER
# spans, which is the priority the original `before="ner"` placement aimed for.
_insurance_ruler = EntityRuler(nlp, overwrite_ents=True)
_insurance_ruler.add_patterns(INSURANCE_ENTITY_PATTERNS)


@Language.component("insurance_entity_ruler")
def insurance_entity_ruler(doc):
    """Annotate ``doc`` with insurance-domain entities from the rule patterns.

    Args:
        doc: The spaCy ``Doc`` produced by the preceding pipeline components.

    Returns:
        The same ``Doc`` with ``doc.ents`` updated by the prebuilt rule matcher.
    """
    return _insurance_ruler(doc)


# Register the component so it runs after the statistical NER
nlp.add_pipe("insurance_entity_ruler", after="ner")

1.3 条款文本结构化解析

python 复制代码

import re
from dataclasses import dataclass, field
from typing import List, Optional, Dict
from enum import Enum

class CoverageType(Enum):
    """Coverage categories; each member's value is its Chinese display label."""
    CRITICAL_ILLNESS = "重大疾病"  # critical illness
    LIGHT_ILLNESS = "轻症疾病"  # minor illness
    MEDIUM_ILLNESS = "中症疾病"  # moderate illness
    MEDICAL = "医疗保障"  # medical coverage
    ACCIDENT = "意外保障"  # accident coverage
    DEATH = "身故保障"  # death benefit
    DISABILITY = "伤残保障"  # disability benefit

@dataclass
class CoverageItem:
    """One covered disease/liability entry of a product."""
    disease_name: str
    disease_code: Optional[str] = None  # ICD-10 code
    coverage_type: CoverageType = CoverageType.CRITICAL_ILLNESS
    coverage_ratio: float = 1.0  # payout ratio
    description: str = ""

@dataclass
class ExclusionItem:
    """One liability-exclusion clause of a product."""
    exclusion_no: int
    exclusion_text: str
    exclusion_type: str = ""  # pre-existing condition / occupation / behaviour / other

@dataclass
class InsuranceProduct:
    """Complete structured representation of one insurance product."""
    product_name: str
    product_code: str = ""  # filing number registered with the insurance regulator
    insurance_type: str = ""
    company_name: str = ""
    waiting_period_days: int = 0
    hesitation_period_days: int = 30
    grace_period_days: int = 60
    # Mutable fields use default_factory so instances never share one list
    coverages: List[CoverageItem] = field(default_factory=list)
    exclusions: List[ExclusionItem] = field(default_factory=list)
    claim_process: List[str] = field(default_factory=list)
    premium_table: List[Dict] = field(default_factory=list)

class InsuranceClauseParser:
    """Regex-based parser that turns raw policy text into an ``InsuranceProduct``.

    The text is cut into sections (coverage / exclusions / claims) by heading
    keywords, and list items inside each section are recognised by their
    enumerator (``1.``, ``（1）``, ``（一）``, ``一、`` ...).
    """

    # Chinese numerals accepted inside list enumerators
    _CN_NUM = "一二三四五六七八九十百"

    # Enumerators shared by every list: "1." / "1、", "(1)" / "（一）", and "一、".
    # The look-behind keeps "一、" from matching in the middle of a sentence
    # (e.g. "情形之一、..."), where it is not a list marker.
    _LIST_MARKER = (
        r'\d+[.、]'
        r'|[(（][\d' + _CN_NUM + r']+[)）]'
        r'|(?<![\u4e00-\u9fa5])[' + _CN_NUM + r']+、'
    )

    # Disease entry: enumerator, name, optional severity suffix ("——重度"),
    # optional full-width parenthetical alias.
    _DISEASE_RE = re.compile(
        r'(?:' + _LIST_MARKER + r'|第[' + _CN_NUM + r']+条)\s*'
        r'([\u4e00-\u9fa5]{2,20}'
        r'(?:[ \t]*[—－-]+[ \t]*[\u4e00-\u9fa5]{1,10})?'
        r'(?:（[^）]+）)?)'
    )

    # Exclusion entry: enumerator followed by the rest of the line.
    _EXCLUSION_RE = re.compile(
        r'(?:' + _LIST_MARKER + r')\s*([^\n]{10,200})'
    )

    # Claim step. Group 1: explicit step heading ("第一步：报案", "步骤1 报案");
    # the separator is consumed so it does not leak into the step text.
    # Group 2: plain numbered line, with the original minimum length kept so
    # numbers inside prose are not picked up as steps.
    _STEP_RE = re.compile(
        r'(?:第[' + _CN_NUM + r']+步|步骤\s*\d+)\s*[：:、.．]?\s*([^\n]{2,100})'
        r'|\d+[.、]\s*([^\n]{5,100})'
    )

    def __init__(self, nlp_model):
        """Store the NLP pipeline and compile the field/section patterns.

        Args:
            nlp_model: spaCy pipeline, kept on ``self.nlp`` for callers that
                want entity annotations. The regex extraction does not use it.
        """
        self.nlp = nlp_model

        self.patterns = {
            "waiting_period": self._period_pattern("等待期"),
            "hesitation_period": self._period_pattern("犹豫期"),
            "grace_period": self._period_pattern("宽限期"),
            "product_code": re.compile(r'(?:备案号|产品代码)[：:]\s*([A-Z0-9\-]+)'),
            "coverage_section": re.compile(r'(?:保障范围|保险责任|重大疾病列表)[\s\S]*?(?=责任免除|免责条款|$)'),
            "exclusion_section": re.compile(r'(?:责任免除|免责条款)[\s\S]*?(?=保险金的申请|理赔流程|$)'),
            "claim_section": re.compile(r'(?:理赔流程|保险金的申请)[\s\S]*?(?=其他事项|$)'),
        }

    @staticmethod
    def _period_pattern(label: str) -> re.Pattern:
        """Build a pattern for a day-count period named ``label``.

        Accepts both phrasings found in policy text: "等待期为90天" (label
        first) and "180日内为等待期" (day count first). Exactly one of the two
        capture groups holds the number.
        """
        return re.compile(
            rf'{label}[为是：:]\s*(\d+)\s*(?:天|日)'
            rf'|(\d+)\s*(?:天|日)内?[为是]{label}'
        )

    def parse(self, text: str) -> InsuranceProduct:
        """Parse policy text into a structured product.

        Args:
            text: Plain text of the policy (e.g. OCR output).

        Returns:
            An ``InsuranceProduct``; fields that are not found keep their
            dataclass defaults.
        """
        # The spaCy pipeline is deliberately not run here: its result was never
        # used by the extraction, which is purely regex-based.
        product = InsuranceProduct(product_name=self._extract_product_name(text))

        code_match = self.patterns["product_code"].search(text)
        if code_match:
            product.product_code = code_match.group(1)

        period_fields = (
            ("waiting_period", "waiting_period_days"),
            ("hesitation_period", "hesitation_period_days"),
            ("grace_period", "grace_period_days"),
        )
        for pattern_key, attr_name in period_fields:
            days = self._search_period_days(pattern_key, text)
            if days is not None:
                setattr(product, attr_name, days)

        product.coverages = self._extract_coverages(text)
        product.exclusions = self._extract_exclusions(text)
        product.claim_process = self._extract_claim_process(text)

        return product

    def _search_period_days(self, key: str, text: str) -> Optional[int]:
        """Return the day count matched by ``self.patterns[key]``, or ``None``."""
        found = self.patterns[key].search(text)
        if not found:
            return None
        # Only the group belonging to the alternative that matched is set.
        return int(next(group for group in found.groups() if group is not None))

    def _section(self, key: str, text: str) -> str:
        """Return the text of the section matched by ``self.patterns[key]`` ('' if absent)."""
        found = self.patterns[key].search(text)
        return found.group() if found else ""

    def _extract_product_name(self, text: str) -> str:
        """Return the first short line among the first five that contains "保险".

        Markdown heading markers (``#``) are removed. Falls back to "未知产品".
        """
        for line in text.strip().split('\n')[:5]:
            if '保险' in line and len(line) < 50:
                return line.strip().replace('#', '').strip()
        return "未知产品"

    def _disease_names(self, text: str) -> List[str]:
        """Return the disease names listed in the coverage section, in order."""
        names = []
        for found in self._DISEASE_RE.finditer(self._section("coverage_section", text)):
            name = found.group(1).strip()
            if len(name) > 2:
                names.append(name)
        return names

    def _extract_coverages(self, text: str) -> List[CoverageItem]:
        """Extract covered diseases; every entry is typed as critical illness."""
        return [
            CoverageItem(
                disease_name=name,
                coverage_type=CoverageType.CRITICAL_ILLNESS
            )
            for name in self._disease_names(text)
        ]

    def _exclusion_texts(self, text: str) -> List[str]:
        """Return the text of each enumerated clause in the exclusion section."""
        clauses = []
        for found in self._EXCLUSION_RE.finditer(self._section("exclusion_section", text)):
            clause = found.group(1).strip()
            if len(clause) > 10:
                clauses.append(clause)
        return clauses

    def _extract_exclusions(self, text: str) -> List[ExclusionItem]:
        """Extract exclusion clauses.

        ``exclusion_no`` is the 1-based position of the regex match, so a
        clause dropped by the length filter leaves a gap in the numbering.
        """
        exclusions = []
        section = self._section("exclusion_section", text)
        for position, found in enumerate(self._EXCLUSION_RE.finditer(section), 1):
            clause = found.group(1).strip()
            if len(clause) > 10:
                exclusions.append(ExclusionItem(
                    exclusion_no=position,
                    exclusion_text=clause
                ))
        return exclusions

    def _extract_claim_process(self, text: str) -> List[str]:
        """Extract the claim-process steps, in document order."""
        steps = []
        for found in self._STEP_RE.finditer(self._section("claim_section", text)):
            titled, numbered = found.groups()
            if titled is not None:
                # Explicit "第N步"/"步骤N" heading: short titles such as "报案" are valid
                step = titled.strip()
                if step:
                    steps.append(step)
            else:
                step = numbered.strip()
                if len(step) > 5:
                    steps.append(step)
        return steps

1.4 解析效果测试

python 复制代码

# Test fixture: simulated policy text (markdown-style headings inside the string)
SAMPLE_CLAUSE = """
# 康宁重大疾病保险条款

备案号：C0001

## 保险责任

### 等待期

本合同生效之日起180日内为等待期。等待期内确诊本合同约定的重大疾病，本公司不承担保险责任，但退还已交保费。

### 保障范围

本合同保障以下120种重大疾病：

1. 恶性肿瘤------重度
2. 急性心肌梗死
3. 脑中风后遗症
4. 重大器官移植术或造血干细胞移植术
5. 冠状动脉搭桥术（或称冠状动脉旁路移植术）
6. 终末期肾病（或称慢性肾功能衰竭尿毒症期）
7. 多个肢体缺失
8. 急性或亚急性重症肝炎
9. 良性脑肿瘤
10. 慢性肝功能衰竭失代偿期

## 责任免除

因下列情形之一导致被保险人发生本合同约定的重大疾病，本公司不承担给付保险金的责任：

（一）投保人对被保险人的故意杀害、故意伤害；
（二）被保险人故意犯罪或抗拒依法采取的刑事强制措施；
（三）被保险人故意自伤、或自本合同成立或者本合同效力恢复之日起2年内自杀，但被保险人自杀时为无民事行为能力人的除外；
（四）被保险人服用、吸食或注射毒品；
（五）被保险人酒后驾驶、无合法有效驾驶证驾驶，或驾驶无合法有效行驶证的机动车；
（六）被保险人感染艾滋病病毒或患艾滋病；
（七）战争、军事冲突、暴乱或武装叛乱；
（八）核爆炸、核辐射或核污染；
（九）遗传性疾病，先天性畸形、变形或染色体异常。

## 保险金的申请

第一步：报案

被保险人确诊重大疾病后，应在10日内通知本公司。

第二步：提交资料

申请人应提交以下资料：
- 保险合同
- 申请人的有效身份证件
- 医院出具的诊断证明书
- 相关的病历资料

第三步：审核

本公司在收到完整的申请资料后，将在5个工作日内完成审核。

第四步：给付

审核通过后，本公司将在10个工作日内将保险金转入申请人指定的银行账户。
"""

# Run the parser on the fixture
parser = InsuranceClauseParser(nlp)
product = parser.parse(SAMPLE_CLAUSE)

print(f"产品名称：{product.product_name}")
print(f"备案号：{product.product_code}")
print(f"等待期：{product.waiting_period_days}天")
print(f"保障疾病数量：{len(product.coverages)}")
print(f"免责条款数量：{len(product.exclusions)}")
print(f"理赔流程步骤：{len(product.claim_process)}")

# Print the first five covered diseases
print("\n前5个保障疾病：")
for coverage in product.coverages[:5]:
    print(f"  - {coverage.disease_name}")

# Print the first three exclusion clauses (each truncated to 50 characters)
print("\n前3条免责条款：")
for exclusion in product.exclusions[:3]:
    print(f"  {exclusion.exclusion_no}. {exclusion.exclusion_text[:50]}...")

第二部分：Neo4j知识图谱构建

2.1 知识图谱Schema设计

python 复制代码

from neo4j import GraphDatabase
import json

class InsuranceKnowledgeGraph:
    """Neo4j-backed knowledge graph of products, diseases and exclusions.

    Nodes: ``Product`` (keyed by ``code``), ``Disease`` (keyed by ``name``),
    ``Exclusion``. Relationships: ``(Product)-[:COVERS]->(Disease)`` and
    ``(Product)-[:EXCLUDES]->(Exclusion)``.

    Can be used as a context manager so the driver is always closed.
    """

    def __init__(self, uri: str, user: str, password: str):
        """Open a driver for the Neo4j instance at ``uri``."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def init_schema(self):
        """Create uniqueness constraints and lookup indexes (best effort).

        A failing statement is reported on stdout and the remaining
        statements are still attempted.
        """
        with self.driver.session() as session:
            # Uniqueness constraints
            constraints = [
                "CREATE CONSTRAINT product_code IF NOT EXISTS FOR (p:Product) REQUIRE p.code IS UNIQUE",
                "CREATE CONSTRAINT disease_name IF NOT EXISTS FOR (d:Disease) REQUIRE d.name IS UNIQUE",
                "CREATE CONSTRAINT company_name IF NOT EXISTS FOR (c:Company) REQUIRE c.name IS UNIQUE",
            ]
            for constraint in constraints:
                try:
                    session.run(constraint)
                except Exception as e:
                    print(f"约束已存在或创建失败: {e}")

            # Lookup indexes
            indexes = [
                "CREATE INDEX product_type_idx IF NOT EXISTS FOR (p:Product) ON (p.type)",
                "CREATE INDEX disease_type_idx IF NOT EXISTS FOR (d:Disease) ON (d.category)",
            ]
            for index in indexes:
                try:
                    session.run(index)
                except Exception as e:
                    print(f"索引已存在或创建失败: {e}")

    def create_product(self, product: InsuranceProduct):
        """Create or update a product with its coverage and exclusion links.

        The import is idempotent and runs in a single transaction, so a
        failure leaves no partially imported product behind.

        Args:
            product: Parsed product; ``product_code`` is the graph key.

        Raises:
            ValueError: If ``product.product_code`` is empty. Every product
                without a code would otherwise be merged into one node.
        """
        if not product.product_code:
            raise ValueError(
                f"product_code is required to import '{product.product_name}'"
            )

        coverage_rows = [
            {
                "disease_name": coverage.disease_name,
                "category": coverage.coverage_type.value,
                "ratio": coverage.coverage_ratio,
                "description": coverage.description,
            }
            for coverage in product.coverages
        ]
        exclusion_rows = [
            {
                "no": exclusion.exclusion_no,
                "text": exclusion.exclusion_text[:200],
                "type": exclusion.exclusion_type,
            }
            for exclusion in product.exclusions
        ]

        with self.driver.session() as session:
            with session.begin_transaction() as tx:
                # MERGE on the unique key only, then SET the other properties.
                # Merging on every property would try to create a second node
                # with the same code as soon as any property changes, which
                # violates the uniqueness constraint.
                tx.run("""
                    MERGE (p:Product {code: $code})
                    SET p.name = $name,
                        p.type = $type,
                        p.waiting_period = $waiting_period,
                        p.hesitation_period = $hesitation_period,
                        p.grace_period = $grace_period
                """, {
                    "code": product.product_code,
                    "name": product.product_name,
                    "type": product.insurance_type,
                    "waiting_period": product.waiting_period_days,
                    "hesitation_period": product.hesitation_period_days,
                    "grace_period": product.grace_period_days,
                })

                # One COVERS relationship per (product, disease). Its
                # properties are updated in place instead of being part of
                # the MERGE key, which would duplicate the relationship
                # whenever the ratio or description changes.
                if coverage_rows:
                    tx.run("""
                        MATCH (p:Product {code: $product_code})
                        UNWIND $rows AS row
                        MERGE (d:Disease {name: row.disease_name})
                        ON CREATE SET d.category = row.category
                        MERGE (p)-[r:COVERS]->(d)
                        SET r.ratio = row.ratio,
                            r.description = row.description
                    """, {"product_code": product.product_code, "rows": coverage_rows})

                if exclusion_rows:
                    tx.run("""
                        MATCH (p:Product {code: $product_code})
                        UNWIND $rows AS row
                        MERGE (e:Exclusion {no: row.no, text: row.text})
                        ON CREATE SET e.type = row.type
                        MERGE (p)-[r:EXCLUDES {reason: row.text}]->(e)
                    """, {"product_code": product.product_code, "rows": exclusion_rows})

                # Explicit commit; leaving the block on an exception rolls back.
                tx.commit()

        print(f"产品 '{product.product_name}' 已导入知识图谱")

    def query_product_coverage(self, product_code: str) -> list:
        """Return the diseases covered by a product, ordered by disease name.

        Each row is a dict with keys ``disease``, ``ratio`` and ``desc``.
        """
        with self.driver.session() as session:
            result = session.run("""
                MATCH (p:Product {code: $code})-[r:COVERS]->(d:Disease)
                RETURN d.name AS disease, r.ratio AS ratio, r.description AS desc
                ORDER BY d.name
            """, {"code": product_code})
            return [dict(record) for record in result]

    def query_disease_products(self, disease_name: str) -> list:
        """Return every product covering a disease, ordered by product name.

        Each row is a dict with keys ``product``, ``code`` and ``ratio``.
        """
        with self.driver.session() as session:
            result = session.run("""
                MATCH (p:Product)-[r:COVERS]->(d:Disease {name: $name})
                RETURN p.name AS product, p.code AS code, r.ratio AS ratio
                ORDER BY p.name
            """, {"name": disease_name})
            return [dict(record) for record in result]

    def query_exclusion_details(self, product_code: str) -> list:
        """Return a product's exclusion clauses, ordered by clause number.

        Each row is a dict with keys ``no``, ``text`` and ``type``.
        """
        with self.driver.session() as session:
            result = session.run("""
                MATCH (p:Product {code: $code})-[r:EXCLUDES]->(e:Exclusion)
                RETURN e.no AS no, e.text AS text, e.type AS type
                ORDER BY e.no
            """, {"code": product_code})
            return [dict(record) for record in result]

2.2 知识图谱查询示例

python 复制代码

# Connect to the knowledge graph
kg = InsuranceKnowledgeGraph(
    uri="bolt://localhost:7687",
    user="neo4j",
    password="your_password"
)

try:
    # Create constraints and indexes
    kg.init_schema()

    # Import the parsed product
    kg.create_product(product)

    # Diseases covered by the product
    coverages = kg.query_product_coverage("C0001")
    print(f"\n产品 C0001 保障 {len(coverages)} 种疾病")
    for c in coverages[:5]:
        print(f"  - {c['disease']} (赔付比例: {c['ratio']})")

    # Every product covering one disease
    products = kg.query_disease_products("恶性肿瘤------重度")
    print(f"\n保障恶性肿瘤的产品数量: {len(products)}")

    # Exclusion clauses of the product
    exclusions = kg.query_exclusion_details("C0001")
    print(f"\n免责条款数量: {len(exclusions)}")
finally:
    # Release the driver's connections even when a query above fails
    kg.close()

第三部分：保险FAQ语义匹配系统

3.1 FAQ向量化

python 复制代码

from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Tuple

class InsuranceFAQMatcher:
    """Semantic FAQ retrieval: embeds questions and ranks them by cosine similarity."""

    def __init__(self, model_name: str = "shibing624/text2vec-base-chinese"):
        """Load the sentence-embedding model named ``model_name``."""
        self.model = SentenceTransformer(model_name)
        self.faqs = []
        self.faq_embeddings = None

    def load_faqs(self, faqs: List[dict]):
        """Store the FAQ entries and embed their questions.

        Args:
            faqs: Entries shaped like
                ``{"question": "...", "answer": "...", "category": "..."}``;
                ``category`` is optional.
        """
        self.faqs = faqs
        questions = [faq["question"] for faq in faqs]
        self.faq_embeddings = self.model.encode(questions, convert_to_numpy=True)
        print(f"已加载 {len(faqs)} 条FAQ，向量维度: {self.faq_embeddings.shape}")

    def search(self, query: str, top_k: int = 3, threshold: float = 0.75) -> List[dict]:
        """Return up to ``top_k`` FAQs whose similarity to ``query`` is >= ``threshold``.

        Results are ordered by descending score. Returns an empty list when no
        FAQ has been loaded.
        """
        return self.batch_search([query], top_k, threshold)[0]

    def batch_search(
        self,
        queries: List[str],
        top_k: int = 3,
        threshold: float = 0.75,
    ) -> List[List[dict]]:
        """Run ``search`` for several queries, encoding them in a single call.

        Returns:
            One result list per query, in the order of ``queries``.
        """
        queries = list(queries)
        if not queries:
            return []
        if self.faq_embeddings is None or not self.faqs or top_k <= 0:
            return [[] for _ in queries]

        query_matrix = np.atleast_2d(
            self.model.encode(queries, convert_to_numpy=True)
        )
        faq_matrix = np.asarray(self.faq_embeddings)

        # Cosine similarity for every (query, FAQ) pair
        norms = np.outer(
            np.linalg.norm(query_matrix, axis=1),
            np.linalg.norm(faq_matrix, axis=1),
        )
        # A zero vector has no direction: score it 0 instead of dividing by zero
        norms[norms == 0] = 1.0
        similarities = (query_matrix @ faq_matrix.T) / norms

        return [self._top_matches(row, top_k, threshold) for row in similarities]

    def _top_matches(self, scores, top_k: int, threshold: float) -> List[dict]:
        """Turn one row of similarity scores into the ranked, filtered result dicts."""
        ranked = np.argsort(scores)[::-1][:top_k]
        return [
            {
                "question": self.faqs[idx]["question"],
                "answer": self.faqs[idx]["answer"],
                "category": self.faqs[idx].get("category", ""),
                "score": float(scores[idx]),
            }
            for idx in ranked
            if scores[idx] >= threshold
        ]

3.2 FAQ数据集示例

python 复制代码

# Sample FAQ entries. Each dict carries the "question" that gets embedded,
# the "answer" returned to the user, and a "category" tag.
INSURANCE_FAQ_DATASET = [
    {
        "question": "甲状腺结节能买重疾险吗",
        "answer": "甲状腺结节1-3级（TI-RADS分级）通常可正常承保，4级及以上需提交近6个月内的超声检查报告，由核保人员评估后决定是否承保及是否附加除外责任。具体以各保险公司核保结论为准。",
        "category": "核保"
    },
    {
        "question": "百万医疗险的免赔额是怎么计算的",
        "answer": "百万医疗险通常设有1万元免赔额，即当年累计医疗费用超过1万元的部分才由保险公司赔付。部分产品设有0免赔额选项，但对应保费更高。免赔额按自然年度计算，次年重新起算。",
        "category": "产品"
    },
    {
        "question": "重疾险买消费型还是返还型",
        "answer": "消费型重疾险保费较低，保障期内未出险则保费不退还；返还型重疾险保费较高，保障期满或身故可返还保费或保额。从保障效率角度，消费型性价比更高；从资金规划角度，返还型适合不愿意'白交钱'的投保人。建议优先保障额度充足，再考虑是否返还。",
        "category": "产品"
    },
    {
        "question": "保险买了两年想退保，能退多少",
        "answer": "退保时退还的是保单的现金价值，不是已交保费。现金价值在保单前几年通常远低于已交保费，以年交保费1万元为例，第2年退保可能只能拿回2000-4000元左右。具体现金价值可在保单中查看现金价值表。建议在犹豫期内（通常15天）退保可全额退还保费。",
        "category": "退保"
    },
    {
        "question": "带病投保被拒赔了怎么办",
        "answer": "带病投保被拒赔，首先需确认投保时是否如实告知。如果投保时未如实告知既往症，保险公司有权拒赔并解除合同。如果已如实告知且保险公司已承保，则拒赔可向保险公司申诉，或向银保监会消费者权益保护局投诉。保留好投保时的告知记录和核保结论是关键。",
        "category": "理赔"
    },
    {
        "question": "短期医疗险和长期医疗险怎么选",
        "answer": "短期医疗险保障期限通常为1年，需每年续保，存在停售无法续保的风险，但保费较低。长期医疗险保障期限可达10年或20年，保证续保期间内即使产品停售也可续保，但保费相对较高。建议优先选择保证续保6年以上的产品，避免因健康变化或产品停售失去保障。",
        "category": "产品"
    },
    {
        "question": "买了多份保险可以重复理赔吗",
        "answer": "分情况：重疾险、寿险、意外身故/伤残属于定额给付型，多份保单可以重复理赔。医疗险属于报销型，实际医疗费用为限，多份医疗险不能重复报销超出实际花费的部分。意外医疗也属于报销型。简单记：给付型可叠加，报销型不叠加。",
        "category": "理赔"
    },
    {
        "question": "30岁买重疾险保额50万大概多少钱",
        "answer": "以消费型重疾险为例，30岁男性，50万保额，30年缴费，保至70岁，年保费大约在3000-5000元之间。女性略低，约2500-4500元。具体保费受保障疾病数量、等待期、保险公司定价策略等因素影响。返还型重疾险保费约为消费型的2-3倍。",
        "category": "费率"
    },
]

# Build the FAQ matcher and embed the sample dataset
faq_matcher = InsuranceFAQMatcher()
faq_matcher.load_faqs(INSURANCE_FAQ_DATASET)

# Paraphrased user questions used to exercise the search
test_queries = [
    "甲状腺有问题还能买保险吗",
    "医疗险的免赔额什么意思",
    "退保能退多少钱",
    "多份保险怎么赔",
]

# Print the two best matches per query (only those above the default threshold)
for query in test_queries:
    results = faq_matcher.search(query, top_k=2)
    print(f"\n查询: {query}")
    for r in results:
        print(f"  匹配: {r['question']} (相似度: {r['score']:.3f})")

第四部分：理赔案例结构化与Schema.org标注

4.1 理赔案例数据模型

python 复制代码

from datetime import date

@dataclass
class ClaimRecord:
    """Structured, anonymised record of one settled claim."""
    case_id: str
    age_range: str              # age bracket of the policyholder
    gender: str                 # gender
    product_name: str           # product the claim was made under
    coverage_amount: float      # sum insured, in yuan
    claim_type: str             # claim category
    diagnosis: str              # diagnosis / cause of the claim
    claim_amount: float         # amount paid out, in yuan
    claim_duration_days: int    # days taken to settle the claim
    claim_date: Optional[date] = None  # settlement date, if known

    def to_searchable_text(self) -> str:
        """Render the record as one Chinese sentence for text retrieval.

        Amounts are shown in units of 10,000 yuan (万元): the sum insured
        rounded to an integer, the payout to one decimal place.
        """
        return (
            f"{self.age_range}{self.gender}，投保{self.product_name}"
            f"（保额{self.coverage_amount / 10000:.0f}万元），"
            f"{self.claim_type}出险（{self.diagnosis}），"
            f"理赔金额{self.claim_amount / 10000:.1f}万元，"
            f"理赔时效{self.claim_duration_days}天。"
        )

    def to_schema_org(self, author_name: str = "XX保险公司") -> dict:
        """Build a schema.org ``Review`` JSON-LD dict for this claim.

        Args:
            author_name: Organisation named as the author of the review.

        Returns:
            The JSON-LD dict. ``datePublished`` is only present when
            ``claim_date`` is set, because an empty string is not a valid date.
        """
        schema = {
            "@context": "https://schema.org",
            "@type": "Review",
            "itemReviewed": {
                # schema.org defines no "InsuranceProduct" type;
                # FinancialProduct is the closest existing one.
                "@type": "FinancialProduct",
                "name": self.product_name
            },
            "reviewBody": self.to_searchable_text(),
            "author": {
                "@type": "Organization",
                "name": author_name
            },
        }
        if self.claim_date:
            schema["datePublished"] = self.claim_date.isoformat()
        return schema


# Sample claim records used by the page-generation examples
SAMPLE_CLAIMS = [
    ClaimRecord(
        case_id="CLM-2024-001",
        age_range="35-40岁",
        gender="男",
        product_name="康宁重大疾病保险",
        coverage_amount=500000,
        claim_type="重大疾病",
        diagnosis="甲状腺乳头状癌",
        claim_amount=500000,
        claim_duration_days=7,
        claim_date=date(2024, 3, 15)
    ),
    ClaimRecord(
        case_id="CLM-2024-002",
        age_range="45-50岁",
        gender="女",
        product_name="康宁重大疾病保险",
        coverage_amount=300000,
        claim_type="重大疾病",
        diagnosis="乳腺浸润性导管癌",
        claim_amount=300000,
        claim_duration_days=5,
        claim_date=date(2024, 5, 22)
    ),
    ClaimRecord(
        case_id="CLM-2024-003",
        age_range="28-35岁",
        gender="男",
        product_name="安心百万医疗险",
        coverage_amount=2000000,
        claim_type="医疗",
        diagnosis="急性阑尾炎住院手术",
        claim_amount=28500,
        claim_duration_days=12,
        claim_date=date(2024, 7, 8)
    ),
]

4.2 产品页面Schema.org标注生成

python 复制代码

def generate_product_schema(
    product: InsuranceProduct,
    claims: List[ClaimRecord],
    rating_value: Optional[float] = None,
) -> dict:
    """Build the schema.org JSON-LD dict for a product page.

    Args:
        product: Parsed product data.
        claims: Claim records for the product; their count is published as
            ``reviewCount`` when a rating is supplied.
        rating_value: Real aggregate rating on a 0-5 scale. When omitted, no
            ``aggregateRating`` is emitted. A rating is never invented,
            because publishing one that is not backed by data misleads both
            users and search engines.

    Returns:
        A JSON-serialisable dict describing the product.
    """
    # Skip empty parts so the description never starts with a dangling "，"
    summary_parts = [
        product.insurance_type,
        f"保障{len(product.coverages)}种疾病/责任",
        f"等待期{product.waiting_period_days}天",
        f"犹豫期{product.hesitation_period_days}天",
    ]

    schema = {
        "@context": "https://schema.org",
        # schema.org defines no "InsuranceProduct" type; FinancialProduct is
        # the closest existing one. It is a Service, so the filing number is
        # published through the generic "identifier" property.
        "@type": "FinancialProduct",
        "name": product.product_name,
        "identifier": product.product_code,
        "description": "，".join(part for part in summary_parts if part),
        "provider": {
            "@type": "Organization",
            "name": product.company_name or "XX保险公司"
        },
        "offers": {
            "@type": "AggregateOffer",
            "availability": "https://schema.org/InStock"
        }
    }

    if rating_value is not None and claims:
        schema["aggregateRating"] = {
            "@type": "AggregateRating",
            "ratingValue": str(rating_value),
            "reviewCount": str(len(claims)),
            "bestRating": "5"
        }

    return schema


def generate_product_page_with_schema(
    product: InsuranceProduct,
    claims: List[ClaimRecord]
) -> str:
    """Render the product page HTML with embedded JSON-LD structured data.

    Every value taken from the parsed policy or the claim records is
    HTML-escaped, since that text originates from external documents.

    Args:
        product: Parsed product data.
        claims: Claim records listed in the "理赔案例" section.

    Returns:
        The complete HTML document as a string.
    """

    import json
    from html import escape

    # "</" is written as "<\/" (a valid JSON escape) so that a "</script>"
    # inside any string value cannot terminate the script element early.
    schema_json = json.dumps(
        generate_product_schema(product, claims),
        ensure_ascii=False,
        indent=2
    ).replace("</", "<\\/")

    product_name = escape(product.product_name)
    product_code = escape(product.product_code)

    # Coverage list items
    coverage_items = "\n".join(
        f'<li itemprop="itemListElement">{escape(c.disease_name)}</li>'
        for c in product.coverages
    )

    # Exclusion list items (truncated before escaping so no entity is cut in half)
    exclusion_items = "\n".join(
        f'<li>{escape(e.exclusion_text[:100])}</li>'
        for e in product.exclusions
    )

    # Claim-case list items
    claim_items = "\n".join(
        f'<li>{escape(claim.to_searchable_text())}</li>'
        for claim in claims
    )

    # itemtype uses FinancialProduct because schema.org defines no
    # "InsuranceProduct" type; the filing number is exposed as "identifier".
    page = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <title>{product_name} - 条款详情</title>
    <script type="application/ld+json">
{schema_json}
    </script>
</head>
<body>
    <article itemscope itemtype="https://schema.org/FinancialProduct">
        <h1 itemprop="name">{product_name}</h1>
        <p>备案号：<span itemprop="identifier">{product_code}</span></p>
        <p>等待期：{product.waiting_period_days}天 |
           犹豫期：{product.hesitation_period_days}天 |
           宽限期：{product.grace_period_days}天</p>

        <section>
            <h2>保障范围</h2>
            <ul>
                {coverage_items}
            </ul>
        </section>

        <section>
            <h2>责任免除</h2>
            <ul>
                {exclusion_items}
            </ul>
        </section>

        <section>
            <h2>理赔案例</h2>
            <ul>
                {claim_items}
            </ul>
        </section>
    </article>
</body>
</html>"""

    return page

第五部分：GEO效果追踪

5.1 AI引用监控

python 复制代码

import time
from datetime import datetime, timedelta

class GEOTracker:
    """Records AI-search results in a JSON file and summarises how often products are cited."""

    def __init__(self, storage_path: str = "geo_tracking.json"):
        """Load the existing history from ``storage_path`` (empty if the file is absent)."""
        self.storage_path = storage_path
        self.records = self._load_records()

    def _load_records(self) -> list:
        """Return the stored records, or an empty list when no file exists yet."""
        import os
        if not os.path.exists(self.storage_path):
            return []
        with open(self.storage_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _save_records(self):
        """Persist all records.

        The data is written to a sibling temporary file which then replaces
        the real one, so an interrupted write cannot truncate the history.
        """
        import os
        tmp_path = f"{self.storage_path}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(self.records, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, self.storage_path)

    def track_query(self, query: str, ai_response: str, products_found: list[str]):
        """Append one observation and save the history.

        Args:
            query: The question sent to the AI search engine.
            ai_response: The raw answer; only its length is stored.
            products_found: Product names detected in the answer.
        """
        record = {
            "timestamp": datetime.now().isoformat(),
            "query": query,
            "products_found": products_found,
            "response_length": len(ai_response),
            "product_mentioned": len(products_found) > 0
        }
        self.records.append(record)
        self._save_records()

    def get_summary(self, days: int = 30) -> dict:
        """Summarise the records of the last ``days`` days.

        Returns:
            Dict with ``period_days``, ``total_queries``,
            ``queries_with_product``, ``citation_rate`` (0 when there are no
            queries) and ``top_products`` (up to ten ``(name, count)`` pairs,
            most cited first).
        """
        from collections import Counter

        cutoff = datetime.now() - timedelta(days=days)
        recent = [
            r for r in self.records
            if datetime.fromisoformat(r["timestamp"]) > cutoff
        ]

        total_queries = len(recent)
        # .get keeps records written without this field from raising KeyError,
        # matching the tolerant read of "products_found" below
        queries_with_product = sum(1 for r in recent if r.get("product_mentioned"))

        product_counts = Counter(
            product
            for r in recent
            for product in r.get("products_found", [])
        )

        return {
            "period_days": days,
            "total_queries": total_queries,
            "queries_with_product": queries_with_product,
            "citation_rate": queries_with_product / total_queries if total_queries else 0,
            "top_products": product_counts.most_common(10)
        }

5.2 效果验证脚本

python 复制代码

def verify_geo_effectiveness(product_code: str, test_queries: list[str], search_fn=None) -> dict:
    """Measure how often a product is cited across a set of AI-search queries.

    Args:
        product_code: Code of the product being checked (echoed in the report).
        test_queries: Queries to evaluate.
        search_fn: Optional callable ``search_fn(query) -> bool`` that runs the
            query against a real AI search API and reports whether the product
            was cited. When omitted, every query counts as "not mentioned",
            which reproduces the placeholder behaviour.

    Returns:
        Dict with ``product_code``, ``total_queries``, ``mentioned_count``,
        ``queries`` (one ``{"query", "mentioned"}`` entry per query) and
        ``citation_rate`` (0 when there are no queries).
    """
    per_query = [
        {
            "query": query,
            "mentioned": bool(search_fn(query)) if search_fn is not None else False,
        }
        for query in test_queries
    ]
    mentioned_count = sum(1 for entry in per_query if entry["mentioned"])

    results = {
        "product_code": product_code,
        "total_queries": len(test_queries),
        "mentioned_count": mentioned_count,
        "queries": per_query
    }
    results["citation_rate"] = (
        mentioned_count / results["total_queries"]
        if results["total_queries"] else 0
    )

    return results


# Queries used for the verification run
TEST_QUERIES = [
    "50岁买什么重疾险",
    "百万医疗险推荐",
    "甲状腺结节能买保险吗",
    "消费型重疾险哪个好",
    "XX重疾险保障范围",
    "重疾险等待期多少天",
    "保险理赔流程",
    "带病投保怎么处理",
]

# Run the verification for product C0001 and print the report
verification = verify_geo_effectiveness("C0001", TEST_QUERIES)
print(f"\nGEO效果验证报告")
print(f"产品: {verification['product_code']}")
print(f"测试查询: {verification['total_queries']}条")
print(f"被引用次数: {verification['mentioned_count']}次")
print(f"引用率: {verification['citation_rate']:.1%}")

总结

本文介绍了保险行业GEO的完整技术方案，覆盖五个核心环节：

条款NLP解析：使用spaCy自定义实体识别，从非结构化条款文本中提取保障范围、免责条款、等待期等关键字段
知识图谱构建：使用Neo4j构建保险产品-疾病-免责条款的关系图谱，支持多维度查询
FAQ语义匹配：使用SentenceTransformer实现FAQ的语义检索，支持自然语言查询
Schema.org标注：生成符合Schema.org标准的结构化数据，嵌入产品页面HTML
GEO效果追踪：记录AI搜索结果中的产品引用情况，统计引用率和被引用最多的产品

保险行业的GEO核心挑战是产品信息的非结构化。PDF条款、图片格式的保障范围、JavaScript渲染的产品对比表格——这些内容呈现方式在浏览器中渲染得很好，但对于只能读取DOM和纯文本的AI爬虫来说，等同于空白页面。

本文方案的技术价值在于：将条款解析从人工阅读PDF的流程，转变为NLP自动提取+知识图谱关联的自动化流程。当保险产品的保障范围、免责条款、理赔案例都能被机器精确读取时，AI搜索引擎在回答保险相关问题时，才有足够的数据支撑来推荐你的产品。