1. Tokenization
Basic Definition
Tokenization is the process of splitting a continuous text sequence into meaningful lexical units. It is a foundational step in natural language processing.
Implementation Principles
Maximum Forward Matching
- Scan the text from left to right, always matching the longest word available in the dictionary
- Time complexity: O(n·m), where m is the maximum word length tried at each position (O(n²) in the worst case when m approaches n)
Maximum Backward Matching
- Scan the text from right to left, again matching the longest dictionary word
- For Chinese, this usually performs somewhat better than forward matching
Bidirectional Matching
- Run both forward and backward matching and keep the result with fewer segments
- A common and effective heuristic for resolving segmentation ambiguity (see the sketch after this list)
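Before the full examples under Code Example below, here is a minimal sketch of the backward and bidirectional variants. It reuses the dictionary-set interface of the forward_max_match function shown later; the tie-breaking rule (keep the result with fewer segments, otherwise prefer the backward result) is one common convention, not the only one.
python
# Backward maximum matching: scan right to left, trying the longest dictionary word first.
def backward_max_match(text, word_dict, max_len=5):
    result = []
    index = len(text)
    while index > 0:
        matched = False
        for word_len in range(min(max_len, index), 0, -1):
            word = text[index - word_len:index]
            if word in word_dict:
                result.append(word)
                index -= word_len
                matched = True
                break
        if not matched:
            # Fall back to a single character
            result.append(text[index - 1])
            index -= 1
    return list(reversed(result))

# Bidirectional matching: run both directions and keep the result with fewer segments.
def bidirectional_match(text, word_dict, max_len=5):
    fwd = forward_max_match(text, word_dict, max_len)  # defined in the example below
    bwd = backward_max_match(text, word_dict, max_len)
    return bwd if len(bwd) <= len(fwd) else fwd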
Code Example
python
import jieba
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer

# Download NLTK data (needed on first run)
nltk.download('punkt')

def chinese_tokenization_demo():
    """Chinese word segmentation demo."""
    text = "自然语言处理是人工智能的重要方向"
    # Precise mode
    seg_list = jieba.cut(text, cut_all=False)
    print("Precise mode:", "/".join(seg_list))
    # Full mode
    seg_list = jieba.cut(text, cut_all=True)
    print("Full mode:", "/".join(seg_list))
    # Search-engine mode
    seg_list = jieba.cut_for_search(text)
    print("Search-engine mode:", "/".join(seg_list))

def english_tokenization_demo():
    """English tokenization demo."""
    text = "Natural Language Processing is an important direction of AI."
    # Basic tokenization
    tokens = word_tokenize(text)
    print("English tokens:", tokens)
    # Regex-based tokenization with NLTK's RegexpTokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    print("Regex tokens:", tokens)

def forward_max_match(text, word_dict, max_len=5):
    """Maximum forward matching segmentation."""
    result = []
    index = 0
    text_len = len(text)
    while index < text_len:
        matched = False
        # Try the longest candidate first, then shrink
        for word_len in range(min(max_len, text_len - index), 0, -1):
            word = text[index:index + word_len]
            if word in word_dict:
                result.append(word)
                index += word_len
                matched = True
                break
        if not matched:
            # Fall back to a single character
            result.append(text[index])
            index += 1
    return result

if __name__ == "__main__":
    chinese_tokenization_demo()
    english_tokenization_demo()
    # Custom-dictionary demo
    word_dict = {"自然语言", "处理", "人工智能", "重要", "方向"}
    text = "自然语言处理是人工智能的重要方向"
    print("Forward matching:", forward_max_match(text, word_dict))
Use Cases
- Text preprocessing and cleaning
- Building search-engine indexes
- Preprocessing for machine translation
- Sentiment analysis and text classification
Pros and Cons
Pros:
- Improves the efficiency of downstream text processing
- Makes subsequent feature extraction easier
- Applicable to multiple languages
Cons:
- Segmentation ambiguity
- Difficulty recognizing new (out-of-vocabulary) words
- Heavy dependence on dictionary quality
2. Part-of-Speech Tagging
Basic Definition
Part-of-speech tagging assigns a grammatical category label, such as noun, verb, or adjective, to each token produced by segmentation.
Implementation Principles
- Rule-based methods: hand-written grammar rules
- Statistical methods: Hidden Markov Models (HMM), Conditional Random Fields (CRF) (see the Viterbi sketch after this list)
- Deep-learning methods: BiLSTM-CRF, Transformer
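As a minimal illustration of the statistical approach, the sketch below decodes a toy HMM with the Viterbi algorithm. The states and probabilities are made-up assumptions for demonstration; a real tagger estimates them from an annotated corpus.
python
import math

# Toy HMM parameters (illustrative assumptions, not trained values)
states = ["DET", "NOUN", "VERB"]
start_p = {"DET": 0.4, "NOUN": 0.4, "VERB": 0.2}
trans_p = {
    "DET":  {"DET": 0.1, "NOUN": 0.8, "VERB": 0.1},
    "NOUN": {"DET": 0.2, "NOUN": 0.2, "VERB": 0.6},
    "VERB": {"DET": 0.5, "NOUN": 0.4, "VERB": 0.1},
}
emit_p = {
    "DET":  {"the": 0.9, "a": 0.1},
    "NOUN": {"dog": 0.4, "cat": 0.4, "runs": 0.2},
    "VERB": {"runs": 0.7, "dog": 0.1, "cat": 0.2},
}

def viterbi(words):
    """Return the most probable tag sequence for words under the toy HMM."""
    unk = 1e-6  # smoothing for unseen emissions
    # V[t][s]: best log-probability of any tag path ending in state s at position t
    V = [{s: math.log(start_p[s]) + math.log(emit_p[s].get(words[0], unk)) for s in states}]
    back = [{}]
    for t in range(1, len(words)):
        V.append({})
        back.append({})
        for s in states:
            prob, prev = max(
                (V[t - 1][p] + math.log(trans_p[p][s]) + math.log(emit_p[s].get(words[t], unk)), p)
                for p in states
            )
            V[t][s] = prob
            back[t][s] = prev
    # Trace back from the best final state
    tags = [max(V[-1], key=V[-1].get)]
    for t in range(len(words) - 1, 0, -1):
        tags.append(back[t][tags[-1]])
    return list(reversed(tags))

if __name__ == "__main__":
    print(viterbi(["the", "dog", "runs"]))  # expected: ['DET', 'NOUN', 'VERB']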
Code Example
python
import jieba.posseg as pseg
import nltk
from nltk import pos_tag
from collections import Counter

# Download NLTK data (needed on first run)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

def chinese_pos_tagging():
    """Chinese POS tagging with jieba."""
    text = "自然语言处理是人工智能领域的重要研究方向"
    # POS tagging with jieba
    words = pseg.cut(text)
    for word, flag in words:
        print(f"{word}({flag})", end=" ")
    print()
    # Count tag frequencies (pseg.cut returns a generator, so cut again)
    words = pseg.cut(text)
    pos_counter = Counter()
    for word, flag in words:
        pos_counter[flag] += 1
    print("Tag counts:", pos_counter.most_common())

def english_pos_tagging():
    """English POS tagging with NLTK."""
    text = "Natural Language Processing is an important research direction in AI field."
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Tag
    pos_tags = pos_tag(tokens)
    print("English POS tags:", pos_tags)
    # Use the universal tagset
    pos_tags_universal = pos_tag(tokens, tagset='universal')
    print("Universal tagset:", pos_tags_universal)

def advanced_pos_analysis():
    """More detailed POS analysis: extract and count nouns and verbs."""
    text = """
    自然语言处理技术近年来取得了显著进展。
    深度学习模型在各项任务中表现出色。
    研究人员正在探索更高效的算法。
    """
    words = pseg.cut(text)
    nouns = []
    verbs = []
    for word, flag in words:
        if flag.startswith('n'):    # nouns
            nouns.append(word)
        elif flag.startswith('v'):  # verbs
            verbs.append(word)
    print("Nouns:", Counter(nouns).most_common(5))
    print("Verbs:", Counter(verbs).most_common(5))

def analyze_text_pos_pattern(text):
    """Analyze adjacent POS-tag patterns in a text."""
    words = pseg.cut(text)
    pos_sequence = [(word, flag) for word, flag in words]
    # Collect bigram patterns of adjacent POS tags
    patterns = []
    for i in range(len(pos_sequence) - 1):
        pattern = f"{pos_sequence[i][1]}-{pos_sequence[i+1][1]}"
        patterns.append(pattern)
    pattern_counter = Counter(patterns)
    return pattern_counter.most_common(3)

if __name__ == "__main__":
    chinese_pos_tagging()
    english_pos_tagging()
    advanced_pos_analysis()
    sample_text = "美丽的姑娘在花园里唱歌"
    patterns = analyze_text_pos_pattern(sample_text)
    print("Most common POS patterns:", patterns)
Use Cases
- Syntactic analysis and parsing
- Information extraction and relation extraction
- Text generation and machine translation
- Semantic role labeling
Pros and Cons
Pros:
- Provides grammatical structure information
- Supports deeper text understanding
- Improves information retrieval
Cons:
- POS ambiguity
- Depends on segmentation accuracy
- Limited adaptability to new domains
3. Named Entity Recognition
Basic Definition
Named entity recognition (NER) identifies entities of specific types in text, such as person names, place names, and organization names.
Implementation Principles
- Rule-based methods: pattern matching and dictionary lookup
- Machine-learning methods: CRF, SVM
- Deep-learning methods: BiLSTM-CRF, BERT, span-based approaches (these sequence-labeling methods typically emit BIO tags; see the decoding sketch after this list)
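The CRF and neural methods above are usually framed as sequence labeling: the model emits one BIO tag per token, and entities are recovered by decoding contiguous spans. A minimal decoder is sketched below; the example tokens and tags are assumptions standing in for real model output.
python
# Decode BIO-tagged tokens into (entity_text, entity_type) spans.
def decode_bio(tokens, tags):
    entities = []
    current_tokens, current_type = [], None
    for token, tag in zip(tokens, tags):
        if tag.startswith("B-"):
            if current_tokens:  # close the previous span
                entities.append(("".join(current_tokens), current_type))
            current_tokens, current_type = [token], tag[2:]
        elif tag.startswith("I-") and current_type == tag[2:]:
            current_tokens.append(token)  # continue the current span
        else:
            if current_tokens:  # "O" or an inconsistent tag ends the span
                entities.append(("".join(current_tokens), current_type))
            current_tokens, current_type = [], None
    if current_tokens:
        entities.append(("".join(current_tokens), current_type))
    return entities

if __name__ == "__main__":
    # Illustrative tags, not real model output
    tokens = ["马", "云", "在", "阿", "里", "巴", "巴", "工", "作"]
    tags = ["B-PER", "I-PER", "O", "B-ORG", "I-ORG", "I-ORG", "I-ORG", "O", "O"]
    print(decode_bio(tokens, tags))  # [('马云', 'PER'), ('阿里巴巴', 'ORG')]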
Code Example
python
import jieba
import jieba.posseg as pseg
import re
from collections import defaultdict

def setup_custom_dict():
    """Write a custom user dictionary to improve entity recognition.

    jieba's user-dictionary format is: word [frequency] [POS tag], separated by spaces.
    """
    custom_entities = [
        "阿里巴巴 3 nt",  # organizations
        "腾讯 3 nt",
        "百度 3 nt",
        "清华大学 3 nt",
        "北京市 3 ns",   # locations
        "上海市 3 ns",
        "人工智能 3 n",   # general term
    ]
    with open('custom_dict.txt', 'w', encoding='utf-8') as f:
        for entity in custom_entities:
            f.write(entity + '\n')
    jieba.load_userdict('custom_dict.txt')

def rule_based_ner(text):
    """Rule-based named entity recognition."""
    entities = {
        'PERSON': [],
        'LOCATION': [],
        'ORGANIZATION': [],
        'TECHNOLOGY': []
    }
    # First pass: use POS tags from jieba
    words = pseg.cut(text)
    for word, flag in words:
        if flag == 'nr':            # person names
            entities['PERSON'].append(word)
        elif flag == 'ns':          # place names
            entities['LOCATION'].append(word)
        elif flag in ('nt', 'nz'):  # organizations and other proper nouns
            entities['ORGANIZATION'].append(word)
    # Second pass: regular expressions for technical terms
    tech_pattern = r'[A-Za-z]*[Ll]earning|[A-Za-z]*[Nn]et|Transformer|BERT|GPT'
    tech_matches = re.findall(tech_pattern, text)
    entities['TECHNOLOGY'].extend(tech_matches)
    return entities

def build_knowledge_graph(entities_list):
    """Build a simple knowledge graph from per-text entity dictionaries."""
    kg = defaultdict(list)
    for entities in entities_list:
        # Relations are derived from simple co-occurrence within the same text
        persons = entities.get('PERSON', [])
        orgs = entities.get('ORGANIZATION', [])
        locations = entities.get('LOCATION', [])
        # Person-organization relations
        for person in persons:
            for org in orgs:
                kg[person].append(f'works_at {org}')
        # Organization-location relations
        for org in orgs:
            for loc in locations:
                kg[org].append(f'located_in {loc}')
    return kg

def intelligent_qa_system(kg, question):
    """A toy question-answering system over the knowledge graph."""
    # Naive keyword matching over the question text
    if "工作" in question or "在哪" in question:
        for entity, relations in kg.items():
            for relation in relations:
                if "works_at" in relation and entity in question:
                    return f"{entity} {relation}"
    elif "位于" in question or "地点" in question:
        for entity, relations in kg.items():
            for relation in relations:
                if "located_in" in relation and entity in question:
                    return f"{entity} {relation}"
    return "Sorry, I cannot answer that question yet"

def advanced_ner_with_spacy():
    """Advanced NER with spaCy (requires spacy and the zh_core_web_sm model)."""
    try:
        import spacy
        # Load the Chinese model (install with: python -m spacy download zh_core_web_sm)
        nlp = spacy.load("zh_core_web_sm")
        text = "马云在阿里巴巴杭州总部会见了来自腾讯的张小龙"
        doc = nlp(text)
        print("spaCy NER results:")
        for ent in doc.ents:
            print(f"{ent.text} - {ent.label_}")
    except ImportError:
        print("Please install spaCy first: pip install spacy")
    except OSError:
        print("Please download the Chinese model first: python -m spacy download zh_core_web_sm")

def demo_complete_pipeline():
    """End-to-end demo: NER, knowledge-graph construction, and question answering."""
    texts = [
        "马云在阿里巴巴杭州总部会见了腾讯的张小龙",
        "李彦宏在北京百度大厦宣布了新的人工智能计划",
        "清华大学的人工智能研究院在北京市开展了深度学习研究"
    ]
    setup_custom_dict()
    all_entities = []
    print("=== NER results ===")
    for text in texts:
        print(f"\nText: {text}")
        entities = rule_based_ner(text)
        print("Entities found:", entities)
        all_entities.append(entities)
    # Build the knowledge graph
    print("\n=== Knowledge graph ===")
    knowledge_graph = build_knowledge_graph(all_entities)
    for entity, relations in knowledge_graph.items():
        print(f"{entity}: {relations}")
    # Question-answering demo
    print("\n=== Question answering ===")
    questions = [
        "马云在哪工作?",
        "阿里巴巴位于哪里?",
        "百度在哪个城市?"
    ]
    for question in questions:
        answer = intelligent_qa_system(knowledge_graph, question)
        print(f"Q: {question}")
        print(f"A: {answer}")

if __name__ == "__main__":
    demo_complete_pipeline()
    advanced_ner_with_spacy()
Use Cases
- Knowledge graph construction: extracting entities and relations from text
- Question-answering systems: answering questions over entity relations
- Information extraction: pulling structured information out of documents
- Recommender systems: personalized recommendations based on entity relations
Building a Knowledge Base with a Knowledge Graph and a Large Language Model
python
import json
from typing import Dict, List, Any

class KnowledgeBase:
    """Knowledge-base system: graph-style storage meant to be paired with a large language model."""

    def __init__(self):
        self.entities = {}
        self.relations = []
        self.documents = []

    def add_entity(self, entity_id: str, entity_type: str, attributes: Dict):
        """Add an entity to the knowledge base."""
        self.entities[entity_id] = {
            'type': entity_type,
            'attributes': attributes,
            'relations': []
        }

    def add_relation(self, source_id: str, relation_type: str, target_id: str):
        """Add a relation between two entities."""
        relation = {
            'source': source_id,
            'relation': relation_type,
            'target': target_id
        }
        self.relations.append(relation)
        # Keep the source entity's relation list in sync
        if source_id in self.entities:
            self.entities[source_id]['relations'].append(relation)

    def query_entity(self, entity_id: str) -> Dict[str, Any]:
        """Look up an entity's record."""
        return self.entities.get(entity_id, {})

    def find_related_entities(self, entity_id: str, relation_type: str = None) -> List[Dict]:
        """Find entities related to the given entity, optionally filtered by relation type."""
        related = []
        for relation in self.relations:
            if relation['source'] == entity_id:
                if relation_type is None or relation['relation'] == relation_type:
                    target_info = self.query_entity(relation['target'])
                    related.append({
                        'relation': relation['relation'],
                        'target': relation['target'],
                        'target_info': target_info
                    })
        return related

    def to_json(self) -> str:
        """Export the knowledge base as JSON."""
        kb_data = {
            'entities': self.entities,
            'relations': self.relations
        }
        return json.dumps(kb_data, ensure_ascii=False, indent=2)

def demo_knowledge_base():
    """Demonstrate building and querying the knowledge base."""
    kb = KnowledgeBase()
    # Add entities
    kb.add_entity("person_1", "PERSON", {"name": "马云", "position": "创始人"})
    kb.add_entity("org_1", "ORGANIZATION", {"name": "阿里巴巴", "industry": "电商"})
    kb.add_entity("loc_1", "LOCATION", {"name": "杭州市", "type": "城市"})
    # Add relations
    kb.add_relation("person_1", "founder_of", "org_1")
    kb.add_relation("org_1", "located_in", "loc_1")
    kb.add_relation("person_1", "works_at", "org_1")
    # Query demo
    print("=== Knowledge base queries ===")
    ma_yun_info = kb.query_entity("person_1")
    print("person_1 (马云):", ma_yun_info)
    related_entities = kb.find_related_entities("person_1")
    print("Entities related to 马云:", related_entities)
    # Export the knowledge base
    kb_json = kb.to_json()
    print("\nKnowledge base as JSON:")
    print(kb_json)

if __name__ == "__main__":
    demo_knowledge_base()
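The heading above mentions pairing the knowledge graph with a large language model. One common pattern is retrieval-augmented prompting: serialize the relevant facts from the knowledge base and inject them into the model's prompt so the answer stays grounded in stored knowledge. Below is a minimal sketch reusing the KnowledgeBase class above; call_llm is a hypothetical placeholder for whatever LLM client you actually use, not a real API.
python
def build_prompt(kb, entity_id: str, question: str) -> str:
    """Serialize an entity's facts from the KnowledgeBase into an LLM prompt."""
    info = kb.query_entity(entity_id)
    facts = [f"{entity_id} attributes: {info.get('attributes', {})}"]
    for rel in kb.find_related_entities(entity_id):
        facts.append(f"{entity_id} --{rel['relation']}--> {rel['target']}")
    context = "\n".join(facts)
    return f"Answer using only these facts:\n{context}\n\nQuestion: {question}"

def answer_with_kb(kb, entity_id: str, question: str) -> str:
    prompt = build_prompt(kb, entity_id, question)
    # return call_llm(prompt)  # hypothetical LLM call; plug in your own client here
    return prompt              # for this demo we just show the grounded prompt

if __name__ == "__main__":
    kb = KnowledgeBase()
    kb.add_entity("person_1", "PERSON", {"name": "马云"})
    kb.add_entity("org_1", "ORGANIZATION", {"name": "阿里巴巴"})
    kb.add_relation("person_1", "founder_of", "org_1")
    print(answer_with_kb(kb, "person_1", "Which company did this person found?"))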
Pros and Cons
Pros of named entity recognition:
- Extracts structured information
- Supports knowledge graph construction
- Improves the precision of information retrieval
Cons of named entity recognition:
- Entity ambiguity
- Difficulty recognizing previously unseen entities
- Dependence on training-data quality
Advantages of combining a knowledge graph with a large language model:
- Combines symbolic knowledge with statistical learning
- Supports explainable reasoning
- Knowledge can be updated and extended
Challenges of combining a knowledge graph with a large language model:
- Knowledge fusion is difficult
- Real-time requirements are demanding
- Large amounts of labeled data are needed
Together, this stack provides the technical foundation for building question-answering, knowledge-management, and decision-support systems, and it is a core component of modern natural language processing applications.