- 自己写的分组 Trie 树(字典树,代码中写作 Tire)匹配算法。该算法用于云南省人工智能重点实验室与云南电网的合作项目(云南电网敏感信息识别系统),用于快速匹配文本。现将项目中的算法抽离出来,特此分享!
- 可以实现动态的插入、删除操作
python
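# Hand-written grouped trie (spelled "Tire" in this file) keyword-filtering algorithm.
# It is used in our lab's sensitive-information project with Yunnan Power Grid
# to match text quickly. The algorithm was extracted from that project and is shared here.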
class TireNode:
    """One node of the keyword trie.

    Each node represents the prefix spelled by the characters on the path
    from the root to it.
    """

    # A trie allocates one node per stored character, so a fixed attribute
    # layout (no per-instance __dict__) keeps each node small.
    __slots__ = ("children", "group_ids")

    def __init__(self):
        # Maps the next character to the child node reached through it.
        self.children = {}
        # IDs of every group that has a keyword ending exactly at this node.
        # Starts empty; an empty set means no keyword ends here.
        self.group_ids = set()
# Tire树
class Tire:
    """Trie that maps each stored word to the set of group IDs owning it.

    The same word may belong to several groups; the node at the end of the
    word keeps all of their IDs in ``group_ids``.
    """

    def __init__(self):
        self.root = TireNode()

    def insert(self, word, group_id):
        """Register ``word`` as a keyword of ``group_id``.

        Returns:
            True if the (word, group_id) pair was added, False if that pair
            was already present.
        """
        node = self.root
        for char in word:
            child = node.children.get(char)
            if child is None:
                child = node.children[char] = TireNode()
            node = child
        if group_id in node.group_ids:
            return False
        node.group_ids.add(group_id)
        return True

    def search(self, word):
        """Look up ``word`` as a complete keyword.

        Returns:
            A ``(group_ids, word)`` tuple. ``group_ids`` is the node's own
            set of owning group IDs (not a copy) when ``word`` is stored,
            and None when it is not stored or is only a prefix of a keyword.
        """
        node = self.root
        for char in word:
            node = node.children.get(char)
            if node is None:
                return None, word
        if node.group_ids:
            return node.group_ids, word
        return None, word

    def delete(self, group_id, word):
        """Remove ``word`` from the keywords of ``group_id``.

        Nodes that no longer lead to any keyword are pruned afterwards so
        repeated insert/delete cycles do not leave dead branches behind.

        Returns:
            True if the pair existed and was removed, False otherwise.
        """
        node = self.root
        # (parent, char) for every edge followed, used for pruning below.
        path = []
        for char in word:
            child = node.children.get(char)
            if child is None:
                return False
            path.append((node, char))
            node = child
        if group_id not in node.group_ids:
            return False
        node.group_ids.remove(group_id)

        # Walk back towards the root, dropping nodes that neither end a
        # keyword nor have children. Stop at the first node still in use;
        # everything above it is still needed too. The root is never removed.
        for parent, char in reversed(path):
            child = parent.children[char]
            if child.children or child.group_ids:
                break
            del parent.children[char]
        return True
# 基于TireTree算法的组关键词筛选
class KeyWords(object):
    """Group-based keyword filter backed by a trie.

    Keywords are organised in groups. A text matches when it contains, as
    substrings, every keyword of at least one group.
    """

    def __init__(self, gjc_lists=None):
        """Build the trie from a list of keyword groups.

        Args:
            gjc_lists: Optional list of keyword lists. The index of each
                inner list is used as its group ID. When omitted, the
                built-in sample data is used (the original note says the
                real data is expected to come from a database).
        """
        self.tire = Tire()
        # group_id -> number of distinct keywords registered for that group.
        self.tire_group_ids = {}
        if gjc_lists is None:
            gjc_lists = [["电网信息", "电网"], []]
        self.gjc_lists = gjc_lists
        for group_id, keywords in enumerate(self.gjc_lists):
            for keyword in keywords:
                # insert() updates the per-group counter only for new pairs.
                self.insert(group_id, keyword)

    def match(self, text):
        """Return True if ``text`` contains all keywords of some group.

        Each distinct keyword is counted once per group, so repeated
        occurrences of one keyword cannot stand in for a missing one.
        """
        # group_id -> set of that group's distinct keywords found so far.
        found_by_group = {}
        root = self.tire.root
        length = len(text)
        for start in range(length):
            node = root
            # Follow the trie along text[start:], stopping as soon as no
            # stored keyword continues with the next character.
            for end in range(start, length):
                node = node.children.get(text[end])
                if node is None:
                    break
                if not node.group_ids:
                    continue
                word = text[start:end + 1]
                # Several groups may share the same keyword.
                for group_id in node.group_ids:
                    found = found_by_group.setdefault(group_id, set())
                    found.add(word)
                    # .get avoids a KeyError for IDs added to the trie
                    # without going through this class.
                    if len(found) == self.tire_group_ids.get(group_id):
                        return True
        return False

    def delete(self, group_id, word):
        """Remove ``word`` from ``group_id``; return True if it was removed."""
        success = self.tire.delete(group_id, word)
        if success and group_id in self.tire_group_ids:
            self.tire_group_ids[group_id] -= 1
        return success

    def insert(self, group_id, word):
        """Add ``word`` to ``group_id``.

        Returns:
            True if the keyword was added, False if it was already present
            in that group or is empty. Empty keywords are rejected because
            match() only examines non-empty substrings, so an empty keyword
            could never be found and would make its group unmatchable.
        """
        if not word:
            return False
        success = self.tire.insert(word, group_id)
        if success:
            self.tire_group_ids[group_id] = self.tire_group_ids.get(group_id, 0) + 1
        return success