#主要想说一下关键词合并
2.HResults统计工具传入的文本格式:
#!MLF!#
"*No1.lab"
安
徽
省
pro
合
肥
市
ai
bot
a
i
和
en
open
a
i
chat
g
p
t
的
大
家
g
p
t
.
关键词内容:
it
ai
aibot
chatgpt
openai
合肥市
合肥
gpt
省市
安徽
说明:MLF 文本中,中文按字逐行排列,英文按单词逐行排列。但语音识别的结果可能把一个词拆成了多行(例如 "ai" 被拆成 "a" 和 "i")。这里希望根据关键词表,把相邻的若干行重新合并成关键词,合并后的结果如下:
#!MLF!#
"*No1.lab"
安徽
省
pro
合肥市
aibot
ai
和
en
openai
chatgpt
的
大
家
gpt
.
代码实现(基于 DFA 算法改写):
python
# -*- coding:utf-8 -*-
import copy
import time,re
time1 = time.time()
"""
DFA算法:
它的基本思想是基于状态转移来检索敏感词,只需要扫描一次待检测文本,就能对所有敏感词进行检测
"""
# DFA算法
class DFAFilter(object):
    """Merge adjacent tokens into known keywords using a character trie (DFA).

    Keywords are stored in ``keyword_chains`` as nested dicts keyed by single
    characters; a node that contains the ``delimit`` key marks the end of a
    complete keyword.  ``filter`` walks a list of tokens and joins each
    longest run of adjacent tokens that spells a keyword.

    NOTE: ``add`` lower-cases keywords, while ``filter`` compares tokens as
    given, so tokens containing upper-case letters never match.
    """

    def __init__(self):
        self.keyword_chains = {}  # trie root: char -> child node (dict)
        self.delimit = '\x00'  # key marking "a keyword ends at this node"

    def add(self, keyword):
        """Insert *keyword* (lower-cased and stripped) into the trie.

        Empty or whitespace-only keywords are ignored.
        """
        chars = keyword.lower().strip()
        if not chars:
            return
        level = self.keyword_chains
        for ch in chars:
            # Reuse the existing child node or create it on the way down.
            level = level.setdefault(ch, {})
        level[self.delimit] = 0

    def parse(self, path):
        """Load keywords from the text file at *path*, one keyword per line."""
        # utf-8-sig reads plain UTF-8 as well and drops a leading BOM, which
        # would otherwise become part of the first keyword.
        with open(path, encoding='utf-8-sig') as f:
            for keyword in f:
                self.add(keyword)

    def two_eng(self, char):
        """Check whether a multi-letter English token begins with a keyword.

        Returns True when *char* has at least two characters, starts with an
        ASCII letter or apostrophe, and some prefix of it (possibly all of
        it), walked from the trie root, ends on a complete keyword.
        Returns False otherwise.

        NOTE(review): this helper is not called by ``filter``; it looks like
        an unfinished experiment -- confirm before relying on it.
        """
        if len(char) < 2 or not re.match(r"[a-zA-Z\']", char):
            return False
        level = self.keyword_chains
        matched = False
        for letter in char:
            level = level.get(letter)
            if not isinstance(level, dict):
                break
            if self.delimit in level:
                matched = True
        return matched

    def _step(self, level, token):
        """Advance from trie node *level* across every character of *token*.

        Returns the node reached, or None when the token is empty or any of
        its characters has no transition (a token is consumed whole or not
        at all).
        """
        if not token:
            return None
        for letter in token:
            level = level.get(letter)
            # Also rejects the end marker itself, whose value is not a node.
            if not isinstance(level, dict):
                return None
        return level

    def filter(self, message, repl="*"):
        """Merge adjacent tokens of *message* that together spell a keyword.

        Args:
            message: sequence of string tokens (single characters or whole
                words).
            repl: unused; kept so existing callers keep working.

        Returns:
            A new list in which every longest run of adjacent tokens that
            forms a keyword is replaced by the joined keyword; all other
            tokens are copied through unchanged.
        """
        ret = []
        total = len(message)
        start = 0
        while start < total:
            level = self.keyword_chains
            match_end = start  # index just past the longest keyword so far
            for i in range(start, total):
                level = self._step(level, message[i])
                if level is None:
                    break
                if self.delimit in level:
                    match_end = i + 1
            if match_end > start:
                # Emit only up to the last complete keyword; tokens walked
                # beyond it are rescanned on the next pass.
                ret.append("".join(message[start:match_end]))
                start = match_end
            else:
                ret.append(message[start])
                start += 1
        return ret
if __name__ == "__main__":
    # Build the keyword trie from the keyword file (one keyword per line).
    merger = DFAFilter()
    merger.parse('key_word.txt')

    # Sample recognizer output: Chinese split per character, English per word.
    text = [
        '安', '徽', '省', 'pro', '合', '肥', '市', 'ai', 'bot', 'a', 'i',
        '和', 'en', 'open', 'a', 'i', 'chat', 'g', 'p', 't', '的', '大',
        '家', 'g', 'p', 't',
    ]
    result = merger.filter(text)
    print(text)
    print(result)

    # Elapsed time is measured from the module-level time1 stamp.
    elapsed = time.time() - time1
    print('总共耗时:' + str(elapsed) + 's')

    # One merged token per line, as in the MLF layout.
    for token in result:
        print(token)