# 参考:
# https://baike.baidu.com/item/AC自动机算法/22799985
# 敏感词库参考:https://github.com/konsheng/Sensitive-lexicon
# 可用于大模型输出内容检测
# 安装依赖:pip install pyahocorasick(PyPI 包名为 pyahocorasick,导入名为 ahocorasick)
import ahocorasick
import asyncio
from typing import List, Dict
class AsyncSensitiveChecker:
    """Asynchronous sensitive-word detector backed by an Aho-Corasick automaton.

    - Supports ``async`` invocation (the scan runs in a worker thread).
    - Supports detection over streamed text, including words that are split
      across chunk boundaries.

    Matching is case-insensitive: dictionary keys and scanned text are both
    passed through ``str.lower()``; reported hits keep the spelling found in
    the word file.
    """

    # Length of the longest lower-cased key that was loaded. stream_check()
    # uses it to decide how much trailing text to carry between chunks.
    _max_word_len = 0

    def __init__(self, word_file: str):
        """Build the automaton from *word_file* and print how many words loaded.

        Args:
            word_file: Path to a UTF-8 text file with one word per line.
        """
        self.automaton = ahocorasick.Automaton()
        self._load_words(word_file)
        print(f"✅ 已加载敏感词 {len(self.automaton)} 条")

    def _load_words(self, path: str):
        """Read words from *path*, add them to the automaton and finalize it.

        Blank lines are skipped and surrounding whitespace is stripped. Each
        word is stored under its lower-cased form, with the original spelling
        as the associated value.
        """
        # "utf-8-sig" reads plain UTF-8 as well and drops a leading BOM, which
        # would otherwise become part of the first word.
        with open(path, "r", encoding="utf-8-sig") as f:
            for line in f:
                word = line.strip()
                if not word:
                    continue
                key = word.lower()
                self.automaton.add_word(key, word)
                self._max_word_len = max(self._max_word_len, len(key))
        self.automaton.make_automaton()

    async def check_text_async(self, text: str) -> Dict[str, List[str]]:
        """Check one piece of text without blocking the event loop.

        The synchronous scan is delegated to ``asyncio.to_thread``.

        Returns:
            The same dictionary as :meth:`_check_sync`.
        """
        return await asyncio.to_thread(self._check_sync, text)

    def _check_sync(self, text: str) -> Dict[str, List[str]]:
        """Synchronous detection core (invoked from a worker thread).

        Returns:
            A dict with three keys:
            ``"ok"``    - ``True`` when nothing matched;
            ``"count"`` - total number of matches, repeats included;
            ``"hits"``  - distinct matched words in order of first occurrence.
        """
        # NOTE(review): with no words loaded the automaton is never finalized
        # and pyahocorasick's iter() is expected to raise; an empty dictionary
        # can never match anyway, so short-circuit to a clean result.
        if not text or len(self.automaton) == 0:
            return {"ok": True, "count": 0, "hits": []}

        hits = [word for _, word in self.automaton.iter(text.lower())]
        return {
            "ok": not hits,
            "count": len(hits),
            # dict.fromkeys de-duplicates while keeping a deterministic order.
            "hits": list(dict.fromkeys(hits)),
        }

    async def stream_check(self, text_stream):
        """Scan an asynchronous stream of text chunks, stopping at the first hit.

        Args:
            text_stream: Async iterable yielding ``str`` chunks
                (consumed with ``async for chunk in text_stream``).

        Returns:
            ``{"ok": False, "hits": [...]}`` as soon as a window contains a
            sensitive word, otherwise ``{"ok": True, "hits": []}`` once the
            stream is exhausted.
        """
        # A word of length L that ends inside the newest chunk starts at most
        # L - 1 characters before it, so carrying that many trailing
        # characters is enough to catch words split across chunks.
        overlap = max(self._max_word_len - 1, 0)
        buffer = ""
        async for chunk in text_stream:
            buffer += chunk
            result = await self.check_text_async(buffer)
            if not result["ok"]:
                print(f"⚠️ 检测到敏感词: {result['hits']}")
                # Stop consuming the stream immediately on the first hit.
                return {"ok": False, "hits": result["hits"]}
            # The window was clean; keep only the tail that could still be
            # the beginning of a word completed by a later chunk.
            buffer = buffer[-overlap:] if overlap else ""
        return {"ok": True, "hits": []}
# import asyncio
# from async_sensitive_checker import AsyncSensitiveChecker
async def main():
    """Demo: load ``sensitive_words.txt`` and check one sample sentence."""
    checker = AsyncSensitiveChecker("sensitive_words.txt")
    text = "今天讨论的内容涉及非法交易与敏感主题,色情奔放。"
    result = await checker.check_text_async(text)
    print(f"结果: {result}")
    # "ok": True means the text is clean; False means sensitive words were found.
    if not result["ok"]:
        print("⚠️ 发现敏感词:", result["hits"])
    else:
        print("✅ 内容安全")


# Run the demo only when executed as a script, so importing this module
# does not trigger file I/O or start an event loop.
if __name__ == "__main__":
    asyncio.run(main())