一、什么是 DFA?
DFA(Deterministic Finite Automaton,确定有限自动机)是一种状态机模型,常用于实现高效的多模式字符串匹配。Hutool 基于 DFA 的思想,用字典树(WordTree)实现了敏感词过滤功能:匹配耗时主要取决于待检测文本的长度 n,近似为 O(n),与敏感词数量基本无关,匹配效率极高。
二、添加依赖
xml
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.8.25</version>
</dependency>
三、核心类:WordTree
Hutool 中使用 cn.hutool.dfa.WordTree 类实现 DFA 敏感词过滤。
四、基本使用
1. 构建敏感词树
java
import cn.hutool.dfa.WordTree;
import java.util.Arrays;

// The tree starts empty; every word added becomes a path in it.
WordTree wordTree = new WordTree();
// Register sensitive words one at a time.
wordTree.addWord("色情");
wordTree.addWord("暴力");
wordTree.addWord("赌博");
// Or register several words in a single call.
wordTree.addWords(Arrays.asList("毒品", "诈骗", "传销"));
2. 查找敏感词
java
String text = "这个网站包含色情和暴力内容,还有赌博信息";
// Collect every sensitive word that occurs in the text, as plain strings.
List<String> matched = wordTree.matchAll(text);
System.out.println(matched); // [色情, 暴力, 赌博]
// Same search, but each hit is a FoundWord object instead of a bare string
// (section 五.1 shows how to read the keyword and its start/end indices from it).
List<FoundWord> hits = wordTree.matchAllWords(text);
3. 判断是否包含敏感词
java
// Text without any registered word: isMatch reports false.
String cleanText = "你好,世界";
System.out.println(wordTree.isMatch(cleanText)); // false
// Text containing a registered word: isMatch reports true.
String dirtyText = "这里有色情内容";
System.out.println(wordTree.isMatch(dirtyText)); // true
五、高级用法
1. 获取匹配详情
java
WordTree wordTree = new WordTree();
wordTree.addWords(Arrays.asList("色情", "黄色"));
String text = "这个色情网站有很多黄色视频";
List<FoundWord> foundWords = wordTree.matchAllWords(text);
for (FoundWord fw : foundWords) {
    // The dictionary keyword that was hit.
    System.out.println("敏感词:" + fw.getWord());
    // Index of the first matched character in the text.
    System.out.println("起始位置:" + fw.getStartIndex());
    // Index of the last matched character (inclusive).
    System.out.println("结束位置:" + fw.getEndIndex());
    // FoundWord has no getText(); the text actually matched (including any
    // skipped stop characters) is exposed through getFoundWord().
    System.out.println("文本:" + fw.getFoundWord());
}
2. 敏感词替换
java
/**
 * Masks sensitive words in text using a shared, statically initialised {@link WordTree}.
 *
 * <p>Both {@code replace} overloads work on the index range reported for each hit, so a
 * hit is masked exactly where it occurs in the text.
 */
public final class SensitiveFilter {
    private static final WordTree WORD_TREE = new WordTree();

    static {
        WORD_TREE.addWords(Arrays.asList("色情", "暴力", "赌博"));
    }

    /** Static utility class; not meant to be instantiated. */
    private SensitiveFilter() {
    }

    /**
     * Overwrites every character of every sensitive-word hit with {@code replacement}.
     *
     * @param text        text to filter; {@code null} or empty input is returned unchanged
     * @param replacement masking character
     * @return the masked text, or {@code text} itself when nothing matched
     */
    public static String replace(String text, char replacement) {
        if (text == null || text.isEmpty()) {
            return text;
        }
        List<FoundWord> foundWords = WORD_TREE.matchAllWords(text);
        if (foundWords == null || foundWords.isEmpty()) {
            return text;
        }
        char[] chars = text.toCharArray();
        for (FoundWord fw : foundWords) {
            int start = fw.getStartIndex();
            int end = fw.getEndIndex();
            // The end index is inclusive, Arrays.fill's upper bound is exclusive.
            Arrays.fill(chars, start, end + 1, replacement);
        }
        return new String(chars);
    }

    /**
     * Replaces every character of every sensitive-word hit with one copy of
     * {@code replacement}.
     *
     * <p>The result is assembled from the hit positions instead of calling
     * {@code String.replace} on the keyword, because the keyword may differ from the
     * text that was actually matched (for example when characters inside the hit were
     * skipped by the tree's character filter), and because a value-based replace could
     * also rewrite text produced by an earlier replacement.
     *
     * @param text        text to filter; {@code null} or empty input is returned unchanged
     * @param replacement string emitted once per masked character
     * @return the masked text, or {@code text} itself when nothing matched
     */
    public static String replace(String text, String replacement) {
        if (text == null || text.isEmpty()) {
            return text;
        }
        List<FoundWord> foundWords = WORD_TREE.matchAllWords(text);
        if (foundWords == null || foundWords.isEmpty()) {
            return text;
        }
        StringBuilder result = new StringBuilder(text.length());
        int cursor = 0; // first index of text not yet copied to result
        for (FoundWord fw : foundWords) {
            int end = fw.getEndIndex();
            // Defensive: never go backwards should two hits overlap.
            int from = Math.max(fw.getStartIndex(), cursor);
            if (end < from) {
                continue;
            }
            result.append(text, cursor, from);
            for (int i = from; i <= end; i++) {
                result.append(replacement);
            }
            cursor = end + 1;
        }
        result.append(text, cursor, text.length());
        return result.toString();
    }
}
// Usage example: mask each character of a hit with "*".
String original = "这里有色情和暴力内容";
String masked = SensitiveFilter.replace(original, "*");
System.out.println(masked); // 这里有**和**内容
3. 停止词处理(跳过无效字符)
java
WordTree wordTree = new WordTree();
// Characters that should be ignored while matching.
Set<Character> stopWords = new HashSet<>();
stopWords.add(' ');
stopWords.add('-');
stopWords.add('*');
stopWords.add('·');
// WordTree has no setStopWords(); stop characters are configured through a
// character filter that returns true for characters that take part in matching.
// Install it before adding words, because the same filter is applied to them.
wordTree.setCharFilter(c -> !stopWords.contains(c));
wordTree.addWord("色情");
// A stop character in the middle of a word does not prevent the match.
String text = "色 情内容";
boolean match = wordTree.isMatch(text); // true
4. 大小写敏感控制
java
WordTree wordTree = new WordTree();
// Matching is case-sensitive: "ABC" in the tree does not hit "abc".
wordTree.addWord("ABC");
wordTree.isMatch("abc"); // false
// Work-around: lower-case both the dictionary entry and the text being checked.
String keyword = "abc".toLowerCase();
wordTree.addWord(keyword);
String candidate = "ABC".toLowerCase();
wordTree.isMatch(candidate); // true
六、实战:完整的敏感词过滤工具类
java
import cn.hutool.core.util.StrUtil;
import cn.hutool.dfa.FoundWord;
import cn.hutool.dfa.WordTree;
import java.util.*;
/**
 * Static facade around a single {@link WordTree} for detecting and masking sensitive words.
 *
 * <p>The registered words are also kept in {@link #WORDS} so that the tree can be rebuilt
 * when a word is removed (the tree itself offers no per-word removal).
 *
 * <p>Thread-safety: the mutating methods ({@link #addWord}, {@link #addWords},
 * {@link #removeWord}) are not synchronized. Callers that change the dictionary while
 * other threads are matching must provide their own coordination.
 */
public final class SensitiveWordUtil {
    private static final WordTree WORD_TREE = new WordTree();
    /** Every word currently registered, in insertion order; source of truth for rebuilds. */
    private static final Set<String> WORDS = new LinkedHashSet<>();
    private static final char DEFAULT_REPLACEMENT = '*';

    // Initialise the dictionary once, when the class is loaded.
    static {
        loadSensitiveWords();
    }

    /** Static utility class; not meant to be instantiated. */
    private SensitiveWordUtil() {
    }

    /** Installs the stop-character filter and registers the built-in word list. */
    private static void loadSensitiveWords() {
        // Stop characters are skipped during matching. WordTree has no setStopWords();
        // they are configured through a character filter, which must be installed
        // before words are added because the same filter is applied to added words.
        Set<Character> stopWords = new HashSet<>(Arrays.asList(' ', '-', '_', '*', '.', ','));
        WORD_TREE.setCharFilter(c -> !stopWords.contains(c));

        // Placeholder list; a real application would load this from a config file or database.
        addWords(Arrays.asList(
                "色情", "暴力", "赌博", "毒品", "诈骗",
                "反动", "恐怖", "血腥", "淫秽", "低俗"
        ));
    }

    /**
     * Registers one sensitive word. Null or blank input is ignored; the word is trimmed.
     *
     * @param word word to register
     */
    public static void addWord(String word) {
        if (StrUtil.isBlank(word)) {
            return;
        }
        String trimmed = word.trim();
        // Only touch the tree for words that are not registered yet.
        if (WORDS.add(trimmed)) {
            WORD_TREE.addWord(trimmed);
        }
    }

    /**
     * Registers several sensitive words, applying the same null/blank/trim handling as
     * {@link #addWord(String)} to each element.
     *
     * @param words words to register; {@code null} is treated as empty
     */
    public static void addWords(Collection<String> words) {
        if (words == null) {
            return;
        }
        for (String word : words) {
            addWord(word);
        }
    }

    /**
     * Removes one sensitive word and leaves every other registered word in place.
     *
     * <p>The tree is cleared and rebuilt from the remaining words. Unknown, null or
     * blank words are ignored and do not trigger a rebuild.
     *
     * @param word word to remove
     */
    public static void removeWord(String word) {
        if (StrUtil.isBlank(word) || !WORDS.remove(word.trim())) {
            return;
        }
        WORD_TREE.clear();
        WORD_TREE.addWords(WORDS);
    }

    /**
     * Tells whether the text contains at least one registered word.
     *
     * @param text text to check
     * @return {@code false} for null/blank text or when nothing matches
     */
    public static boolean contains(String text) {
        if (StrUtil.isBlank(text)) {
            return false;
        }
        return WORD_TREE.isMatch(text);
    }

    /**
     * Returns the sensitive words found in the text.
     *
     * @param text text to scan
     * @return the matches, or an empty list for null/blank text
     */
    public static List<String> findAll(String text) {
        if (StrUtil.isBlank(text)) {
            return Collections.emptyList();
        }
        return WORD_TREE.matchAll(text);
    }

    /**
     * Returns one {@link SensitiveWordInfo} per hit, carrying the keyword, the matched
     * text and its start/end indices.
     *
     * @param text text to scan
     * @return the hit details, or an empty list for null/blank text
     */
    public static List<SensitiveWordInfo> findDetail(String text) {
        if (StrUtil.isBlank(text)) {
            return Collections.emptyList();
        }
        List<FoundWord> foundWords = WORD_TREE.matchAllWords(text);
        List<SensitiveWordInfo> result = new ArrayList<>(foundWords.size());
        for (FoundWord fw : foundWords) {
            SensitiveWordInfo info = new SensitiveWordInfo();
            info.setWord(fw.getWord());
            info.setStartIndex(fw.getStartIndex());
            info.setEndIndex(fw.getEndIndex());
            // FoundWord has no getText(); the matched text is getFoundWord().
            info.setText(fw.getFoundWord());
            result.add(info);
        }
        return result;
    }

    /**
     * Masks every hit with the default replacement character.
     *
     * @param text text to filter
     * @return the masked text
     */
    public static String replace(String text) {
        return replace(text, DEFAULT_REPLACEMENT);
    }

    /**
     * Overwrites every character of every hit with {@code replacement}.
     *
     * @param text        text to filter; null/blank text is returned unchanged
     * @param replacement masking character
     * @return the masked text, or {@code text} itself when nothing matched
     */
    public static String replace(String text, char replacement) {
        if (StrUtil.isBlank(text)) {
            return text;
        }
        List<FoundWord> foundWords = WORD_TREE.matchAllWords(text);
        if (foundWords.isEmpty()) {
            return text;
        }
        char[] chars = text.toCharArray();
        for (FoundWord fw : foundWords) {
            int start = fw.getStartIndex();
            int end = fw.getEndIndex();
            // The end index is inclusive, Arrays.fill's upper bound is exclusive.
            Arrays.fill(chars, start, end + 1, replacement);
        }
        return new String(chars);
    }

    /** Plain data holder describing one sensitive-word hit. */
    public static class SensitiveWordInfo {
        private String word;
        private int startIndex;
        private int endIndex;
        private String text;

        public String getWord() {
            return word;
        }

        public void setWord(String word) {
            this.word = word;
        }

        public int getStartIndex() {
            return startIndex;
        }

        public void setStartIndex(int startIndex) {
            this.startIndex = startIndex;
        }

        public int getEndIndex() {
            return endIndex;
        }

        public void setEndIndex(int endIndex) {
            this.endIndex = endIndex;
        }

        public String getText() {
            return text;
        }

        public void setText(String text) {
            this.text = text;
        }
    }
}
// 使用示例
/** Minimal console demo of {@code SensitiveWordUtil}: detect, list, then mask. */
public class Demo {
    public static void main(String[] args) {
        final String article = "这篇文章包含色情和暴力内容,还提到了赌博";
        // Nothing to report when the text is clean.
        if (!SensitiveWordUtil.contains(article)) {
            return;
        }
        System.out.println("内容包含敏感词!");
        // List every sensitive word that was hit.
        System.out.println("敏感词:" + SensitiveWordUtil.findAll(article));
        // Print the text with the hits masked by the default replacement character.
        System.out.println("过滤后:" + SensitiveWordUtil.replace(article));
    }
}
七、性能优化建议
1. 单例模式使用
java
/**
 * Holds the one shared {@link WordTree} so the dictionary is built only once.
 *
 * <p>WordTree is NOT internally synchronized (it is built on HashMap). Sharing the
 * instance is fine as long as threads only run matches against it after this class
 * has been initialised; adding words through the returned reference while other
 * threads are matching requires external locking, or building a new tree and
 * swapping it in.
 */
public final class WordTreeHolder {
    private static final WordTree INSTANCE = new WordTree();

    static {
        // Populate the tree once at class-load time.
        // NOTE(review): loadFromDB() is not defined in this snippet; presumably it is
        // supplied by the application and returns the word list.
        INSTANCE.addWords(loadFromDB());
    }

    /** Holder only; not meant to be instantiated. */
    private WordTreeHolder() {
    }

    /**
     * @return the shared tree; treat it as read-only unless access is coordinated
     */
    public static WordTree getInstance() {
        return INSTANCE;
    }
}
2. 异步加载敏感词
java
// Refresh the sensitive-word dictionary periodically on a single background thread.
ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
Runnable refreshTask = () -> {
    try {
        List<String> latest = loadLatestWords();
        // Build a completely new tree, then publish it in one step so the old tree
        // is never modified in place.
        WordTree rebuilt = new WordTree();
        rebuilt.addWords(latest);
        wordTreeRef.set(rebuilt);
    } catch (Exception e) {
        // Log and keep going: an exception escaping the task would cancel all
        // further scheduled runs.
        log.error("刷新敏感词失败", e);
    }
};
// First run immediately, then every 5 minutes.
executor.scheduleAtFixedRate(refreshTask, 0, 5, TimeUnit.MINUTES);
八、注意事项
- 内存占用:敏感词越多,树结构越大,需要注意内存使用
- 线程安全:WordTree 构建完成后只做匹配(只读)时可以被多个线程共享;但添加新词会修改内部结构,而 WordTree 自身没有同步措施,多线程下增删敏感词需要加锁,或者构建新树后整体替换
- 编码问题:确保文本和敏感词使用统一的字符编码
- 性能考虑:matchAllWords 比 matchAll 性能略好(matchAll 在其结果之上还要再转换出一个字符串列表)
九、总结
Hutool 的 DFA 实现具有以下优点:
- ✅ 时间复杂度 O(n),匹配效率高
- ✅ API 简洁易用
- ✅ 支持停止词过滤
- ✅ 可获取详细匹配信息
适合用于:评论系统、聊天室、文章发布等需要敏感词过滤的场景。