Hutool DFA 教程

一、什么是 DFA？

DFA（Deterministic Finite Automaton，确定有限自动机）是一种状态机模型，常用于高效的多关键词字符串匹配。Hutool 基于 DFA 思想，用字典树（WordTree）实现了敏感词过滤功能：匹配耗时主要取决于文本长度（近似 O(n)），几乎不随敏感词数量增长，匹配效率极高。

二、添加依赖

xml 复制代码

<!-- Hutool all-in-one artifact; the WordTree class used below is in its cn.hutool.dfa package. -->
<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-all</artifactId>
    <version>5.8.25</version>
</dependency>

三、核心类：`WordTree`

Hutool 中使用 cn.hutool.dfa.WordTree 类实现 DFA 敏感词过滤。

四、基本使用

1. 构建敏感词树

java 复制代码

import cn.hutool.dfa.WordTree;

WordTree wordTree = new WordTree();
// Register sensitive words one at a time.
wordTree.addWord("色情");
wordTree.addWord("暴力");
wordTree.addWord("赌博");
// Register several words in one call. The varargs overload is used so that the
// snippet does not rely on java.util.Arrays, which it never imports.
wordTree.addWords("毒品", "诈骗", "传销");

2. 查找敏感词

java 复制代码

String content = "这个网站包含色情和暴力内容，还有赌博信息";

// Collect every sensitive word found in the text as plain strings.
List<String> hits = wordTree.matchAll(content);
System.out.println(hits); // [色情, 暴力, 赌博]

// Same scan, but each hit is a FoundWord that also carries its start and end index.
List<FoundWord> hitDetails = wordTree.matchAllWords(content);

3. 判断是否包含敏感词

java 复制代码

// Text without any registered word: isMatch reports false.
String cleanText = "你好，世界";
System.out.println(wordTree.isMatch(cleanText)); // false

// Text containing a registered word: isMatch reports true.
String dirtyText = "这里有色情内容";
System.out.println(wordTree.isMatch(dirtyText)); // true

五、高级用法

1. 获取匹配详情

java 复制代码

WordTree wordTree = new WordTree();
wordTree.addWords(Arrays.asList("色情", "黄色"));

String text = "这个色情网站有很多黄色视频";

// Each FoundWord describes one hit: the dictionary word, where it was found,
// and the exact text that matched.
List<FoundWord> foundWords = wordTree.matchAllWords(text);
for (FoundWord fw : foundWords) {
    // The dictionary entry that was hit.
    System.out.println("敏感词：" + fw.getWord());
    // Index of the first and of the last matched character in the scanned text.
    System.out.println("起始位置：" + fw.getStartIndex());
    System.out.println("结束位置：" + fw.getEndIndex());
    // The text as it actually appears in the input. FoundWord exposes this as
    // getFoundWord(); it has no getText() accessor.
    System.out.println("文本：" + fw.getFoundWord());
}

2. 敏感词替换

java 复制代码

/**
 * Masks sensitive words in a text using a shared, pre-built {@link WordTree}.
 *
 * <p>Both overloads replace by position (the start/end index of each hit) rather than by
 * the value of the dictionary word, so only the characters that were actually matched are
 * masked, including any characters the tree skipped inside a hit.
 */
public class SensitiveFilter {
    private static final WordTree WORD_TREE = new WordTree();

    static {
        WORD_TREE.addWords(Arrays.asList("色情", "暴力", "赌博"));
    }

    /**
     * Replaces every character of every hit with {@code replacement}.
     *
     * @param text        text to filter; {@code null} or empty text is returned unchanged
     * @param replacement mask character written over each matched character
     * @return the filtered text, or {@code text} itself when nothing matched
     */
    public static String replace(String text, char replacement) {
        return replace(text, String.valueOf(replacement));
    }

    /**
     * Replaces every character of every hit with one copy of {@code replacement}.
     *
     * @param text        text to filter; {@code null} or empty text is returned unchanged
     * @param replacement mask string emitted once per matched character
     * @return the filtered text, or {@code text} itself when nothing matched
     */
    public static String replace(String text, String replacement) {
        if (text == null || text.isEmpty()) {
            return text;
        }
        List<FoundWord> foundWords = WORD_TREE.matchAllWords(text);
        if (foundWords == null || foundWords.isEmpty()) {
            return text;
        }

        StringBuilder result = new StringBuilder(text.length());
        // Index of the first character of text that has not been copied or masked yet.
        int cursor = 0;
        for (FoundWord fw : foundWords) {
            // Hits are consumed left to right; clamping to the cursor keeps a hit that
            // overlaps the previous one from being masked twice.
            int start = Math.max(fw.getStartIndex(), cursor);
            int end = fw.getEndIndex();
            if (end < start) {
                continue;
            }
            result.append(text, cursor, start);
            for (int i = start; i <= end; i++) {
                result.append(replacement);
            }
            cursor = end + 1;
        }
        result.append(text, cursor, text.length());
        return result.toString();
    }
}

// Usage example: mask every matched character with "*".
String masked = SensitiveFilter.replace("这里有色情和暴力内容", "*");
System.out.println(masked); // 这里有**和**内容

3. 停止词处理（跳过无效字符）

java 复制代码

WordTree wordTree = new WordTree();
// Characters that should be skipped while matching.
Set<Character> stopChars = new HashSet<>();
stopChars.add(' ');
stopChars.add('-');
stopChars.add('*');
stopChars.add('·');

// WordTree has no setStopWords method. Skipping is configured with a character filter
// that returns true for the characters that take part in matching. Install it before
// adding words so the same rule is in effect for insertion and for matching.
wordTree.setCharFilter(c -> !stopChars.contains(c));
wordTree.addWord("色情");

// The word is still found although a stop character sits in the middle of it.
String text = "色 情内容";
boolean match = wordTree.isMatch(text); // true

4. 大小写敏感控制

java 复制代码

WordTree wordTree = new WordTree();
// Matching is case-sensitive by default: "ABC" in the tree does not hit "abc".
wordTree.addWord("ABC");
boolean rawHit = wordTree.isMatch("abc"); // false

// Lower-case both the dictionary word and the scanned text to compare them uniformly.
String normalizedWord = "abc".toLowerCase();
wordTree.addWord(normalizedWord);
String normalizedText = "ABC".toLowerCase();
boolean normalizedHit = wordTree.isMatch(normalizedText); // true

六、实战：完整的敏感词过滤工具类

java 复制代码

import cn.hutool.core.util.StrUtil;
import cn.hutool.dfa.FoundWord;
import cn.hutool.dfa.WordTree;
import java.util.*;

/**
 * Static facade around a Hutool {@link WordTree} for detecting and masking sensitive words.
 *
 * <p>The dictionary is kept next to the tree so that a word can be removed by rebuilding
 * the tree from the remaining words. Every dictionary change builds a fresh tree and
 * publishes it through a volatile field, so readers always scan a completely built tree
 * and never one that is being modified. A rebuild costs time proportional to the size of
 * the dictionary; prefer {@link #addWords(Collection)} when adding many words.
 */
public class SensitiveWordUtil {
    private static final char DEFAULT_REPLACEMENT = '*';

    /** Characters skipped while matching, e.g. separators inserted to dodge the filter. */
    private static final Set<Character> STOP_CHARS = Collections.unmodifiableSet(
        new HashSet<Character>(Arrays.asList(' ', '-', '_', '*', '.', ',')));

    /** The dictionary itself; only touched while holding the class lock. */
    private static final Set<String> WORDS = new LinkedHashSet<>();

    /** Tree currently used for matching; replaced as a whole, never modified once published. */
    private static volatile WordTree wordTree = new WordTree();

    // Load the built-in dictionary when the class is initialised.
    static {
        loadSensitiveWords();
    }

    /** Fills the dictionary with the built-in words and builds the first tree. */
    private static synchronized void loadSensitiveWords() {
        // A real system would read these from a configuration file or a database.
        WORDS.addAll(Arrays.asList(
            "色情", "暴力", "赌博", "毒品", "诈骗",
            "反动", "恐怖", "血腥", "淫秽", "低俗"
        ));
        rebuild();
    }

    /** Builds a new tree from {@link #WORDS} and publishes it. Callers hold the class lock. */
    private static void rebuild() {
        WordTree fresh = new WordTree();
        // WordTree has no setStopWords method; stop characters are expressed as a filter
        // that accepts every character which is NOT a stop character. It is installed
        // before the words are added so insertion and matching follow the same rule.
        fresh.setCharFilter(c -> !STOP_CHARS.contains(c));
        fresh.addWords(WORDS);
        wordTree = fresh;
    }

    /**
     * Adds one word to the dictionary.
     *
     * @param word word to add; blank or {@code null} input is ignored, surrounding
     *             whitespace is trimmed
     */
    public static synchronized void addWord(String word) {
        if (StrUtil.isNotBlank(word) && WORDS.add(word.trim())) {
            rebuild();
        }
    }

    /**
     * Adds several words to the dictionary with a single rebuild.
     *
     * @param words words to add; a {@code null} collection and blank or {@code null}
     *              entries are ignored, surrounding whitespace is trimmed
     */
    public static synchronized void addWords(Collection<String> words) {
        if (words == null) {
            return;
        }
        boolean changed = false;
        for (String word : words) {
            if (StrUtil.isNotBlank(word)) {
                changed |= WORDS.add(word.trim());
            }
        }
        if (changed) {
            rebuild();
        }
    }

    /**
     * Removes one word from the dictionary.
     *
     * <p>WordTree offers no way to delete a single word, so the tree is rebuilt from the
     * remaining words. Unknown words and {@code null} leave the dictionary untouched.
     *
     * @param word word to remove
     */
    public static synchronized void removeWord(String word) {
        if (word != null && WORDS.remove(word.trim())) {
            rebuild();
        }
    }

    /**
     * Tells whether the text contains at least one sensitive word.
     *
     * @param text text to scan
     * @return {@code false} for blank or {@code null} text, otherwise the match result
     */
    public static boolean contains(String text) {
        if (StrUtil.isBlank(text)) {
            return false;
        }
        return wordTree.isMatch(text);
    }

    /**
     * Lists the sensitive words found in the text.
     *
     * @param text text to scan
     * @return the hits, or an empty list for blank or {@code null} text
     */
    public static List<String> findAll(String text) {
        if (StrUtil.isBlank(text)) {
            return Collections.emptyList();
        }
        return wordTree.matchAll(text);
    }

    /**
     * Lists the sensitive words found in the text together with their positions.
     *
     * @param text text to scan
     * @return one entry per hit, or an empty list for blank or {@code null} text
     */
    public static List<SensitiveWordInfo> findDetail(String text) {
        if (StrUtil.isBlank(text)) {
            return Collections.emptyList();
        }

        List<FoundWord> foundWords = wordTree.matchAllWords(text);
        List<SensitiveWordInfo> result = new ArrayList<>(foundWords.size());

        for (FoundWord fw : foundWords) {
            SensitiveWordInfo info = new SensitiveWordInfo();
            info.setWord(fw.getWord());
            info.setStartIndex(fw.getStartIndex());
            info.setEndIndex(fw.getEndIndex());
            // The text as it appears in the input; FoundWord has no getText() accessor.
            info.setText(fw.getFoundWord());
            result.add(info);
        }
        return result;
    }

    /**
     * Masks every sensitive word with {@code '*'}.
     *
     * @param text text to filter
     * @return the filtered text
     */
    public static String replace(String text) {
        return replace(text, DEFAULT_REPLACEMENT);
    }

    /**
     * Masks every character of every hit with {@code replacement}.
     *
     * @param text        text to filter; blank or {@code null} text is returned unchanged
     * @param replacement mask character
     * @return the filtered text, or {@code text} itself when nothing matched
     */
    public static String replace(String text, char replacement) {
        if (StrUtil.isBlank(text)) {
            return text;
        }

        List<FoundWord> foundWords = wordTree.matchAllWords(text);
        if (foundWords == null || foundWords.isEmpty()) {
            return text;
        }

        char[] chars = text.toCharArray();
        for (FoundWord fw : foundWords) {
            // Both indexes are inclusive, so the last matched character is masked too.
            for (int i = fw.getStartIndex(); i <= fw.getEndIndex(); i++) {
                chars[i] = replacement;
            }
        }
        return new String(chars);
    }

    /** One hit: the dictionary word, its position in the text and the text that matched. */
    public static class SensitiveWordInfo {
        private String word;
        private int startIndex;
        private int endIndex;
        private String text;

        public String getWord() {
            return word;
        }

        public void setWord(String word) {
            this.word = word;
        }

        public int getStartIndex() {
            return startIndex;
        }

        public void setStartIndex(int startIndex) {
            this.startIndex = startIndex;
        }

        public int getEndIndex() {
            return endIndex;
        }

        public void setEndIndex(int endIndex) {
            this.endIndex = endIndex;
        }

        public String getText() {
            return text;
        }

        public void setText(String text) {
            this.text = text;
        }
    }
}

// Usage example
public class Demo {
    /** Scans a sample sentence and, if it contains sensitive words, prints them and the masked text. */
    public static void main(String[] args) {
        String text = "这篇文章包含色情和暴力内容，还提到了赌博";

        // Nothing to report when the text is clean.
        if (!SensitiveWordUtil.contains(text)) {
            return;
        }
        System.out.println("内容包含敏感词！");

        // List every sensitive word that was found.
        System.out.println("敏感词：" + SensitiveWordUtil.findAll(text));

        // Show the text with the sensitive words masked.
        System.out.println("过滤后：" + SensitiveWordUtil.replace(text));
    }
}

七、性能优化建议

1. 单例模式使用

java 复制代码

/**
 * Holds a single shared WordTree so the dictionary is built only once and then reused.
 *
 * <p>Thread-safety: a tree that is no longer modified can be shared for matching, but
 * adding words changes its internal structure. Callers must therefore not add words to
 * the shared instance while other threads are matching against it.
 */
public class WordTreeHolder {
    // Assigned only after the tree is completely populated.
    private static final WordTree INSTANCE = buildTree();

    /** Builds the tree from the words supplied by loadFromDB(). */
    private static WordTree buildTree() {
        WordTree tree = new WordTree();
        tree.addWords(loadFromDB());
        return tree;
    }

    /**
     * @return the shared tree
     */
    public static WordTree getInstance() {
        return INSTANCE;
    }
}

2. 异步加载敏感词

java 复制代码

// Refresh the dictionary periodically on a single background thread.
ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
Runnable refreshTask = () -> {
    try {
        // Build a complete replacement tree off to the side...
        List<String> latestWords = loadLatestWords();
        WordTree replacement = new WordTree();
        replacement.addWords(latestWords);
        // ...then swap it in for the old one in a single step.
        wordTreeRef.set(replacement);
    } catch (Exception e) {
        // Log and carry on, so that one failed refresh does not end the schedule.
        log.error("刷新敏感词失败", e);
    }
};
// First run immediately, then once every 5 minutes.
executor.scheduleAtFixedRate(refreshTask, 0, 5, TimeUnit.MINUTES);

八、注意事项

内存占用：敏感词越多，树结构越大，需要注意内存使用
线程安全：WordTree 本身不是线程安全的。构建完成后如果只做匹配（只读），可以在多线程间共享；添加新词会修改内部结构，运行期间需要更新词库时，应加锁，或者先构建一棵新树再整体替换引用
编码问题：确保文本和敏感词使用统一的字符编码
性能考虑：需要位置信息时直接使用 matchAllWords；matchAll 只返回字符串，相比之下会多一次从匹配结果到字符串的转换

九、总结

Hutool 的 DFA 实现具有以下优点：

✅ 时间复杂度 O(n)，匹配效率高
✅ API 简洁易用
✅ 支持停止词过滤
✅ 可获取详细匹配信息

适合用于：评论系统、聊天室、文章发布等需要敏感词过滤的场景。

一、什么是 DFA？

二、添加依赖

三、核心类：WordTree

四、基本使用

1. 构建敏感词树

2. 查找敏感词

3. 判断是否包含敏感词

五、高级用法

1. 获取匹配详情

2. 敏感词替换

3. 停止词处理（跳过无效字符）

4. 大小写敏感控制

六、实战：完整的敏感词过滤工具类

七、性能优化建议

1. 单例模式使用

2. 异步加载敏感词

八、注意事项

九、总结

三、核心类：`WordTree`