使用AC自动机实现敏感词过滤(java)

主要分成2部分

trie树的构建（前缀树，字典树）
fail指针的构建

1. trie 树

同一层级不会有重复的字符
敏感词最后一个字符所在的节点会被标记为结束节点，并记录该敏感词的长度

2. fail 指针的构建

fail 指针指向的是：当前分支匹配失败后应当跳转到的另一分支上的节点，该节点对应的字符串是当前已匹配内容的最长后缀

构建fail指针的遍历为层次遍历（广度优先）
root节点的fail指针指向null
如果当前节点的父节点的fail指针指向的节点下存在与当前节点字符相同的子节点，则当前节点的fail指针指向该子节点；否则沿着fail指针继续向上查找，直到找到这样的子节点为止；若一直回溯到root节点仍未找到，则指向root节点
如果当前节点的失败节点也是end节点，则将失败节点的长度信息合并到当前节点

java 复制代码

package com.xx.xxx.匹配算法;

import lombok.Data;
import lombok.NoArgsConstructor;
import org.springframework.util.CollectionUtils;

import java.util.*;

/**
 * Multi-pattern string matching with an Aho-Corasick automaton: given one
 * text, find every occurrence of every registered sensitive word in a single
 * left-to-right pass.
 *
 * <p>Example:
 * <pre>
 * dictionary : he say her shr she
 * text       : sherhsay
 * matches    : she her he say
 * </pre>
 *
 * <p>Usage: create a root node, {@code insert} every word, call
 * {@code buildFailPoint} once, then call {@code query} for each text.
 */
public class AC {
    /**
     * A trie node that doubles as an automaton state.
     *
     * <p>Equality is identity based on purpose: once fail links are built the
     * node graph contains cycles (child -> failNode -> root -> child), so a
     * field-by-field {@code equals}/{@code hashCode} would recurse forever.
     */
    public static class ACNode {
        // Character on the edge leading into this node (placeholder for the root).
        Character am;
        // Child nodes keyed by their character.
        Map<Character, ACNode> children = new HashMap<>();
        // Node to fall back to when no child matches; null on the root.
        ACNode failNode;
        // Lengths of all words that end at this position: the word ending exactly
        // on this node plus, after buildFailPoint, those inherited via fail links.
        List<Integer> wordLength = new ArrayList<>();
        // True only when an inserted word ends exactly on this node.
        private boolean endOfWord;

        /** Creates a node with no character assigned. */
        public ACNode() {
        }

        /**
         * Creates a node for the given character.
         *
         * @param am character on the edge leading into this node
         */
        public ACNode(Character am) {
            this.am = am;
        }

        public Character getAm() {
            return am;
        }

        public void setAm(Character am) {
            this.am = am;
        }

        public Map<Character, ACNode> getChildren() {
            return children;
        }

        public void setChildren(Map<Character, ACNode> children) {
            this.children = children;
        }

        public ACNode getFailNode() {
            return failNode;
        }

        public void setFailNode(ACNode failNode) {
            this.failNode = failNode;
        }

        public List<Integer> getWordLength() {
            return wordLength;
        }

        public void setWordLength(List<Integer> wordLength) {
            this.wordLength = wordLength;
        }

        public boolean isEndOfWord() {
            return endOfWord;
        }

        public void setEndOfWord(boolean endOfWord) {
            this.endOfWord = endOfWord;
        }

        /** Renders the subtree below this node; fail links are omitted to avoid cycles. */
        @Override
        public String toString() {
            return "ACNode{" +
                    "am=" + am + "," +
                    "children=" + children +
                    ",wordLength=" + wordLength +
                    '}';
        }

        /** Records a word length on a node unless it is already present. */
        private static void addLengthIfAbsent(ACNode node, Integer length) {
            if (!node.wordLength.contains(length)) {
                node.wordLength.add(length);
            }
        }

        /**
         * Adds a word to the trie rooted at {@code root}.
         *
         * <p>The node of the last character is flagged as end-of-word and stores
         * the word length. Null or empty words are ignored, and inserting the
         * same word twice does not record its length twice.
         *
         * @param root root of the trie
         * @param s    word to add
         */
        public static void insert(ACNode root, String s) {
            Objects.requireNonNull(root, "root");
            if (s == null || s.isEmpty()) {
                return;
            }
            ACNode node = root;
            for (int i = 0; i < s.length(); i++) {
                Character key = s.charAt(i);
                ACNode next = node.children.get(key);
                if (next == null) {
                    next = new ACNode(key);
                    node.children.put(key, next);
                }
                node = next;
            }
            node.setEndOfWord(true);
            addLengthIfAbsent(node, s.length());
        }

        /**
         * Builds the fail links for every node below {@code root} by a
         * breadth-first traversal. Call after all words have been inserted.
         *
         * <p>Expects {@code root.failNode} to be null; the fail-chain walk stops
         * there.
         *
         * @param root root of the trie
         */
        public static void buildFailPoint(ACNode root) {
            // Depth-1 nodes always fall back to the root; seed the BFS with them.
            Queue<ACNode> queue = new ArrayDeque<>();
            for (ACNode firstLevel : root.getChildren().values()) {
                firstLevel.setFailNode(root);
                queue.offer(firstLevel);
            }
            while (!queue.isEmpty()) {
                ACNode parent = queue.poll();
                for (Map.Entry<Character, ACNode> entry : parent.getChildren().entrySet()) {
                    Character key = entry.getKey();
                    ACNode child = entry.getValue();
                    // Walk the parent's fail chain until some node has a child for
                    // this character. Checking only the first fail node is not
                    // enough: a match may exist further up the chain.
                    ACNode candidate = parent.failNode;
                    while (candidate != null && !candidate.children.containsKey(key)) {
                        candidate = candidate.failNode;
                    }
                    child.setFailNode(candidate == null ? root : candidate.children.get(key));
                    // The fail node is shallower, so BFS has already completed its
                    // length list; inherit it so suffix words are reported here too.
                    for (Integer length : child.failNode.wordLength) {
                        addLengthIfAbsent(child, length);
                    }
                    queue.offer(child);
                }
            }
        }

        /**
         * Scans {@code s} and passes every sensitive-word occurrence to
         * {@link #handle(ACNode, String, int)}. A null text is ignored.
         *
         * @param root root of an automaton whose fail links have been built
         * @param s    text to scan
         */
        public static void query(ACNode root, String s) {
            if (s == null) {
                return;
            }
            ACNode state = root;
            for (int i = 0; i < s.length(); i++) {
                char c = s.charAt(i);
                // Fall back along fail links until a transition exists or the root
                // (the only node with a null fail link) is reached.
                while (state.failNode != null && !state.getChildren().containsKey(c)) {
                    state = state.failNode;
                }
                ACNode next = state.getChildren().get(c);
                if (next == null) {
                    // No transition even from the root: skip this character.
                    continue;
                }
                state = next;
                // Test the length list rather than the end-of-word flag: a node
                // that is not itself a word end may still carry inherited lengths.
                if (!state.wordLength.isEmpty()) {
                    handle(state, s, i);
                }
            }
        }

        /**
         * Prints every word that ends at index {@code curPoint} of {@code word}.
         *
         * <p>Both printed indices are inclusive, even though the message closes
         * the interval with ')'.
         *
         * @param node     automaton state reached at {@code curPoint}
         * @param word     text being scanned
         * @param curPoint index of the last character of the match
         */
        public static void handle(ACNode node, String word, int curPoint) {
            for (Integer wordLen : node.wordLength) {
                int start = curPoint - wordLen + 1;
                String mathStr = word.substring(start, curPoint + 1);
                System.out.println("位置信息:[" + start + "," + curPoint + "),敏感词=" + mathStr);
            }
        }


        /** Demo: builds a small dictionary and scans a sample text. */
        public static void main(String[] args) {
            ACNode root = new ACNode('-');
            root.failNode = null;
            insert(root, "黑社会");
            insert(root, "色情");
            insert(root, "黑暗任务");
            insert(root, "黑色会");
            insert(root, "国民党");
            insert(root, "国民");
            buildFailPoint(root);
            query(root, "按计划多久啊是德国 按时间大概是国民党卡的韩国阿克苏接电话ask接电话ask的话asks对话框，节点哈桑打算离开的机会撒的撒" +
                    "旦和了色情垃圾上单拉萨的黑色会啊是的噶时间大概时间大概是孤岛惊魂过去问工业国国民党");
        }


    }
}

参考B站大神 LDLD是程序员的视频

【全程干货】程序员必备算法！AC自动机算法敏感词匹配算法！动画演示讲解，看完轻松掌握，面试官都被你唬住！！_哔哩哔哩_bilibili