高性能AC算法多关键词匹配文本功能Java实现

直接上测试结果:

复制代码
1000000数据集。
1000000关键词(匹配词)

装载消耗时间:20869 毫秒

匹配消耗时间:6599 毫秒

代码和测试案例:

java 复制代码
package com.baian.tggroupmessagematchkeyword.ac;

import lombok.Data;

import java.util.*;

/**
 * @program: tg-parent
 * @description: ac
 * @author: <发哥讲Java-694204477@qq.com>
 * @create: 2023-09-19 17:20
 **/
@Data
public class AhoCorasick {
    private TrieNode root;

    public AhoCorasick() {
        root = new TrieNode();
    }

    public void addKeyword(String keyword) {
        TrieNode current = root;

        for (char ch : keyword.toCharArray()) {
            current = current.getChildren().computeIfAbsent(ch, c -> new TrieNode());
        }

        current.setEndOfWord(true);
        current.addKeyword(keyword);
    }

    public void buildFailureLinks() {
        Queue<TrieNode> queue = new LinkedList<>();
        root.setFailure(null);
        queue.offer(root);

        while (!queue.isEmpty()) {
            TrieNode current = queue.poll();

            for (TrieNode child : current.getChildren().values()) {
                TrieNode failure = current.getFailure();

                while (failure != null && !failure.getChildren().containsKey(child.getKey())) {
                    failure = failure.getFailure();
                }

                if (failure == null) {
                    child.setFailure(root);
                } else {
                    child.setFailure(failure.getChildren().get(child.getKey()));
                    child.addAllKeywords(child.getFailure().getKeywords());
                }

                queue.offer(child);
            }
        }
    }

    public List<String> searchKeywords(String text) {
        List<String> result = new ArrayList<>();
        TrieNode current = root;

        for (int i = 0; i < text.length(); i++) {
            char ch = text.charAt(i);

            while (current != null && !current.getChildren().containsKey(ch)) {
                current = current.getFailure();
            }

            if (current == null) {
                current = root;
            } else {
                current = current.getChildren().get(ch);
                if (current.isEndOfWord()) {
                    result.addAll(current.getKeywords());
                }

                TrieNode failure = current.getFailure();
                while (failure != null) {
                    if (failure.isEndOfWord()) {
                        result.addAll(failure.getKeywords());
                    }
                    failure = failure.getFailure();
                }
            }
        }

        return result;
    }

    public static class TrieNode {
        private char key;
        private boolean endOfWord;
        private TrieNode failure;
        private Map<Character, TrieNode> children;
        private List<String> keywords;

        public TrieNode() {
            children = new HashMap<>();
            keywords = new ArrayList<>();
        }

        public char getKey() {
            return key;
        }

        public void setKey(char key) {
            this.key = key;
        }

        public boolean isEndOfWord() {
            return endOfWord;
        }

        public void setEndOfWord(boolean endOfWord) {
            this.endOfWord = endOfWord;
        }

        public TrieNode getFailure() {
            return failure;
        }

        public void setFailure(TrieNode failure) {
            this.failure = failure;
        }

        public Map<Character, TrieNode> getChildren() {
            return children;
        }

        public List<String> getKeywords() {
            return keywords;
        }

        public void addKeyword(String keyword) {
            keywords.add(keyword);
        }

        public void addAllKeywords(List<String> keywords) {
            this.keywords.addAll(keywords);
        }
    }
}

main:

java 复制代码
package test;

import com.baian.tggroupmessagematchkeyword.ac.AhoCorasick;

import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

/**
 * @program: tg-parent
 * @description: 多样本数据集 测试。
 * @author: <发哥讲Java-694204477@qq.com>
 * @create: 2023-09-19 14:11
 **/
public class TestMain001 {
    public static void main(String[] args) {
        long start0 = System.currentTimeMillis();
        List<String> datas = new ArrayList<>(1000000);
        for (int i = 0; i < 1000000; i++) {
            datas.add(UUID.randomUUID().toString() + UUID.randomUUID().toString());
        }

        AhoCorasick ahoCorasick2 = new AhoCorasick();
        for (int i = 0; i < 1000000; i++) {
            ahoCorasick2.addKeyword(UUID.randomUUID().toString());
        }
        ahoCorasick2.addKeyword("11");
        ahoCorasick2.addKeyword("22");
        ahoCorasick2.buildFailureLinks();
        long end0 = System.currentTimeMillis();
        System.out.println("装载消耗时间:" + (end0 - start0));

        long start = System.currentTimeMillis();
        for (String message : datas) {
            List<String> stringList = ahoCorasick2.searchKeywords(message);
            if (stringList.size() > 0) {
//                System.out.println(stringList + " message:" + message + " size:" + stringList.size());
            }
        }

        long end = System.currentTimeMillis();
        System.out.println("消耗时间:" + (end - start));

    }
}
相关推荐
蓝田~22 分钟前
java.sql.Date 弃用分析与替代方案
java·microsoft
论迹23 分钟前
【JavaEE】-- 计算机是如何工作的
java·java-ee·dubbo
Scabbards_24 分钟前
Canny 边缘检测
人工智能·算法·计算机视觉
计算机-秋大田39 分钟前
微信外卖小城程序设计与实现(LW+源码+讲解)
java·开发语言·微信·微信小程序·小程序·课程设计
2301_793069821 小时前
JAVA 接口、抽象类的关系和用处 详细解析
java·开发语言
YGRHSXZYGR1 小时前
2025年寒假ACM训练赛1
算法
fancc椰1 小时前
C++ - AVL平衡二叉树
数据结构·c++·算法
kerwin_code1 小时前
Sentinel 控制台集成 Nacos 实现规则配置双向同步和持久化存储(提供改造后源码)
java·sentinel
大秦王多鱼1 小时前
【2025年最新版】Java JDK安装、环境配置教程 (图文非常详细)
java·开发语言
—丫丫1 小时前
uniapp商城项目之商品详情
java·javascript·uni-app