基于 AC(Aho-Corasick)算法的高性能多关键词文本匹配:Java 实现

直接上测试结果:

复制代码
数据集:1000000 条文本(每条由两个随机 UUID 字符串拼接而成)。
关键词(匹配词):1000000 个随机 UUID 字符串,另加 "11" 和 "22" 两个短关键词。

装载消耗时间:20869 毫秒

匹配消耗时间:6599 毫秒

代码和测试案例:

java 复制代码
package com.baian.tggroupmessagematchkeyword.ac;

import lombok.Data;

import java.util.*;

/**
 * @program: tg-parent
 * @description: ac
 * @author: <发哥讲Java-694204477@qq.com>
 * @create: 2023-09-19 17:20
 **/
@Data
public class AhoCorasick {
    // Multi-keyword text matcher built on a trie with failure links.
    //
    // Usage: register every keyword with addKeyword(String), call
    // buildFailureLinks() once, then call searchKeywords(String) any number
    // of times. Keywords added after the last buildFailureLinks() call are
    // only found when matching restarts from the root, so rebuild after
    // adding keywords.

    // Deliberately not final: the class-level annotation in this file
    // generates accessors for this field.
    private TrieNode root;

    /** Creates an empty matcher containing only the root node. */
    public AhoCorasick() {
        root = new TrieNode();
    }

    /**
     * Registers a keyword in the trie.
     *
     * <p>Empty keywords are ignored because they would match at every
     * position. Registering the same keyword twice has no further effect.
     *
     * @param keyword the keyword to register; must not be {@code null}
     * @throws NullPointerException if {@code keyword} is {@code null}
     */
    public void addKeyword(String keyword) {
        Objects.requireNonNull(keyword, "keyword");
        if (keyword.isEmpty()) {
            return;
        }

        TrieNode current = root;
        for (int i = 0; i < keyword.length(); i++) {
            current = current.getChildren().computeIfAbsent(keyword.charAt(i), ch -> {
                TrieNode node = new TrieNode();
                // Record the edge character so getKey() is meaningful.
                node.setKey(ch);
                return node;
            });
        }

        current.setEndOfWord(true);
        if (!current.getKeywords().contains(keyword)) {
            current.addKeyword(keyword);
        }
    }

    /**
     * Computes the failure link and the output link of every node with a
     * breadth-first traversal of the trie.
     *
     * <p>The failure link of a node points to the node spelling the longest
     * proper suffix of its path that is also a path in the trie (the root
     * when there is none). The output link points to the nearest node on the
     * failure chain that ends a keyword, or is {@code null}.
     *
     * <p>All links are overwritten on every call, so the method may be called
     * again after more keywords have been added.
     */
    public void buildFailureLinks() {
        Queue<TrieNode> queue = new ArrayDeque<>();
        root.setFailure(null);
        root.setOutputLink(null);
        queue.offer(root);

        while (!queue.isEmpty()) {
            TrieNode current = queue.poll();

            for (Map.Entry<Character, TrieNode> entry : current.getChildren().entrySet()) {
                Character key = entry.getKey();
                TrieNode child = entry.getValue();
                // Use the map key as the source of truth for the edge
                // character; this also repairs nodes inserted directly
                // through getChildren() without a key.
                child.setKey(key);

                // Walk the parent's failure chain until a node with an
                // outgoing edge for the same character is found.
                TrieNode target = null;
                for (TrieNode fallback = current.getFailure();
                     fallback != null;
                     fallback = fallback.getFailure()) {
                    target = fallback.getChildren().get(key);
                    if (target != null) {
                        break;
                    }
                }

                TrieNode failure = (target != null) ? target : root;
                child.setFailure(failure);
                // The failure node is strictly shallower than the child, so
                // the BFS has already finalised its output link.
                child.setOutputLink(failure.isEndOfWord() ? failure : failure.getOutputLink());

                queue.offer(child);
            }
        }
    }

    /**
     * Finds every occurrence of every registered keyword in {@code text}.
     *
     * <p>Matches are reported in order of their end position; for matches
     * ending at the same position the longest keyword comes first. A keyword
     * occurring several times is reported once per occurrence.
     *
     * @param text the text to scan; {@code null} is treated as empty
     * @return the matched keywords, possibly empty, never {@code null}
     */
    public List<String> searchKeywords(String text) {
        List<String> result = new ArrayList<>();
        if (text == null || text.isEmpty()) {
            return result;
        }

        TrieNode current = root;
        for (int i = 0; i < text.length(); i++) {
            Character ch = text.charAt(i);

            TrieNode next = current.getChildren().get(ch);
            while (next == null && current != root) {
                TrieNode fallback = current.getFailure();
                // A missing failure link means the node was added after the
                // last build; restarting from the root is the safe fallback.
                current = (fallback != null) ? fallback : root;
                next = current.getChildren().get(ch);
            }
            if (next == null) {
                // Not even the root has an edge for this character.
                continue;
            }
            current = next;

            // Report the keyword ending here, then every shorter keyword
            // that is a suffix of the current path.
            TrieNode hit = current.isEndOfWord() ? current : current.getOutputLink();
            while (hit != null) {
                result.addAll(hit.getKeywords());
                hit = hit.getOutputLink();
            }
        }

        return result;
    }

    /** A single trie node together with its automaton links. */
    public static class TrieNode {
        /** Character on the edge leading into this node ({@code '\0'} for the root). */
        private char key;
        /** Whether a registered keyword ends exactly at this node. */
        private boolean endOfWord;
        /** Failure link; {@code null} for the root or before the links are built. */
        private TrieNode failure;
        /** Nearest keyword-ending node on the failure chain, or {@code null}. */
        private TrieNode outputLink;
        /** Outgoing edges keyed by character. */
        private Map<Character, TrieNode> children;
        /** Keywords ending at this node. */
        private List<String> keywords;

        /** Creates a node with no children and no keywords. */
        public TrieNode() {
            children = new HashMap<>();
            keywords = new ArrayList<>();
        }

        /** @return the character on the edge leading into this node */
        public char getKey() {
            return key;
        }

        /** @param key the character on the edge leading into this node */
        public void setKey(char key) {
            this.key = key;
        }

        /** @return {@code true} if a keyword ends at this node */
        public boolean isEndOfWord() {
            return endOfWord;
        }

        /** @param endOfWord whether a keyword ends at this node */
        public void setEndOfWord(boolean endOfWord) {
            this.endOfWord = endOfWord;
        }

        /** @return the failure link, or {@code null} if none is set */
        public TrieNode getFailure() {
            return failure;
        }

        /** @param failure the node to fall back to on a mismatch */
        public void setFailure(TrieNode failure) {
            this.failure = failure;
        }

        /** @return the nearest keyword-ending node on the failure chain, or {@code null} */
        public TrieNode getOutputLink() {
            return outputLink;
        }

        /** @param outputLink the nearest keyword-ending node on the failure chain */
        public void setOutputLink(TrieNode outputLink) {
            this.outputLink = outputLink;
        }

        /** @return the live, mutable map of outgoing edges */
        public Map<Character, TrieNode> getChildren() {
            return children;
        }

        /** @return the live, mutable list of keywords ending at this node */
        public List<String> getKeywords() {
            return keywords;
        }

        /** @param keyword a keyword to record on this node */
        public void addKeyword(String keyword) {
            keywords.add(keyword);
        }

        /** @param keywords keywords to append to this node's list */
        public void addAllKeywords(List<String> keywords) {
            this.keywords.addAll(keywords);
        }
    }
}

main:

java 复制代码
package test;

import com.baian.tggroupmessagematchkeyword.ac.AhoCorasick;

import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

/**
 * @program: tg-parent
 * @description: 多样本数据集 测试。
 * @author: <发哥讲Java-694204477@qq.com>
 * @create: 2023-09-19 14:11
 **/
public class TestMain001 {
    /**
     * Benchmark driver: generates random sample texts and keywords, builds
     * the matcher, then scans every sample and prints the elapsed
     * milliseconds of both phases.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final int sampleCount = 1000000;
        final int keywordCount = 1000000;

        // Phase 1 is timed from here, so it covers sample generation as well
        // as keyword loading and link construction.
        long loadBegin = System.currentTimeMillis();

        List<String> messages = new ArrayList<>(sampleCount);
        int produced = 0;
        while (produced < sampleCount) {
            String head = UUID.randomUUID().toString();
            String tail = UUID.randomUUID().toString();
            messages.add(head + tail);
            produced++;
        }

        AhoCorasick matcher = new AhoCorasick();
        int registered = 0;
        while (registered < keywordCount) {
            matcher.addKeyword(UUID.randomUUID().toString());
            registered++;
        }
        // Two short keywords, registered in addition to the random ones.
        matcher.addKeyword("11");
        matcher.addKeyword("22");
        matcher.buildFailureLinks();

        long loadMillis = System.currentTimeMillis() - loadBegin;
        System.out.println("装载消耗时间:" + loadMillis);

        // Phase 2: scan every sample. The matches are discarded on purpose;
        // only the elapsed time is reported.
        long scanBegin = System.currentTimeMillis();
        for (int idx = 0; idx < messages.size(); idx++) {
            matcher.searchKeywords(messages.get(idx));
        }
        long scanMillis = System.currentTimeMillis() - scanBegin;
        System.out.println("消耗时间:" + scanMillis);
    }
}
相关推荐
chools3 小时前
【AI超级智能体】快速搞懂工具调用Tool Calling 和 MCP协议
java·人工智能·学习·ai
李白你好3 小时前
TongWeb EJB 反序列化生成工具(Java-Chain 插件)
java·安全
leobertlan3 小时前
好玩系列:用20元实现快乐保存器
android·人工智能·算法
青梅橘子皮3 小时前
C语言---指针的应用以及一些面试题
c语言·开发语言·算法
U盘失踪了4 小时前
Java 的 JAR 是什么?
java·jar
_深海凉_4 小时前
LeetCode热题100-有效的括号
linux·算法·leetcode
今天又在写代码5 小时前
java-v2
java·开发语言
competes5 小时前
慈善基金投资底层逻辑应用 顶层代码低代码配置平台开发结构方式数据存储模块
java·开发语言·数据库·windows·sql
2501_913061346 小时前
网络原理知识
java·网络
希望永不加班6 小时前
Spring AOP 代理模式:CGLIB 与 JDK 动态代理区别
java·开发语言·后端·spring·代理模式