高性能AC算法多关键词匹配文本功能Java实现

直接上测试结果:

复制代码
数据集:1,000,000 条文本(每条由两个随机 UUID 拼接而成)。
关键词(匹配词):1,000,000 个随机 UUID,另加 "11" 和 "22" 两个短关键词。

装载消耗时间:20869 毫秒

匹配消耗时间:6599 毫秒

代码和测试案例:

java 复制代码
package com.baian.tggroupmessagematchkeyword.ac;

import lombok.Data;

import java.util.*;

/**
 * @program: tg-parent
 * @description: ac
 * @author: <发哥讲Java-694204477@qq.com>
 * @create: 2023-09-19 17:20
 **/
@Data
public class AhoCorasick {
    /*
     * Multi-keyword text matcher based on an Aho-Corasick automaton.
     *
     * Usage: call addKeyword(...) for every keyword, then buildFailureLinks()
     * once, then searchKeywords(...) for each text. If keywords are added
     * later, call buildFailureLinks() again before searching; rebuilding is
     * safe to repeat because it only recomputes the failure pointers.
     */

    /** Root of the keyword trie; it represents the empty prefix. */
    private TrieNode root;

    /** Creates a matcher with no keywords. */
    public AhoCorasick() {
        root = new TrieNode();
    }

    /**
     * Inserts a keyword into the trie.
     *
     * <p>{@code null} and empty keywords are ignored: an empty keyword would
     * mark the root as a match and be reported at every text position.
     * Adding the same keyword more than once has no additional effect.
     *
     * @param keyword the keyword to register
     */
    public void addKeyword(String keyword) {
        if (keyword == null || keyword.isEmpty()) {
            return;
        }

        TrieNode current = root;
        for (int i = 0; i < keyword.length(); i++) {
            char ch = keyword.charAt(i);
            current = current.getChildren().computeIfAbsent(ch, c -> {
                TrieNode created = new TrieNode();
                // Record the edge character so the node knows how it was reached.
                created.setKey(c);
                return created;
            });
        }

        current.setEndOfWord(true);
        // Guard against duplicates so one occurrence is reported once.
        if (!current.getKeywords().contains(keyword)) {
            current.addKeyword(keyword);
        }
    }

    /**
     * Computes the failure link of every node with a breadth-first traversal.
     *
     * <p>The failure link of a node points to the node spelling the longest
     * proper suffix of its path that is also a path in the trie (or to the
     * root when there is none). Keyword lists are not copied between nodes;
     * suffix matches are collected by {@link #searchKeywords(String)} when it
     * follows these links.
     */
    public void buildFailureLinks() {
        Deque<TrieNode> pending = new ArrayDeque<>();
        root.setFailure(null);
        pending.add(root);

        while (!pending.isEmpty()) {
            TrieNode parent = pending.poll();

            for (Map.Entry<Character, TrieNode> edge : parent.getChildren().entrySet()) {
                Character edgeChar = edge.getKey();
                TrieNode child = edge.getValue();

                // Walk up the parent's failure chain until a node that can
                // also consume this edge character is found.
                TrieNode fallback = parent.getFailure();
                while (fallback != null && !fallback.getChildren().containsKey(edgeChar)) {
                    fallback = fallback.getFailure();
                }

                child.setFailure(fallback == null ? root : fallback.getChildren().get(edgeChar));
                pending.add(child);
            }
        }
    }

    /**
     * Finds every keyword occurrence in the given text.
     *
     * <p>Matches are reported in order of their end position in the text; for
     * matches ending at the same position, longer keywords come first. A
     * keyword that occurs several times is reported once per occurrence.
     *
     * @param text the text to scan; {@code null} is treated as empty
     * @return the matched keywords, possibly empty, never {@code null}
     */
    public List<String> searchKeywords(String text) {
        List<String> result = new ArrayList<>();
        if (text == null) {
            return result;
        }

        TrieNode current = root;
        for (int i = 0; i < text.length(); i++) {
            Character ch = text.charAt(i);

            // Fall back along failure links until some node accepts ch.
            while (current != null && !current.getChildren().containsKey(ch)) {
                current = current.getFailure();
            }

            if (current == null) {
                // Not even the root accepts ch: restart from the root.
                current = root;
                continue;
            }

            current = current.getChildren().get(ch);

            // The node itself and every node on its failure chain spell a
            // suffix of the text read so far; report those ending a keyword.
            for (TrieNode node = current; node != null; node = node.getFailure()) {
                if (node.isEndOfWord()) {
                    result.addAll(node.getKeywords());
                }
            }
        }

        return result;
    }

    /** A single state of the automaton: one node of the keyword trie. */
    public static class TrieNode {
        /** Character on the edge leading to this node ('\0' for the root). */
        private char key;
        /** Whether a keyword ends exactly at this node. */
        private boolean endOfWord;
        /** Node to continue from when no child matches the next character. */
        private TrieNode failure;
        /** Outgoing edges, keyed by character. */
        private final Map<Character, TrieNode> children;
        /** Keywords stored on this node. */
        private final List<String> keywords;

        /** Creates a node with no children and no keywords. */
        public TrieNode() {
            children = new HashMap<>();
            keywords = new ArrayList<>();
        }

        /** @return the character on the edge leading to this node */
        public char getKey() {
            return key;
        }

        /** @param key the character on the edge leading to this node */
        public void setKey(char key) {
            this.key = key;
        }

        /** @return {@code true} if a keyword ends at this node */
        public boolean isEndOfWord() {
            return endOfWord;
        }

        /** @param endOfWord whether a keyword ends at this node */
        public void setEndOfWord(boolean endOfWord) {
            this.endOfWord = endOfWord;
        }

        /** @return the failure link, or {@code null} if none is set */
        public TrieNode getFailure() {
            return failure;
        }

        /** @param failure the node to fall back to on a mismatch */
        public void setFailure(TrieNode failure) {
            this.failure = failure;
        }

        /** @return the live, mutable map of child nodes */
        public Map<Character, TrieNode> getChildren() {
            return children;
        }

        /** @return the live, mutable list of keywords stored on this node */
        public List<String> getKeywords() {
            return keywords;
        }

        /** @param keyword keyword to append to this node's list */
        public void addKeyword(String keyword) {
            keywords.add(keyword);
        }

        /** @param keywords keywords to append to this node's list */
        public void addAllKeywords(List<String> keywords) {
            this.keywords.addAll(keywords);
        }
    }
}

测试入口(main 方法):

java 复制代码
package test;

import com.baian.tggroupmessagematchkeyword.ac.AhoCorasick;

import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

/**
 * @program: tg-parent
 * @description: 多样本数据集 测试。
 * @author: <发哥讲Java-694204477@qq.com>
 * @create: 2023-09-19 14:11
 **/
public class TestMain001 {
    /** Number of random messages to scan, and of random keywords to load. */
    private static final int SAMPLE_SIZE = 1000000;

    /** Nanoseconds per millisecond, for reporting elapsed time. */
    private static final long NANOS_PER_MILLI = 1_000_000L;

    /**
     * Loads random keywords into an {@link AhoCorasick} matcher, scans a set
     * of random messages, and prints the elapsed time of each phase in
     * milliseconds followed by the number of messages with at least one hit.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Generate the messages before starting the load timer so that the
        // reported load time covers only inserting keywords and building links.
        List<String> datas = new ArrayList<>(SAMPLE_SIZE);
        for (int i = 0; i < SAMPLE_SIZE; i++) {
            datas.add(UUID.randomUUID().toString() + UUID.randomUUID().toString());
        }

        long loadStart = System.nanoTime();
        AhoCorasick ahoCorasick2 = new AhoCorasick();
        for (int i = 0; i < SAMPLE_SIZE; i++) {
            ahoCorasick2.addKeyword(UUID.randomUUID().toString());
        }
        // Two short keywords that occur in many of the random messages.
        ahoCorasick2.addKeyword("11");
        ahoCorasick2.addKeyword("22");
        ahoCorasick2.buildFailureLinks();
        long loadEnd = System.nanoTime();
        System.out.println("装载消耗时间:" + (loadEnd - loadStart) / NANOS_PER_MILLI);

        long matchStart = System.nanoTime();
        // Count the hits so the search result is actually consumed.
        long matchedMessages = 0;
        for (String message : datas) {
            List<String> stringList = ahoCorasick2.searchKeywords(message);
            if (!stringList.isEmpty()) {
                matchedMessages++;
            }
        }
        long matchEnd = System.nanoTime();
        System.out.println("消耗时间:" + (matchEnd - matchStart) / NANOS_PER_MILLI);
        System.out.println("命中消息数:" + matchedMessages);
    }
}
相关推荐
努力学算法的蒟蒻几秒前
day25(12.5)——leetcode面试经典150
算法·leetcode·职场和发展
xlq223222 分钟前
23.二叉树搜索树(下)
数据结构·c++·算法
找不到、了9 分钟前
栈帧四要素:JVM 方法执行的完整上下文
java·jvm
程序员小假10 分钟前
我们来说一说 Redis IO 多路复用模型
java·后端
okseekw12 分钟前
一篇吃透函数式编程:Lambda表达式与方法引用
java·后端
程序员根根12 分钟前
JavaSE 进阶:IO 流核心知识点(字节流 vs 字符流 + 缓冲流优化 + 实战案例)
java
爱装代码的小瓶子13 分钟前
【c++知识铺子】最后一块拼图-多态
java·开发语言·c++
认真敲代码的小火龙14 分钟前
【JAVA项目】基于JAVA的超市订单管理系统
java·开发语言·课程设计
油丶酸萝卜别吃17 分钟前
在springboot项目中怎么发送请求,设置参数,获取另外一个服务上的数据
java·spring boot·后端
lzh2004091920 分钟前
【数据结构】二叉搜索树
数据结构·算法