高性能AC算法多关键词匹配文本功能Java实现

直接上测试结果:

复制代码
数据集:1000000 条文本。
关键词(匹配词):1000000 个。

装载消耗时间(含生成测试数据、添加关键词、构建失败指针):20869 毫秒

匹配消耗时间:6599 毫秒

代码和测试案例:

java 复制代码
package com.baian.tggroupmessagematchkeyword.ac;

import lombok.Data;

import java.util.*;

/**
 * @program: tg-parent
 * @description: Aho-Corasick automaton for multi-keyword text matching
 * @author: <发哥讲Java-694204477@qq.com>
 * @create: 2023-09-19 17:20
 **/
@Data
public class AhoCorasick {
    // Aho-Corasick automaton: a keyword trie whose nodes carry failure links
    // (longest proper suffix that is also a trie path) and output links
    // (nearest keyword-terminating node on the failure chain).
    // Usage: addKeyword(...) for every keyword, then buildFailureLinks() once,
    // then searchKeywords(...) as often as needed.
    private TrieNode root;

    /** Creates an empty matcher with no keywords registered. */
    public AhoCorasick() {
        root = new TrieNode();
    }

    /**
     * Registers a keyword in the trie.
     *
     * <p>Empty keywords are ignored (they would match at every position), and a
     * keyword that is already registered is not stored a second time.
     * {@link #buildFailureLinks()} must be called (again) after the last keyword
     * has been added for suffix matches to be found.
     *
     * @param keyword the keyword to register; must not be {@code null}
     * @throws NullPointerException if {@code keyword} is {@code null}
     */
    public void addKeyword(String keyword) {
        Objects.requireNonNull(keyword, "keyword");
        if (keyword.isEmpty()) {
            return;
        }

        TrieNode current = root;
        for (char ch : keyword.toCharArray()) {
            // Record the edge character on the node so getKey() is meaningful.
            current = current.getChildren().computeIfAbsent(ch, c -> new TrieNode(c));
        }

        // A terminal node already owns this exact keyword; avoid duplicate hits.
        if (!current.isEndOfWord()) {
            current.setEndOfWord(true);
            current.addKeyword(keyword);
        }
    }

    /**
     * Computes failure and output links for every node with a breadth-first
     * traversal of the trie.
     *
     * <p>The method recomputes all links from scratch and never modifies the
     * nodes' keyword lists, so it may safely be called more than once.
     */
    public void buildFailureLinks() {
        Queue<TrieNode> queue = new ArrayDeque<>();
        root.setFailure(null);
        root.setOutput(null);
        queue.offer(root);

        while (!queue.isEmpty()) {
            TrieNode current = queue.poll();

            for (Map.Entry<Character, TrieNode> edge : current.getChildren().entrySet()) {
                Character key = edge.getKey();
                TrieNode child = edge.getValue();

                // Follow the parent's failure chain until some node has an
                // outgoing edge labelled with the same character.
                TrieNode fallback = current.getFailure();
                TrieNode target = null;
                while (fallback != null && (target = fallback.getChildren().get(key)) == null) {
                    fallback = fallback.getFailure();
                }

                TrieNode failure = (target != null) ? target : root;
                child.setFailure(failure);
                // BFS order guarantees the (shallower) failure node already has
                // its own output link computed.
                child.setOutput(failure.isEndOfWord() ? failure : failure.getOutput());

                queue.offer(child);
            }
        }
    }

    /**
     * Finds every occurrence of every registered keyword in {@code text}.
     *
     * <p>Hits are appended in the order of their end position in the text; for
     * hits ending at the same position, longer keywords come first. A keyword
     * occurring several times is reported once per occurrence.
     *
     * @param text the text to scan; must not be {@code null}
     * @return the matched keywords, possibly empty, never {@code null}
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public List<String> searchKeywords(String text) {
        Objects.requireNonNull(text, "text");
        List<String> result = new ArrayList<>();
        TrieNode current = root;

        for (int i = 0; i < text.length(); i++) {
            char ch = text.charAt(i);

            // Fall back along failure links until a transition on ch exists.
            TrieNode next = current.getChildren().get(ch);
            while (next == null && current.getFailure() != null) {
                current = current.getFailure();
                next = current.getChildren().get(ch);
            }

            if (next == null) {
                // No state (not even the root) can consume this character.
                current = root;
                continue;
            }
            current = next;

            // Report the keyword ending here (if any) and every shorter keyword
            // that is a suffix of the current path, via the output links.
            TrieNode hit = current.isEndOfWord() ? current : current.getOutput();
            while (hit != null) {
                result.addAll(hit.getKeywords());
                hit = hit.getOutput();
            }
        }

        return result;
    }

    /** A single state of the automaton. */
    public static class TrieNode {
        /** Character on the edge leading into this node ('\0' for the root). */
        private char key;
        /** Whether a registered keyword ends exactly at this node. */
        private boolean endOfWord;
        /** Failure link; {@code null} for the root or before links are built. */
        private TrieNode failure;
        /** Nearest keyword-terminating node on the failure chain, or {@code null}. */
        private TrieNode output;
        private Map<Character, TrieNode> children;
        private List<String> keywords;

        /** Creates a node with no key, no children and no keywords. */
        public TrieNode() {
            children = new HashMap<>();
            keywords = new ArrayList<>();
        }

        /**
         * Creates a node reached through an edge labelled {@code key}.
         *
         * @param key the character on the incoming edge
         */
        public TrieNode(char key) {
            this();
            this.key = key;
        }

        /** @return the character on the edge leading into this node */
        public char getKey() {
            return key;
        }

        /** @param key the character on the edge leading into this node */
        public void setKey(char key) {
            this.key = key;
        }

        /** @return {@code true} if a registered keyword ends at this node */
        public boolean isEndOfWord() {
            return endOfWord;
        }

        /** @param endOfWord whether a registered keyword ends at this node */
        public void setEndOfWord(boolean endOfWord) {
            this.endOfWord = endOfWord;
        }

        /** @return the failure link, or {@code null} if none is set */
        public TrieNode getFailure() {
            return failure;
        }

        /** @param failure the node to fall back to on a mismatch */
        public void setFailure(TrieNode failure) {
            this.failure = failure;
        }

        /** @return the output link, or {@code null} if none is set */
        public TrieNode getOutput() {
            return output;
        }

        /** @param output the nearest keyword-terminating node on the failure chain */
        public void setOutput(TrieNode output) {
            this.output = output;
        }

        /** @return the live, mutable map of outgoing edges */
        public Map<Character, TrieNode> getChildren() {
            return children;
        }

        /** @return the live, mutable list of keywords stored on this node */
        public List<String> getKeywords() {
            return keywords;
        }

        /** @param keyword keyword to append to this node's keyword list */
        public void addKeyword(String keyword) {
            keywords.add(keyword);
        }

        /** @param keywords keywords to append to this node's keyword list */
        public void addAllKeywords(List<String> keywords) {
            this.keywords.addAll(keywords);
        }
    }
}

main:

java 复制代码
package test;

import com.baian.tggroupmessagematchkeyword.ac.AhoCorasick;

import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

/**
 * @program: tg-parent
 * @description: Benchmark that matches a large set of random keywords against a large set of random messages.
 * @author: <发哥讲Java-694204477@qq.com>
 * @create: 2023-09-19 14:11
 **/
public class TestMain001 {
    /**
     * Generates the messages and keywords, builds the matcher, scans every
     * message, and prints the elapsed milliseconds of the load phase and of the
     * scan phase to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final long loadBegin = System.currentTimeMillis();

        // Every message is two random UUID strings joined together.
        List<String> messages = new ArrayList<>(1000000);
        for (int produced = 0; produced < 1000000; produced++) {
            String head = UUID.randomUUID().toString();
            String tail = UUID.randomUUID().toString();
            messages.add(head + tail);
        }

        // Random UUID keywords plus two short ones, then link construction.
        AhoCorasick matcher = new AhoCorasick();
        for (int added = 0; added < 1000000; added++) {
            matcher.addKeyword(UUID.randomUUID().toString());
        }
        matcher.addKeyword("11");
        matcher.addKeyword("22");
        matcher.buildFailureLinks();

        final long loadElapsed = System.currentTimeMillis() - loadBegin;
        System.out.println("装载消耗时间:" + loadElapsed);

        final long scanBegin = System.currentTimeMillis();
        for (int index = 0; index < messages.size(); index++) {
            // The hits are intentionally discarded; only the scan time matters.
            matcher.searchKeywords(messages.get(index));
        }
        final long scanElapsed = System.currentTimeMillis() - scanBegin;
        System.out.println("消耗时间:" + scanElapsed);
    }
}
相关推荐
黑胡子大叔的小屋1 小时前
基于springboot的海洋知识服务平台的设计与实现
java·spring boot·毕业设计
ThisIsClark1 小时前
【后端面试总结】深入解析进程和线程的区别
java·jvm·面试
火星机器人life1 小时前
基于ceres优化的3d激光雷达开源算法
算法·3d
虽千万人 吾往矣1 小时前
golang LeetCode 热题 100(动态规划)-更新中
算法·leetcode·动态规划
雷神乐乐2 小时前
Spring学习(一)——Sping-XML
java·学习·spring
arnold662 小时前
华为OD E卷(100分)34-转盘寿司
算法·华为od
小林coding2 小时前
阿里云 Java 后端一面,什么难度?
java·后端·mysql·spring·阿里云
V+zmm101342 小时前
基于小程序宿舍报修系统的设计与实现ssm+论文源码调试讲解
java·小程序·毕业设计·mvc·ssm
ZZTC2 小时前
Floyd算法及其扩展应用
算法
文大。3 小时前
2024年广西职工职业技能大赛-Spring
java·spring·网络安全