直接上测试结果:
1000000数据集。
1000000关键词(匹配词)
装载消耗时间:20869 毫秒
匹配消耗时间:6599 毫秒
代码和测试案例:
java
package com.baian.tggroupmessagematchkeyword.ac;
import lombok.Data;
import java.util.*;
/**
* @program: tg-parent
* @description: ac
* @author: <发哥讲Java-694204477@qq.com>
* @create: 2023-09-19 17:20
**/
@Data
public class AhoCorasick {
private TrieNode root;
public AhoCorasick() {
root = new TrieNode();
}
public void addKeyword(String keyword) {
TrieNode current = root;
for (char ch : keyword.toCharArray()) {
current = current.getChildren().computeIfAbsent(ch, c -> new TrieNode());
}
current.setEndOfWord(true);
current.addKeyword(keyword);
}
public void buildFailureLinks() {
Queue<TrieNode> queue = new LinkedList<>();
root.setFailure(null);
queue.offer(root);
while (!queue.isEmpty()) {
TrieNode current = queue.poll();
for (TrieNode child : current.getChildren().values()) {
TrieNode failure = current.getFailure();
while (failure != null && !failure.getChildren().containsKey(child.getKey())) {
failure = failure.getFailure();
}
if (failure == null) {
child.setFailure(root);
} else {
child.setFailure(failure.getChildren().get(child.getKey()));
child.addAllKeywords(child.getFailure().getKeywords());
}
queue.offer(child);
}
}
}
public List<String> searchKeywords(String text) {
List<String> result = new ArrayList<>();
TrieNode current = root;
for (int i = 0; i < text.length(); i++) {
char ch = text.charAt(i);
while (current != null && !current.getChildren().containsKey(ch)) {
current = current.getFailure();
}
if (current == null) {
current = root;
} else {
current = current.getChildren().get(ch);
if (current.isEndOfWord()) {
result.addAll(current.getKeywords());
}
TrieNode failure = current.getFailure();
while (failure != null) {
if (failure.isEndOfWord()) {
result.addAll(failure.getKeywords());
}
failure = failure.getFailure();
}
}
}
return result;
}
public static class TrieNode {
private char key;
private boolean endOfWord;
private TrieNode failure;
private Map<Character, TrieNode> children;
private List<String> keywords;
public TrieNode() {
children = new HashMap<>();
keywords = new ArrayList<>();
}
public char getKey() {
return key;
}
public void setKey(char key) {
this.key = key;
}
public boolean isEndOfWord() {
return endOfWord;
}
public void setEndOfWord(boolean endOfWord) {
this.endOfWord = endOfWord;
}
public TrieNode getFailure() {
return failure;
}
public void setFailure(TrieNode failure) {
this.failure = failure;
}
public Map<Character, TrieNode> getChildren() {
return children;
}
public List<String> getKeywords() {
return keywords;
}
public void addKeyword(String keyword) {
keywords.add(keyword);
}
public void addAllKeywords(List<String> keywords) {
this.keywords.addAll(keywords);
}
}
}
main:
java
package test;
import com.baian.tggroupmessagematchkeyword.ac.AhoCorasick;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
/**
* @program: tg-parent
* @description: 多样本数据集 测试。
* @author: <发哥讲Java-694204477@qq.com>
* @create: 2023-09-19 14:11
**/
public class TestMain001 {
public static void main(String[] args) {
long start0 = System.currentTimeMillis();
List<String> datas = new ArrayList<>(1000000);
for (int i = 0; i < 1000000; i++) {
datas.add(UUID.randomUUID().toString() + UUID.randomUUID().toString());
}
AhoCorasick ahoCorasick2 = new AhoCorasick();
for (int i = 0; i < 1000000; i++) {
ahoCorasick2.addKeyword(UUID.randomUUID().toString());
}
ahoCorasick2.addKeyword("11");
ahoCorasick2.addKeyword("22");
ahoCorasick2.buildFailureLinks();
long end0 = System.currentTimeMillis();
System.out.println("装载消耗时间:" + (end0 - start0));
long start = System.currentTimeMillis();
for (String message : datas) {
List<String> stringList = ahoCorasick2.searchKeywords(message);
if (stringList.size() > 0) {
// System.out.println(stringList + " message:" + message + " size:" + stringList.size());
}
}
long end = System.currentTimeMillis();
System.out.println("消耗时间:" + (end - start));
}
}