使用Apache Lucene构建高效的全文搜索服务
在现代应用程序中,全文搜索功能是不可或缺的一部分。无论是电子商务网站、内容管理系统,还是数据分析平台,快速、准确地搜索大量数据是提升用户体验的关键。Apache Lucene 是一个强大的全文搜索引擎库,它提供了高效的索引和搜索功能,能够轻松集成到Java应用程序中。本文将介绍如何使用Apache Lucene构建一个高效的全文搜索服务,并通过一个实际的Java代码示例来展示其核心功能。
1. Lucene简介
Apache Lucene 是一个高性能、全功能的文本搜索引擎库,使用Java编写。它提供了强大的索引和搜索功能,支持多种查询类型,如布尔查询、范围查询、模糊查询等。Lucene的核心优势在于其高效的索引结构和灵活的API,使得开发者可以轻松地构建复杂的搜索功能。
2. 项目结构
在这个示例中,我们将构建一个简单的搜索服务,用于索引和搜索拍卖交易历史记录(AtcoinDealhistory)。项目的主要类 LuceneService 负责管理索引的创建、更新和搜索操作。
3. 索引创建与更新
在Lucene中,索引的创建和更新是通过 IndexWriter 来完成的。IndexWriter 负责将文档(Document)添加到索引中,并确保索引的高效存储和检索。
java
/**
 * Builds/updates the Lucene index from a list of deal-history records.
 * Records without an auction id are skipped. Commits once after all
 * documents are added.
 *
 * @param list deal-history rows to index
 * @throws IOException if the index directory cannot be opened
 */
public void indexDocument(List<AtcoinDealhistory> list) throws IOException {
    long startTime = System.currentTimeMillis(); // record start time
    // Configure the IndexWriter
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setMaxBufferedDocs(20);     // max buffered docs before a flush
    config.setRAMBufferSizeMB(2048.0); // RAM buffer size in MB
    config.setUseCompoundFile(true);   // use the compound file format
    // Hoisted out of the loop: SimpleDateFormat is expensive to create and
    // not thread-safe, so keep one method-local instance.
    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
    // try-with-resources guarantees the IndexWriter is closed
    try (IndexWriter indexWriter = new IndexWriter(directory, config)) {
        for (AtcoinDealhistory atcoinDealhistory : list) {
            // BUGFIX: the original built an empty Document and still indexed it
            // when auctionId was null; skip such records instead.
            if (atcoinDealhistory.getAuctionId() == null) {
                continue;
            }
            Document doc = new Document();
            // Exact-match (not analyzed) fields
            doc.add(new StringField("id", atcoinDealhistory.getId().toString(), Store.YES));
            doc.add(new StringField("auction_id", atcoinDealhistory.getAuctionId(), Store.YES));
            // Analyzed full-text field used for keyword search
            doc.add(new TextField("auction_name", atcoinDealhistory.getAuctionName(), Store.YES));
            doc.add(new StringField("amount", atcoinDealhistory.getAmount(), Store.YES));
            doc.add(new TextField("data", atcoinDealhistory.getData(), Store.YES));
            doc.add(new StringField("picture", atcoinDealhistory.getPicture(), Store.YES));
            if (atcoinDealhistory.getDealdate() != null) {
                String dateStr = dateFormat.format(atcoinDealhistory.getDealdate());
                // DocValues field used for sorting by date
                doc.add(new SortedNumericDocValuesField("date", atcoinDealhistory.getDealdate().getTime()));
                // Stored string form used for range queries and display
                doc.add(new StringField("dealdate", dateStr, Store.YES));
            }
            indexWriter.addDocument(doc);
        }
        // BUGFIX: commit once after the loop. The original committed per
        // document, forcing an fsync on every iteration and dominating
        // total indexing time.
        indexWriter.commit();
    } catch (Exception e) {
        e.printStackTrace();
    }
    long endTime = System.currentTimeMillis(); // record end time
    System.out.println("Index creation time: " + (endTime - startTime) + " milliseconds");
}
在这个方法中,我们首先配置了 IndexWriter,然后遍历 AtcoinDealhistory 对象列表,将每个对象的字段添加到 Document 中,并将其写入索引。我们还处理了日期字段,确保它们可以用于排序和存储。
4. 搜索功能
Lucene 的搜索功能是通过 IndexSearcher 来实现的。IndexSearcher 负责执行查询并返回匹配的文档。我们可以使用多种查询类型来构建复杂的搜索条件。
java
/**
 * Searches the index using the optional filters carried on the parameter
 * object: a deal-date range, an amount range, and analyzed auction-name
 * keywords. All present filters are ANDed (Occur.MUST). Returns up to 100
 * hits ordered by relevance score, then by deal date descending.
 *
 * @param atcoinDealhistory carrier object holding the query criteria
 * @return matching records rebuilt from stored fields (never null)
 * @throws IOException if the index cannot be read
 */
public List<AtcoinDealhistory> search(AtcoinDealhistory atcoinDealhistory) throws IOException, org.apache.lucene.queryparser.classic.ParseException {
    List<AtcoinDealhistory> results = new ArrayList<>();
    long startTime = System.currentTimeMillis(); // record start time
    // Open the index directory; the reader is a point-in-time snapshot
    try (DirectoryReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder();
        // Deal-date range filter; "yyyy-MM-dd" strings sort the same way
        // lexicographically as chronologically, so a string range is safe here.
        String startQueryDealDate = atcoinDealhistory.getStartQueryDealDate();
        String endQueryDealDate = atcoinDealhistory.getEndQueryDealDate();
        if (startQueryDealDate != null && endQueryDealDate != null) {
            TermRangeQuery dateRangeQuery = TermRangeQuery.newStringRange("dealdate", startQueryDealDate, endQueryDealDate, true, true);
            booleanQueryBuilder.add(dateRangeQuery, Occur.MUST);
        }
        // Amount range filter.
        // NOTE(review): this compares amounts lexicographically as strings, so
        // e.g. "9" > "100" — the range is NOT numeric. A correct numeric range
        // would require indexing "amount" as an IntPoint/LongPoint field.
        Integer startQueryAmount = atcoinDealhistory.getStartQueryAmount();
        Integer endQueryAmount = atcoinDealhistory.getEndQueryAmount();
        if (startQueryAmount != null && endQueryAmount != null) {
            TermRangeQuery amountRangeQuery = TermRangeQuery.newStringRange("amount", Integer.toString(startQueryAmount), Integer.toString(endQueryAmount), true, true);
            booleanQueryBuilder.add(amountRangeQuery, Occur.MUST);
        }
        // Keyword query on auction_name: analyze the input with the same
        // analyzer used at index time, then require every produced term.
        List<String> terms = analyzeQueryString(analyzer, atcoinDealhistory.getAuctionName());
        for (String term : terms) {
            TermQuery termQuery = new TermQuery(new Term("auction_name", term));
            booleanQueryBuilder.add(termQuery, Occur.MUST);
        }
        // Sort by score first, then by the "date" DocValues field descending
        SortField sortDate = new SortField("date", SortField.Type.LONG, true);
        Sort sort = new Sort(SortField.FIELD_SCORE, sortDate);
        // Execute the query, capped at 100 hits
        ScoreDoc[] hits = searcher.search(booleanQueryBuilder.build(), 100, sort).scoreDocs;
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
        // Rebuild domain objects from the stored fields of each hit
        for (ScoreDoc hit : hits) {
            Document doc = searcher.doc(hit.doc);
            AtcoinDealhistory result = new AtcoinDealhistory();
            result.setId(Long.valueOf(doc.get("id")));
            result.setAuctionId(doc.get("auction_id"));
            result.setAuctionName(doc.get("auction_name"));
            result.setAmount(doc.get("amount"));
            result.setData(doc.get("data"));
            result.setPicture(doc.get("picture"));
            // Parse the stored "yyyy-MM-dd" string back into a Date
            if (doc.get("dealdate") != null) {
                try {
                    Date dealdate = dateFormat.parse(doc.get("dealdate"));
                    result.setDealdate(dealdate);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            results.add(result);
        }
    }
    long endTime = System.currentTimeMillis(); // record end time
    System.out.println("Index search time: " + (endTime - startTime) + " milliseconds");
    return results;
}
在这个方法中,我们首先打开索引目录并创建 IndexSearcher。然后,我们构建了一个布尔查询(BooleanQuery),用于处理日期范围、金额范围和关键词查询。最后,我们执行查询并遍历结果,将匹配的文档转换为 AtcoinDealhistory 对象并返回。
5. 分词器
Lucene 的分词器(Analyzer)用于将文本分解为单词或词语。在这个示例中,我们使用了 StandardAnalyzer,它是Lucene提供的一个标准分词器,适用于大多数英文文本;对于中文文本,StandardAnalyzer 只能按单字切分,因此后文附录中提供了 IK 中文分词器的适配实现,可按需替换。
java
/**
 * Opens the on-disk index directory and creates the analyzer shared by
 * indexing and query analysis.
 *
 * @throws IOException if the index directory cannot be opened
 */
public LuceneService() throws IOException {
    // Path of the index directory
    this.directory = FSDirectory.open(Paths.get(INDEX_DIR));
    // Standard analyzer
    this.analyzer = new StandardAnalyzer();
    // Alternative: third-party IK analyzer (Chinese word segmentation)
    // this.analyzer = new ReIKAnalyzer(false);
}
java
/**
 * Runs the query string through the given analyzer and returns the
 * produced terms, in token order.
 *
 * @param analyzer    analyzer to tokenize with (should match index-time analysis)
 * @param queryString raw user query text
 * @return list of analyzed terms (possibly empty)
 * @throws IOException if the token stream fails
 */
public List<String> analyzeQueryString(Analyzer analyzer, String queryString) throws IOException {
    List<String> tokens = new ArrayList<>();
    // The field name only selects per-field analysis rules; no index access happens here.
    try (TokenStream stream = analyzer.tokenStream("auction_name", new StringReader(queryString))) {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        // Collect each emitted token's text
        while (stream.incrementToken()) {
            tokens.add(termAttr.toString());
        }
        stream.end();
    }
    return tokens;
}
这个方法将查询字符串分解为多个词语,并将它们添加到列表中,以便在搜索时使用。
6. 总结
通过这个示例,我们展示了如何使用Apache Lucene构建一个高效的全文搜索服务。Lucene提供了强大的索引和搜索功能,使得开发者可以轻松地处理复杂的搜索需求。无论是处理结构化数据还是非结构化文本,Lucene都能提供高效的解决方案。
7. 附录
maven 依赖
java
<!-- 搜索模块 -->
<!-- Lucene Core Dependency -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>8.11.0</version> <!-- 请根据需要选择合适的版本 -->
</dependency>
<!-- Lucene Analyzers Dependency -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>8.11.0</version>
</dependency>
<!-- 如果需要其他Lucene模块,也可以继续添加 -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>8.11.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>8.11.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-memory</artifactId>
<version>8.11.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queries</artifactId>
<version>8.11.0</version>
</dependency>
<!-- <dependency>
<groupId>org.truenewx</groupId>
<artifactId>ik-analyzer-lucene</artifactId>
<version>5.0.1</version>
</dependency>-->
<dependency>
<groupId>com.github.keran213539</groupId>
<artifactId>IK_Analyzer</artifactId>
<version>2012FF_hf1_1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.ansj/ansj_seg -->
<!-- <dependency>
<groupId>org.ansj</groupId>
<artifactId>ansj_seg</artifactId>
<version>5.1.6</version>
</dependency>-->
多线程索引创建
单线程索引创建耗时较长,因此引入线程池并行提交索引任务,以缩短整体索引构建时间。
java
/**
 * Creates or updates the index by submitting one indexing task per record
 * to a fixed thread pool, then waiting for all tasks to finish.
 *
 * @param list deal-history rows to index
 * @throws IOException declared for interface compatibility with callers
 */
public void indexDocument(List<AtcoinDealhistory> list) throws IOException {
    long startTime = System.currentTimeMillis(); // record start time
    // Parallelize indexing across numThreads workers
    ExecutorService executorService = Executors.newFixedThreadPool(numThreads);
    try {
        for (AtcoinDealhistory atcoinDealhistory : list) {
            executorService.submit(() -> {
                try {
                    indexDocument(this.directory, this.analyzer, atcoinDealhistory);
                } catch (IOException e) {
                    e.printStackTrace();
                }
            });
        }
    } finally {
        // Always stop accepting new tasks, even if submission failed midway
        executorService.shutdown();
    }
    try {
        executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
    } catch (InterruptedException e) {
        // BUGFIX: restore the interrupt flag instead of swallowing it
        Thread.currentThread().interrupt();
    }
    long indexCreationTime = System.currentTimeMillis() - startTime;
    // BUGFIX: the original printed (elapsed / 1000) but labeled it
    // "milliseconds"; report the elapsed millis directly.
    System.out.println("Index creation time: " + indexCreationTime + " milliseconds");
}
IK分词器重写
因为Lucene版本问题,IKAnalyzer 需要进行重写
java
package com.atcoin.busi.test;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.wltea.analyzer.cfg.Configuration;
/**
 * IK analyzer adapter for the Lucene {@link Analyzer} interface.
 * Defaults to fine-grained segmentation; pass {@code useSmart = true}
 * for smart (coarse) segmentation, or supply an explicit IK
 * {@link Configuration}.
 */
public class ReIKAnalyzer extends Analyzer {

    // When true, IK performs smart (coarse-grained) segmentation
    private boolean useSmart;

    // Optional explicit IK configuration; when set it takes precedence
    private Configuration cfg;

    public Configuration getCfg() {
        return cfg;
    }

    public void setCfg(Configuration cfg) {
        this.cfg = cfg;
    }

    public boolean useSmart() {
        return useSmart;
    }

    public void setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
    }

    /** Default constructor: fine-grained segmentation. */
    public ReIKAnalyzer() {
        this(false);
    }

    /**
     * @param useSmart when true, IK performs smart segmentation
     */
    public ReIKAnalyzer(boolean useSmart) {
        super();
        this.useSmart = useSmart;
    }

    /**
     * @param cfg explicit IK segmenter configuration
     */
    public ReIKAnalyzer(Configuration cfg) {
        super();
        this.setCfg(cfg);
    }

    /**
     * Builds the per-field tokenization pipeline.
     *
     * BUGFIX: the original always constructed the tokenizer from
     * {@code useSmart} and ignored {@code cfg}, so analyzers created with the
     * Configuration constructor silently lost their configuration. Prefer the
     * explicit configuration when present.
     *
     * @param fieldName the name of the field whose content will be tokenized
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = (this.cfg != null)
                ? new ReIKTokenizer(this.cfg)
                : new ReIKTokenizer(this.useSmart());
        return new TokenStreamComponents(tokenizer);
    }
}
java
package com.atcoin.busi.test;
import java.io.IOException;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
/**
 * Lucene {@link Tokenizer} adapter around the IK segmenter
 * ({@code org.wltea.analyzer.core.IKSegmenter}).
 */
public class ReIKTokenizer extends Tokenizer {

    // Underlying IK segmenter implementation
    private IKSegmenter _IKImplement;

    // Token text attribute
    private final CharTermAttribute termAtt;
    // Token offset attribute
    private final OffsetAttribute offsetAtt;
    // Token type attribute (categories defined in org.wltea.analyzer.core.Lexeme)
    private final TypeAttribute typeAtt;
    // End position of the last emitted token, used to compute the final offset
    private int endPosition;

    /**
     * @param useSmart when true, IK performs smart (coarse-grained) segmentation
     */
    public ReIKTokenizer(boolean useSmart) {
        super();
        offsetAtt = addAttribute(OffsetAttribute.class);
        termAtt = addAttribute(CharTermAttribute.class);
        typeAtt = addAttribute(TypeAttribute.class);
        // The segmenter is re-bound to the actual reader in reset()
        _IKImplement = new IKSegmenter(input, useSmart);
    }

    /**
     * @param cfg explicit IK segmenter configuration
     */
    public ReIKTokenizer(Configuration cfg) {
        super();
        offsetAtt = addAttribute(OffsetAttribute.class);
        termAtt = addAttribute(CharTermAttribute.class);
        typeAtt = addAttribute(TypeAttribute.class);
        _IKImplement = new IKSegmenter(input, cfg);
    }

    @Override
    public boolean incrementToken() throws IOException {
        // Clear all token attributes before producing the next token
        clearAttributes();
        Lexeme nextLexeme = _IKImplement.next();
        if (nextLexeme != null) {
            // Copy the Lexeme into Lucene attributes
            termAtt.append(nextLexeme.getLexemeText());
            termAtt.setLength(nextLexeme.getLength());
            offsetAtt.setOffset(nextLexeme.getBeginPosition(),
                    nextLexeme.getEndPosition());
            // Remember the end position for end()
            endPosition = nextLexeme.getEndPosition();
            typeAtt.setType(nextLexeme.getLexemeTypeString());
            // true: more tokens may follow
            return true;
        }
        // false: token stream exhausted
        return false;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        // BUGFIX: forget state from any previous use of this tokenizer
        endPosition = 0;
        _IKImplement.reset(input);
    }

    @Override
    public final void end() throws IOException {
        // BUGFIX: the TokenStream contract requires end() implementations to
        // call super.end() first; the original omitted it.
        super.end();
        // Set the final offset past the last token
        int finalOffset = correctOffset(this.endPosition);
        offsetAtt.setOffset(finalOffset, finalOffset);
    }
}
接口调用
java
package com.atcoin.busi.controller;
import java.io.IOException;
import java.util.Collections;
import java.util.List;

import javax.annotation.PreDestroy;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;

import com.atcoin.busi.domain.AtcoinDealhistory;
import com.atcoin.busi.service.IAtcoinDealhistoryService;
import com.atcoin.busi.service.impl.LuceneService;
/**
 * REST endpoints for building the Lucene index and running searches.
 */
@RestController
@RequestMapping("/lucene")
public class AtcoinLuceneController {

    @Autowired
    private IAtcoinDealhistoryService atcoinDealhistoryService;

    @Autowired
    private LuceneService luceneService;

    /**
     * Rebuilds the Lucene index from the full deal-history table.
     *
     * @return human-readable success/failure message
     */
    @GetMapping("/createIndex")
    public String createIndex() {
        try {
            List<AtcoinDealhistory> list = atcoinDealhistoryService.selectAtcoinDealhistoryIndex();
            luceneService.indexDocument(list);
            return "Document indexed successfully.";
        } catch (IOException e) {
            e.printStackTrace();
            return "Failed to index document.";
        }
    }

    /**
     * Searches the index by keyword.
     *
     * @param keywords raw user query text
     * @return matching records; empty list on failure
     */
    @GetMapping("/search")
    public List<AtcoinDealhistory> search(@RequestParam String keywords) {
        try {
            return luceneService.search(keywords);
        } catch (IOException | org.apache.lucene.queryparser.classic.ParseException e) {
            e.printStackTrace();
            // BUGFIX: return an empty list instead of null so the JSON
            // response body is always an array.
            return Collections.emptyList();
        }
    }

    /** Releases Lucene resources when the application shuts down. */
    @PreDestroy
    public void close() {
        try {
            luceneService.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}