Requirement: tokenized (word-segmented) keyword search for hot-search queries.
Data environment: the crawled data lives in the MySQL table t_bj (id, title, content, publishtime); Logstash syncs it into ES, and Spring Boot connects to ES to expose a REST interface for the front-end page.
springboot
pom.xml:
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId>
<version>7.17.3</version>
</dependency>
application.yml:
spring:
  elasticsearch:
    uris: 192.168.0.1:9200
    connection-timeout: 1s
    socket-timeout: 30s
Java files:
RsController.java
@RestController
@RequestMapping("/rs")
public class RsController {
@Autowired
private TBjService tBjService;
@PostMapping("/bjKey")
public PageInfo<TBj> getTBjEs(@RequestBody ReqKey reqKey) {
return tBjService.getTBj(reqKey.getKeyValue(), reqKey.getPageNum(), reqKey.getPageSize());
}
}
ReqKey.java
@Data
public class ReqKey {
private String keyValue;
private int pageNum;
private int pageSize;
}
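A request to /rs/bjKey carries the keyword plus the paging parameters in a JSON body that mirrors ReqKey. As a hedged illustration (not part of the original code; the host and port 127.0.0.1:8080 are assumptions for a local run), the endpoint could be exercised with the JDK 11 HttpClient:
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class BjKeyClientDemo {
    public static void main(String[] args) throws Exception {
        // the body mirrors ReqKey: keyValue is the search keyword, pageNum/pageSize drive paging
        String body = "{\"keyValue\":\"保健\",\"pageNum\":1,\"pageSize\":10}";
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create("http://127.0.0.1:8080/rs/bjKey")) // assumed local host/port
                .header("Content-Type", "application/json")
                .POST(HttpRequest.BodyPublishers.ofString(body))
                .build();
        HttpResponse<String> response = HttpClient.newHttpClient()
                .send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(response.body()); // PageInfo JSON; matched fragments are wrapped in <em> tags
    }
}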
TBj.java
@Data
@Document(indexName = "t_bj_index")
public class TBj {
private String id;
private String title;
private String url;
private String content;
private String publishtime;
private String createtime;
}
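Because the goal is word-segmented search, the analyzed text fields can also be mapped explicitly with Spring Data's @Field annotation. The sketch below is an assumption-laden variant of the entity, not the original class: the ik_max_word analyzer only exists if the IK analysis plugin is installed on the node, and the annotations only take effect when the index mapping is created by the application rather than by Logstash's dynamic mapping.
import lombok.Data;
import org.springframework.data.annotation.Id;
import org.springframework.data.elasticsearch.annotations.Document;
import org.springframework.data.elasticsearch.annotations.Field;
import org.springframework.data.elasticsearch.annotations.FieldType;

@Data
@Document(indexName = "t_bj_index")
public class TBjMapped {
    @Id
    private String id;
    // analyzed text fields; "ik_max_word" is an assumption and requires the IK analysis plugin
    @Field(type = FieldType.Text, analyzer = "ik_max_word")
    private String title;
    @Field(type = FieldType.Text, analyzer = "ik_max_word")
    private String content;
    // the source column is a varchar, so keep it as a keyword here
    @Field(type = FieldType.Keyword)
    private String publishtime;
}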
RsConfig.java
@EnableElasticsearchRepositories(basePackages = {"cn.rs.elastic.repository"})
@Configuration
public class RsConfig {
@Value("${spring.elasticsearch.uris}")
private String hostAndPort;
@Bean
public RestHighLevelClient elasticsearchClient() {
ClientConfiguration clientConfiguration = ClientConfiguration.builder()
.connectedTo(hostAndPort)
.build();
return RestClients.create(clientConfiguration).rest();
}
}
TBjRepository.java
public interface TBjRepository extends ElasticsearchRepository<TBj, String> {
}
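The repository is left empty because the service below assembles its query by hand via SearchSourceBuilder, but Spring Data Elasticsearch can also derive simple tokenized match queries from method names. A hedged sketch (the interface name and method are illustrative, not part of the original project):
import org.springframework.data.domain.Page;
import org.springframework.data.domain.Pageable;
import org.springframework.data.elasticsearch.repository.ElasticsearchRepository;

public interface TBjSearchRepository extends ElasticsearchRepository<TBj, String> {
    // derived query: Spring Data builds a match query against the analyzed title field
    Page<TBj> findByTitle(String title, Pageable pageable);
}
Calling findByTitle("保健", PageRequest.of(0, 10)) would then return the first ten hits, though it lacks the multi-field boosting and highlighting handled by RsUtils below.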
TBjService.java
@Service
public class TBjService {
private final TBjRepository tBjRepository;
@Autowired
RsUtils rsUtils;
@Autowired
public TBjService(TBjRepository tBjRepository) {
this.tBjRepository = tBjRepository;
}
public PageInfo<TBj> getTBj(String keywords, Integer pageNum, Integer pageSize) {
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
String[] queryFields = { "title"};
QueryBuilder queryBuilder = QueryBuilders.multiMatchQuery(keywords, queryFields)
.field("content", 2F)
.tieBreaker(0.3F);
searchSourceBuilder.query(queryBuilder);
List<String> highFields = ListUtil.toList(queryFields);
highFields.add("content");
return rsUtils.page("t_bj_index", searchSourceBuilder, TBj.class, pageNum, pageSize, highFields);
}
}
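When tuning the boosts it helps to look at the query DSL that SearchSourceBuilder actually produces; its toString() renders the JSON request body. A small hedged sketch for debugging (the keyword is just a sample value):
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.builder.SearchSourceBuilder;

public class QueryDebug {
    public static void main(String[] args) {
        SearchSourceBuilder source = new SearchSourceBuilder();
        // same shape as TBjService: match on title, content boosted by 2, tie_breaker 0.3
        source.query(QueryBuilders.multiMatchQuery("保健", "title")
                .field("content", 2F)
                .tieBreaker(0.3F));
        // prints the JSON body that would be sent to the _search endpoint
        System.out.println(source);
    }
}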
RsUtils.java
package cn.rs.elastic.utils;
import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.json.JSONUtil;
import com.github.pagehelper.PageInfo;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.delete.DeleteRequest;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchScrollRequest;
import org.elasticsearch.action.support.WriteRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
import org.springframework.stereotype.Component;
import org.springframework.util.CollectionUtils;
import org.springframework.util.ReflectionUtils;
import javax.annotation.Resource;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
@Slf4j
@Component
public class RsUtils {
@Resource
private RestHighLevelClient restHighLevelClient;
public <T> PageInfo<T> page(String index, SearchSourceBuilder searchSourceBuilder, Class<T> resultClass,
int currentPage, int size, List<String> highFields) {
SearchRequest request = new SearchRequest(index);
if (CollectionUtil.isNotEmpty(highFields)) {
buildHighLight(searchSourceBuilder, highFields);
}
int num = (currentPage - 1) * size;
searchSourceBuilder.from(num)
.size(size);
request.source(searchSourceBuilder);
SearchResponse response = null;
try {
response = restHighLevelClient.search(request, RequestOptions.DEFAULT);
} catch (IOException e) {
log.error("rs es search request failed", e);
}
assert response != null;
return analysisResponse(response, resultClass, currentPage, size, highFields);
}
private <T> PageInfo<T> analysisResponse(SearchResponse response, Class<T> resultClass, int currentPage, int size, List<String> highFields) {
SearchHit[] searchHits = response.getHits().getHits();
List<T> retList = new ArrayList<>(searchHits.length);
for (SearchHit searchHit : searchHits) {
String strJson = searchHit.getSourceAsString();
T t = JSONUtil.toBean(strJson, resultClass);
try {
setId(resultClass, t, String.valueOf(searchHit.getId()));
} catch (Exception e) {
log.info("rs query: failed to set the id field on the result bean", e);
}
if (!CollectionUtils.isEmpty(highFields)) {
Map<String, HighlightField> highlightFieldMap = searchHit.getHighlightFields();
HighlightField highlightField;
for (String field : highFields) {
highlightField = highlightFieldMap.get(field);
if (highlightField != null) {
Text[] fragments = highlightField.getFragments();
StringBuilder builder = new StringBuilder();
for (Text text : fragments) {
builder.append(text);
}
setValue(resultClass, t, builder.toString(), field);
}
}
}
retList.add(t);
}
long totalNum = response.getHits().getTotalHits().value;
PageInfo<T> pageVo = new PageInfo<>();
pageVo.setPageNum(currentPage);
pageVo.setPageSize(size);
pageVo.setTotal(totalNum);
pageVo.setList(retList);
return pageVo;
}
@SneakyThrows
private <T> void setId(Class<T> resultClass, T t, Object id) {
Field field = ReflectionUtils.findField(resultClass, "id");
if (null != field) {
field.setAccessible(true);
Object object = ReflectionUtils.getField(field, t);
if (object == null) {
Method method = resultClass.getMethod("setId", String.class);
ReflectionUtils.invokeMethod(method, t, id);
}
}
}
@SneakyThrows
private <T> void setValue(Class<T> resultClass, T t, Object fieldValue, String fieldName) {
Field field = ReflectionUtils.findField(resultClass, fieldName);
if (null != field) {
field.setAccessible(true);
String methodName = "set".concat(captureName(fieldName));
Method method = resultClass.getMethod(methodName, String.class);
ReflectionUtils.invokeMethod(method, t, fieldValue);
}
}
private String captureName(String str) {
// capitalize the first letter to build the setter name, e.g. "title" -> "setTitle"
return str.substring(0, 1).toUpperCase().concat(str.substring(1));
}
private void buildHighLight(SearchSourceBuilder searchSourceBuilder, List<String> fields) {
HighlightBuilder highlightBuilder = new HighlightBuilder();
fields.forEach(highlightBuilder::field);
highlightBuilder.preTags("<em>");
highlightBuilder.postTags("</em>");
searchSourceBuilder.highlighter(highlightBuilder);
}
@AllArgsConstructor
@Data
public class ScrollPageBean<T> {
private String scrollId;
private PageInfo<T> scrollPage;
}
}
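The SearchScrollRequest import and the ScrollPageBean holder hint at a scroll-based variant for deep pagination, but nothing above uses them. The method below is only an illustrative sketch of how they could fit together inside RsUtils with the 7.x high-level client; it is not part of the original class:
// hypothetical addition to RsUtils, not part of the original code
public <T> ScrollPageBean<T> scrollPage(String index, SearchSourceBuilder searchSourceBuilder,
                                        Class<T> resultClass, int size, List<String> highFields) throws IOException {
    SearchRequest request = new SearchRequest(index);
    if (CollectionUtil.isNotEmpty(highFields)) {
        buildHighLight(searchSourceBuilder, highFields);
    }
    request.source(searchSourceBuilder.size(size));
    // keep the scroll context alive for one minute between fetches
    request.scroll(org.elasticsearch.core.TimeValue.timeValueMinutes(1L));
    SearchResponse response = restHighLevelClient.search(request, RequestOptions.DEFAULT);
    PageInfo<T> page = analysisResponse(response, resultClass, 1, size, highFields);
    // the caller passes the scrollId into a SearchScrollRequest to fetch the next batch
    return new ScrollPageBean<>(response.getScrollId(), page);
}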
elasticSearch
Purpose: the hot-search engine.
1. Download and install.
2. Configure. config/elasticsearch.yml:
cluster.name: rs-single-node-cluster
node.name: rs-single-node
node.roles: ["master", "data"]
path.data: /path/to/data
path.logs: /path/to/logs
network.host: 0.0.0.0
http.port: 9200
cluster.initial_master_nodes: ["rs-single-node"]
xpack.security.enabled: false
xpack.security.enrollment.enabled: false
xpack.security.http.ssl:
  enabled: false
  keystore.path: certs/http.p12
xpack.security.transport.ssl:
  enabled: false
  verification_mode: certificate
  keystore.path: certs/transport.p12
  truststore.path: certs/transport.p12
http.host: 0.0.0.0
transport.port: 9300
xpack.ml.enabled: false
http.cors.enabled: true
http.cors.allow-origin: "*"
3. Start: bin/elasticsearch.bat (on Windows).
4. On a successful start the default port is 9200; open http://127.0.0.1:9200 in a browser to check the node info.
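To confirm from the Java side that the node is reachable before wiring up the rest of the stack, a minimal hedged sketch using the same 7.17 high-level client (address 127.0.0.1:9200 as configured above):
import org.apache.http.HttpHost;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;

public class EsPingCheck {
    public static void main(String[] args) throws Exception {
        // throwaway client against the single-node cluster configured above
        try (RestHighLevelClient client = new RestHighLevelClient(
                RestClient.builder(new HttpHost("127.0.0.1", 9200, "http")))) {
            // ping() returns true when the node answers on the HTTP port
            System.out.println("ES reachable: " + client.ping(RequestOptions.DEFAULT));
        }
    }
}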
logstash
Purpose: sync MySQL data into ES; the sync policy (e.g. the polling interval) is configurable.
1. Download and install.
2. Configure the MySQL and ES connections.
Under the Logstash home directory create the directory and file mysql_rs_conf/mysql_rs.conf, and copy the MySQL JDBC driver jar into that directory. Config file contents:
input {
jdbc {
jdbc_connection_string => "jdbc:mysql://192.168.1.1:3306/dbBj?useUnicode=true&useSSL=false&characterEncoding=utf-8&serverTimezone=Asia/Shanghai&rewriteBatchedStatements=true"
jdbc_user => "bj"
jdbc_password => "123456"
jdbc_driver_library => "D:/logstash/logstash-8.12.2/mysql_rs_conf/mysql-connector-j-8.0.33.jar"
jdbc_driver_class => "com.mysql.cj.jdbc.Driver"
jdbc_paging_enabled => "true"
jdbc_page_size => "10000"
statement => "select * from t_bj"
schedule => "* * * * *"
# tag each event so the [table] conditional in the output block can match
add_field => { "table" => "t_bj" }
}
}
output {
if [table] == "t_bj" {
elasticsearch {
hosts => "192.168.1.36:9200"
index => "t_bj_index"
document_id => "%{id}"
}
}
stdout {
codec => json_lines
}
}
3. Start. The Logstash monitoring API listens on port 9600 by default.
You can create a start script lg.cmd by hand, with the contents:
chcp 65001
bin\logstash.bat -f ./mysql_rs_conf/mysql_rs.conf
4. Verify and access.
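Once the schedule has fired at least once, the sync can be verified from Spring Boot by counting the documents in the target index; a minimal hedged sketch (the RestHighLevelClient can be the bean defined in RsConfig, index name t_bj_index as above):
import java.io.IOException;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.client.core.CountRequest;
import org.elasticsearch.client.core.CountResponse;

public class SyncCheck {
    // returns how many rows Logstash has written into t_bj_index
    public static long countSynced(RestHighLevelClient client) throws IOException {
        CountRequest countRequest = new CountRequest("t_bj_index");
        CountResponse countResponse = client.count(countRequest, RequestOptions.DEFAULT);
        return countResponse.getCount();
    }
}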
kibana
Purpose: browse the synced data, run test queries, and monitor the cluster.
1. Download and install.
2. Configure it to point at ES (elasticsearch.hosts in config/kibana.yml).
3. Start. Default port 5601.
4. Verify and access. URL: 127.0.0.1:5601/app/integrations/browse
mysql
Purpose: intermediate staging store for the hot search; holds the crawled data.
Table DDL:
CREATE TABLE t_bj (
  id int(11) NOT NULL AUTO_INCREMENT,
  title varchar(1000) NOT NULL COMMENT 'article title',
  url varchar(1000) DEFAULT NULL COMMENT 'article URL',
  content longtext COMMENT 'article content',
  publishtime varchar(30) DEFAULT NULL COMMENT 'publish time',
  createtime datetime DEFAULT CURRENT_TIMESTAMP COMMENT 'creation time',
  PRIMARY KEY (id, title) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=492 DEFAULT CHARSET=utf8 COMMENT='healthcare';
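For completeness, the crawler only needs a plain JDBC insert to feed this table; a minimal hedged sketch (connection URL, user and password reuse the values from the Logstash config above, and the row values are placeholders):
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

public class TBjInsertDemo {
    public static void main(String[] args) throws Exception {
        String url = "jdbc:mysql://192.168.1.1:3306/dbBj?useUnicode=true&useSSL=false&characterEncoding=utf-8&serverTimezone=Asia/Shanghai";
        try (Connection conn = DriverManager.getConnection(url, "bj", "123456");
             PreparedStatement ps = conn.prepareStatement(
                     "insert into t_bj (title, url, content, publishtime) values (?, ?, ?, ?)")) {
            ps.setString(1, "sample title");
            ps.setString(2, "https://example.com/sample-article");
            ps.setString(3, "sample content");
            ps.setString(4, "2024-01-01 00:00:00");
            ps.executeUpdate(); // id and createtime come from the table defaults
        }
    }
}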