jsoup:https://jsoup.org/
依赖
xml
<!-- HTML page parsing (jsoup) -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
<!-- JSON serialization (fastjson), used to turn objects into JSON document sources -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.62</version>
</dependency>
<!-- Spring Boot starter for Elasticsearch.
     NOTE(review): Spring Boot starters are normally versioned with Spring Boot itself, while 7.6.2
     looks like an Elasticsearch release number. Confirm that this coordinate resolves; otherwise let
     the Spring Boot parent manage the starter version and pin the client through an
     elasticsearch.version property instead. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-elasticsearch</artifactId>
<version>7.6.2</version>
</dependency>
配置类
java
/**
 * Spring configuration that registers the Elasticsearch high-level REST client as a bean.
 */
@Configuration
public class ElasticsearchClientConfig {

    /**
     * Creates a client that talks to a single node at 127.0.0.1:9200 over plain HTTP.
     *
     * @return the high-level REST client bean, registered under the name {@code restHighLevelClient}
     */
    @Bean
    public RestHighLevelClient restHighLevelClient() {
        // For a cluster, pass additional HttpHost entries to RestClient.builder(...).
        HttpHost localNode = new HttpHost("127.0.0.1", 9200, "http");
        return new RestHighLevelClient(RestClient.builder(localNode));
    }
}
API 使用
java
/**
 * Walk-through of the Elasticsearch high-level REST client: index management, single-document
 * CRUD, bulk insertion and a term search. Each test prints the server response to standard output.
 */
@SpringBootTest
class ApplicationTests {

    /** Name of the index every example operates on. */
    private static final String INDEX_NAME = "springboot_01";

    // Injected by the id of the bean registered in the container.
    @Autowired
    @Qualifier("restHighLevelClient")
    private RestHighLevelClient esClient;

    /** Serializes a {@code User} with the given fields into the JSON used as document source. */
    private static String userJson(String name, int age) {
        return JSON.toJSONString(new User(name, age));
    }

    /** Creates the index and prints the response. */
    @Test
    void createIndex() throws IOException {
        CreateIndexRequest request = new CreateIndexRequest(INDEX_NAME);
        // The client executes the request and hands back the response.
        CreateIndexResponse response = esClient.indices().create(request, RequestOptions.DEFAULT);
        System.out.println(response);
    }

    /** Prints whether the index exists, then fetches and prints it. */
    @Test
    void getIndex() throws IOException {
        GetIndexRequest request = new GetIndexRequest(INDEX_NAME);
        // Existence check first.
        System.out.println(esClient.indices().exists(request, RequestOptions.DEFAULT));
        System.out.println(esClient.indices().get(request, RequestOptions.DEFAULT));
    }

    /** Deletes the index and prints whether the deletion was acknowledged. */
    @Test
    void deleteIndex() throws IOException {
        AcknowledgedResponse response =
                esClient.indices().delete(new DeleteIndexRequest(INDEX_NAME), RequestOptions.DEFAULT);
        boolean acknowledged = response.isAcknowledged();
        System.out.println(acknowledged);
    }

    /** Runs index, update, get and delete against the document with id "1". */
    @Test
    void testDocument() throws IOException {
        // Add a document with an explicit id, a 1s timeout and a JSON body.
        IndexRequest addRequest = new IndexRequest(INDEX_NAME)
                .id("1")
                .timeout("1s")
                .source(userJson("zhangsan", 1), XContentType.JSON);
        IndexResponse added = esClient.index(addRequest, RequestOptions.DEFAULT);
        System.out.println(added.toString());

        // Update the document.
        UpdateRequest changeRequest = new UpdateRequest(INDEX_NAME, "1")
                .doc(userJson("zhangsan", 3), XContentType.JSON);
        UpdateResponse changed = esClient.update(changeRequest, RequestOptions.DEFAULT);
        System.out.println(changed);

        // Check that the document exists, then fetch it.
        GetRequest fetchRequest = new GetRequest(INDEX_NAME, "1");
        System.out.println(esClient.exists(fetchRequest, RequestOptions.DEFAULT));
        System.out.println(esClient.get(fetchRequest, RequestOptions.DEFAULT));

        // Delete the document.
        DeleteResponse removed =
                esClient.delete(new DeleteRequest(INDEX_NAME, "1"), RequestOptions.DEFAULT);
        System.out.println(removed);
    }

    /** Bulk-inserts five documents with ids "1".."5" and prints whether any item failed. */
    @Test
    void addDocumentBatch() throws IOException {
        BulkRequest batch = new BulkRequest();
        batch.timeout("10s");
        for (int n = 1; n <= 5; n++) {
            // Without an explicit id a random one would be generated.
            IndexRequest item = new IndexRequest(INDEX_NAME)
                    .id(String.valueOf(n))
                    .source(userJson("zhangsan", n), XContentType.JSON);
            batch.add(item);
        }
        BulkResponse outcome = esClient.bulk(batch, RequestOptions.DEFAULT);
        // true when at least one item failed
        System.out.println(outcome.hasFailures());
    }

    /** Runs a term query on {@code name} and prints the hits. */
    @Test
    void searchDocument() throws IOException {
        // Search conditions: exact (term) match with a 60s timeout.
        // QueryBuilders.matchAllQuery() would match every document instead.
        SearchSourceBuilder conditions = new SearchSourceBuilder()
                .query(QueryBuilders.termQuery("name", "zhangsan"))
                .timeout(new TimeValue(60, TimeUnit.SECONDS));
        // Attach the conditions to the request.
        SearchRequest request = new SearchRequest(INDEX_NAME).source(conditions);

        SearchResponse response = esClient.search(request, RequestOptions.DEFAULT);
        System.out.println(JSON.toJSONString(response.getHits()));

        SearchHit[] hits = response.getHits().getHits();
        for (int i = 0; i < hits.length; i++) {
            System.out.println(hits[i].getSourceAsMap());
        }
    }
}
应用
java
/**
 * Scrapes Bing result links into an Elasticsearch index and serves highlighted term searches
 * over the stored entries.
 */
@Service
public class IndexService {

    /** Index that stores the scraped entries. */
    private static final String INDEX_NAME = "springboot_01";

    @Autowired
    private RestHighLevelClient restHighLevelClient;

    /**
     * Fetches the Bing result page for {@code keyword} and bulk-indexes the first link of every
     * result item as the {@code name} of a {@code User} document.
     *
     * @param keyword search phrase; it is URL-encoded before being placed in the query string
     * @return {@code true} when at least one link was indexed and the bulk call reported no failures;
     *         {@code false} when the keyword is blank, the page has no result list, no link was
     *         found, or any bulk item failed
     * @throws IOException if fetching the page or talking to Elasticsearch fails
     */
    public Boolean parse(String keyword) throws IOException {
        if (keyword == null || keyword.trim().isEmpty()) {
            return false;
        }

        // Encode the keyword so spaces, '&', '#', non-ASCII text etc. cannot break or alter the URL.
        String encodedKeyword = java.net.URLEncoder.encode(keyword, "UTF-8");
        // Parse the page with jsoup; the Document mirrors the browser's document object.
        Document document =
                Jsoup.parse(new URL("https://cn.bing.com/search?q=" + encodedKeyword), 30000);

        Element results = document.getElementById("b_results");
        if (results == null) {
            // No result list on the page (layout change, block page, ...): nothing to index.
            return false;
        }

        // Collect everything into a single bulk request.
        BulkRequest bulkRequest = new BulkRequest();
        bulkRequest.timeout("60s");
        for (Element item : results.getElementsByTag("li")) {
            String href = item.getElementsByTag("a").eq(0).attr("href");
            if (href.isEmpty()) {
                // List items without a link would only produce documents with an empty name.
                continue;
            }
            bulkRequest.add(new IndexRequest(INDEX_NAME)
                    .source(JSON.toJSONString(new User(href, 0)), XContentType.JSON));
        }

        if (bulkRequest.numberOfActions() == 0) {
            // The client rejects an empty bulk request with a validation error, so stop here.
            return false;
        }

        BulkResponse bulkResponse = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
        return !bulkResponse.hasFailures();
    }

    /**
     * Runs a term query on the {@code name} field and returns each hit's source map, with
     * {@code name} replaced by its highlighted form when a highlight is available.
     *
     * <p>NOTE(review): the returned {@code name} contains raw {@code <span>} markup around the
     * match and the stored text is not HTML-escaped here; confirm how callers render it.
     *
     * @param keyword exact term to look for in {@code name}
     * @return one source map per hit, in the order returned by Elasticsearch; empty when nothing matches
     * @throws IOException if the search request fails
     */
    public List<Map<String, Object>> searchES(String keyword) throws IOException {
        // Search conditions: exact (term) match with a 60s timeout.
        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
        searchSourceBuilder.query(QueryBuilders.termQuery("name", keyword));
        searchSourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));

        // Highlight matches in "name" by wrapping them in a green span.
        HighlightBuilder highlightBuilder = new HighlightBuilder();
        highlightBuilder.field("name");
        highlightBuilder.preTags("<span style='color:green'>");
        highlightBuilder.postTags("</span>");
        searchSourceBuilder.highlighter(highlightBuilder);

        SearchRequest searchRequest = new SearchRequest(INDEX_NAME);
        searchRequest.source(searchSourceBuilder);

        // Execute the search.
        SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);

        List<Map<String, Object>> list = new ArrayList<>();
        for (SearchHit hit : searchResponse.getHits().getHits()) {
            Map<String, Object> sourceAsMap = hit.getSourceAsMap();
            HighlightField highlightField = hit.getHighlightFields().get("name");
            if (highlightField != null) {
                // Replace the original value with the concatenated highlighted fragments.
                StringBuilder highlighted = new StringBuilder();
                for (Text fragment : highlightField.fragments()) {
                    highlighted.append(fragment);
                }
                sourceAsMap.put("name", highlighted.toString());
            }
            list.add(sourceAsMap);
        }
        return list;
    }
}