一:爬取京东数据
package com.esjd.Utils;
import lombok.SneakyThrows;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.MalformedURLException;
import java.net.URL;
/**
 * Demo that downloads a JD search result page with jsoup and prints the image URL,
 * price and title found in each list item of the product list.
 */
public class HtmlParseUtil {

    /**
     * Fetches the JD search page for the keyword {@code java} and prints what was scraped.
     * Requires network access.
     *
     * @param args unused
     */
    @SneakyThrows
    public static void main(String[] args) {
        String url = "https://search.jd.com/Search?keyword=java";

        // Parse the page; 30000 is the fetch timeout in milliseconds.
        Document document = Jsoup.parse(new URL(url), 30000);

        // The product list container. getElementById returns null when the id is absent,
        // so bail out instead of dereferencing null.
        Element goodsList = document.getElementById("J_goodsList");
        if (goodsList == null) {
            System.err.println("Element #J_goodsList not found in the response from " + url);
            return;
        }
        System.out.println(goodsList.html());

        // Only walk the <li> elements inside the product list, not every <li> on the page.
        Elements items = goodsList.getElementsByTag("li");
        for (Element item : items) {
            String img = item.getElementsByTag("img").eq(0).attr("data-lazy-img");
            String price = item.getElementsByClass("p-price").eq(0).text();
            String title = item.getElementsByClass("p-name").eq(0).text();
            System.out.println("______________________________________--");
            System.out.println(img);
            System.out.println(price);
            System.out.println(title);
        }
    }
}
封装成工具类
/**
 * Scrapes the JD search result page for the given keyword.
 *
 * <p>Requires network access. Per the original author's note, content that the page loads
 * via AJAX is not available to this approach.
 *
 * @param keyword search keyword; it is URL-encoded as UTF-8 before being placed in the query
 * @return one {@code Content} per {@code <li>} whose {@code class} attribute equals
 *         {@code gl-item} (ignoring case); empty when no such element exists
 */
@SneakyThrows
public List<Content> paresJD(String keyword) {
    String urlKeywords = URLEncoder.encode(keyword, "UTF-8");
    String url = "https://search.jd.com/Search?keyword=" + urlKeywords + "&enc=utf-8";

    // Parse the page; 30000 is the fetch timeout in milliseconds.
    Document document = Jsoup.parse(new URL(url), 30000);

    List<Content> goodsList = new ArrayList<>();
    for (Element item : document.getElementsByTag("li")) {
        // Skip list items that are not product entries.
        if (!item.attr("class").equalsIgnoreCase("gl-item")) {
            continue;
        }
        String img = item.getElementsByTag("img").eq(0).attr("data-lazy-img");
        String price = item.getElementsByClass("p-price").eq(0).text();
        String title = item.getElementsByClass("p-name").eq(0).text();

        Content content = new Content();
        content.setTitle(title);
        content.setPrice(price);
        content.setImg(img);
        goodsList.add(content);
    }
    return goodsList;
}
编写pojo类
/**
 * Plain data holder for one scraped product entry.
 * The Lombok annotations below generate the accessors and the all-args / no-args constructors.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class Content {
// Add further properties here as the business requires.
// Product title text.
private String title;
// Product image reference.
private String img;
// Product price, kept as the scraped text.
private String price;
}
解析数据到es中
@Autowired
// @Autowired cannot be used on its own: the enclosing object must be managed by the Spring container.
private RestHighLevelClient restHighLevelClient;
/**
 * Scrapes products for the given keywords and bulk-indexes them into the {@code jd_goods} index.
 *
 * @param keywords search keywords passed to {@code HtmlParseUtil.paresJD}
 * @return {@code true} when the bulk response reports no failures; {@code false} when nothing
 *         was scraped or at least one item failed to index
 * @throws IOException if the bulk call to Elasticsearch fails
 */
public Boolean parseContent(String keywords) throws IOException {
    List<Content> contents = new HtmlParseUtil().paresJD(keywords);
    if (contents == null || contents.isEmpty()) {
        // A BulkRequest without any action is rejected by request validation,
        // so report "nothing indexed" instead of sending it.
        return false;
    }

    BulkRequest bulkRequest = new BulkRequest();
    bulkRequest.timeout("2m");
    for (Content content : contents) {
        // Serialise once and reuse the string for both the log line and the request.
        String json = JSON.toJSONString(content);
        System.out.println(json);
        bulkRequest.add(new IndexRequest("jd_goods").source(json, XContentType.JSON));
    }

    BulkResponse bulk = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
    return !bulk.hasFailures();
}
// Closes the enclosing class, whose header is not part of this snippet.
}
对应的controller接口
// Injected by Spring; the request handler below delegates to it.
@Autowired
private ContentService contentService;
/**
 * Handles {@code GET /pares/{keyword}}: scrapes data for the keyword into Elasticsearch
 * by delegating to {@code contentService.parseContent}.
 *
 * @param keyword keyword taken from the URL path
 * @return the value returned by {@code contentService.parseContent}
 * @throws IOException propagated from the service call
 */
@GetMapping("/pares/{keyword}")
public Boolean pares(
        @PathVariable("keyword") String keyword) throws IOException {
    Boolean indexed = contentService.parseContent(keyword);
    return indexed;
}
二:前后端分离进行搜索实现
搜索实现和搜索高亮实现
新建前端模板进行请求接口编写
// Vue instance bound to #app: holds the search keyword and the list of results.
new Vue({
    el: "#app",
    data: {
        keyword: '',
        results: []
    },
    methods: {
        // Queries the highlighted-search endpoint (page 1, 10 hits) and binds the response
        // data to `results`. The non-highlighted paged search is served by
        // "/search/{keyword}/1/10" and returns data of the same shape.
        searchKey() {
            var keyword = this.keyword;
            console.log(keyword);
            // A blank keyword would produce the path "/HighlightBuilder//1/10"; skip the request.
            if (!keyword || !keyword.trim()) {
                return;
            }
            // Encode the keyword so characters such as "?", "#", spaces or non-ASCII text
            // cannot break the path segment.
            var path = "/HighlightBuilder/" + encodeURIComponent(keyword) + "/1/10";
            axios.get(path).then(response => {
                console.log(response);
                // Bind the returned data to the view.
                this.results = response.data;
            }).catch(error => {
                // Surface request failures instead of leaving an unhandled rejection.
                console.error(error);
            });
        }
    }
})
编写service层
/**
 * Runs a paged term query on the {@code title} field of the {@code jd_goods} index.
 *
 * @param keyword  term to match against {@code title}
 * @param pageNo   1-based page number; values below 1 are treated as 1
 * @param pageSize number of hits per page
 * @return the source map of every hit on the requested page
 * @throws IOException if the search call fails
 */
public List<Map<String, Object>> searchPage(String keyword, int pageNo, int pageSize) throws IOException {
    int page = Math.max(pageNo, 1);

    SearchRequest searchRequest = new SearchRequest("jd_goods");
    SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();

    // Paging: "from" is the offset of the first hit, not a page number, so page 1 starts at 0.
    sourceBuilder.from((page - 1) * pageSize);
    sourceBuilder.size(pageSize);

    // Exact term match on the title field.
    TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyword);
    sourceBuilder.query(termQueryBuilder);
    sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));

    // Execute the search.
    searchRequest.source(sourceBuilder);
    SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);

    // Collect the source of every hit.
    List<Map<String, Object>> list = new ArrayList<>();
    for (SearchHit hit : searchResponse.getHits().getHits()) {
        list.add(hit.getSourceAsMap());
    }
    return list;
}
/**
 * Runs a paged term query on the {@code title} field of the {@code jd_goods} index and
 * replaces each hit's {@code title} with its highlighted version when one is returned.
 *
 * @param keyword  term to match against {@code title}
 * @param pageNo   1-based page number; values below 1 are treated as 1
 * @param pageSize number of hits per page
 * @return the source map of every hit on the requested page, with {@code title} highlighted
 *         where a highlight fragment exists
 * @throws IOException if the search call fails
 */
public List<Map<String, Object>> searchHighlightBuilder(String keyword, int pageNo, int pageSize) throws IOException {
    int page = Math.max(pageNo, 1);

    SearchRequest searchRequest = new SearchRequest("jd_goods");
    SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();

    // Paging: "from" is the offset of the first hit, not a page number, so page 1 starts at 0.
    sourceBuilder.from((page - 1) * pageSize);
    sourceBuilder.size(pageSize);

    // Exact term match on the title field.
    TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyword);
    sourceBuilder.query(termQueryBuilder);
    sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));

    // Highlight matches in the title, wrapped in a red <span>.
    HighlightBuilder highlightBuilder = new HighlightBuilder();
    highlightBuilder.field("title");
    highlightBuilder.preTags("<span style='color:red'>");
    highlightBuilder.postTags("</span>");
    sourceBuilder.highlighter(highlightBuilder);

    // Execute the search.
    searchRequest.source(sourceBuilder);
    SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);

    List<Map<String, Object>> list = new ArrayList<>();
    for (SearchHit hit : searchResponse.getHits().getHits()) {
        Map<String, Object> sourceAsMap = hit.getSourceAsMap();

        // Swap the plain title for the concatenated highlight fragments, if any were returned.
        HighlightField title = hit.getHighlightFields().get("title");
        if (title != null) {
            StringBuilder highlightedTitle = new StringBuilder();
            for (Text fragment : title.fragments()) {
                highlightedTitle.append(fragment);
            }
            sourceAsMap.put("title", highlightedTitle.toString());
        }
        list.add(sourceAsMap);
    }
    return list;
}
实现的接口controller
/**
 * Handles {@code GET /search/{keyword}/{pageNo}/{pageSize}}: paged search that delegates
 * to {@code contentService.searchPage}.
 *
 * @param keyword  keyword taken from the URL path
 * @param pageNo   page number taken from the URL path
 * @param pageSize page size taken from the URL path
 * @return the list returned by {@code contentService.searchPage}
 * @throws IOException propagated from the service call
 */
@GetMapping("/search/{keyword}/{pageNo}/{pageSize}")
public List<Map<String, Object>> search(
        @PathVariable("keyword") String keyword,
        @PathVariable("pageNo") int pageNo,
        @PathVariable("pageSize") int pageSize) throws IOException {
    List<Map<String, Object>> hits = contentService.searchPage(keyword, pageNo, pageSize);
    return hits;
}
/**
 * Handles {@code GET HighlightBuilder/{keyword}/{pageNo}/{pageSize}}: highlighted search
 * that delegates to {@code contentService.searchHighlightBuilder}.
 *
 * @param keyword  keyword taken from the URL path
 * @param pageNo   page number taken from the URL path
 * @param pageSize page size taken from the URL path
 * @return the list returned by {@code contentService.searchHighlightBuilder}
 * @throws IOException propagated from the service call
 */
@GetMapping("HighlightBuilder/{keyword}/{pageNo}/{pageSize}")
public List<Map<String, Object>> searchHighlightBuilder(
        @PathVariable("keyword") String keyword,
        @PathVariable("pageNo") int pageNo,
        @PathVariable("pageSize") int pageSize) throws IOException {
    List<Map<String, Object>> hits = contentService.searchHighlightBuilder(keyword, pageNo, pageSize);
    return hits;
}
null){
Text[] fragments = title.fragments();
StringBuilder n_title = new StringBuilder();
for (Text text : fragments) {
n_title.append(text);
}
sourceAsMap.put("title", n_title.toString());
}
// if (title!= null){
// Text[] fragments = title.fragments();
// String n_title = "";
// for (Text text : fragments) {
// n_title+= text;
// }
// sourceAsMap.put("title",n_title);
//
//
// }
//把所有结果遍历出来然后封装到list集合里面
list.add(sourceAsMap);
}
return list;
}
#### 实现的接口controller
/**
 * Handles {@code GET /search/{keyword}/{pageNo}/{pageSize}}: paged search that delegates
 * to {@code contentService.searchPage}.
 *
 * @param keyword  keyword taken from the URL path
 * @param pageNo   page number taken from the URL path
 * @param pageSize page size taken from the URL path
 * @return the list returned by {@code contentService.searchPage}
 * @throws IOException propagated from the service call
 */
@GetMapping("/search/{keyword}/{pageNo}/{pageSize}")
public List<Map<String, Object>> search(
        @PathVariable("keyword") String keyword,
        @PathVariable("pageNo") int pageNo,
        @PathVariable("pageSize") int pageSize) throws IOException {
    List<Map<String, Object>> hits = contentService.searchPage(keyword, pageNo, pageSize);
    return hits;
}
/**
 * Handles {@code GET HighlightBuilder/{keyword}/{pageNo}/{pageSize}}: highlighted search
 * that delegates to {@code contentService.searchHighlightBuilder}.
 *
 * @param keyword  keyword taken from the URL path
 * @param pageNo   page number taken from the URL path
 * @param pageSize page size taken from the URL path
 * @return the list returned by {@code contentService.searchHighlightBuilder}
 * @throws IOException propagated from the service call
 */
@GetMapping("HighlightBuilder/{keyword}/{pageNo}/{pageSize}")
public List<Map<String, Object>> searchHighlightBuilder(
        @PathVariable("keyword") String keyword,
        @PathVariable("pageNo") int pageNo,
        @PathVariable("pageSize") int pageSize) throws IOException {
    List<Map<String, Object>> hits = contentService.searchHighlightBuilder(keyword, pageNo, pageSize);
    return hits;
}