Elasticsearch 仿京东搜索

一:爬取京东数据

复制代码
package com.esjd.Utils;

import lombok.SneakyThrows;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.net.MalformedURLException;
import java.net.URL;

/**
 * Demo that scrapes one JD.com search-result page with jsoup and prints the image
 * URL, price and title of every product to standard output.
 */
public class HtmlParseUtil {
    /**
     * Fetches the JD search page for the keyword "java" and prints each product found.
     *
     * <p>Requires network access. Checked exceptions from the fetch are rethrown
     * unchecked by {@code @SneakyThrows}.
     *
     * @param args unused
     */
    @SneakyThrows
    public static void main(String[] args)   {
        String url = "https://search.jd.com/Search?keyword=java";

        // Fetch and parse the page (30 s timeout); the returned Document mirrors the
        // browser's DOM document object.
        Document document = Jsoup.parse(new URL(url),30000);

        // The product list container. It is absent when the site serves a different
        // page than expected, so guard before dereferencing it.
        Element goodsList = document.getElementById("J_goodsList");
        if (goodsList == null) {
            System.err.println("Element #J_goodsList not found in " + url);
            return;
        }
        System.out.println(goodsList.html());

        // Only <li class="gl-item"> entries inside the container are products; other
        // <li> elements on the page (navigation, thumbnails) carry no product data.
        for (Element item : goodsList.getElementsByTag("li")) {
            if (!item.hasClass("gl-item")) {
                continue;
            }
            // Images are lazy-loaded, so the real URL is in data-lazy-img rather than src.
            String img = item.getElementsByTag("img").eq(0).attr("data-lazy-img");
            String price = item.getElementsByClass("p-price").eq(0).text();
            String title = item.getElementsByClass("p-name").eq(0).text();
            System.out.println("______________________________________--");
            System.out.println(img);
            System.out.println(price);
            System.out.println(title);
        }
    }
}
封装成工具类
复制代码
/**
 * Scrapes the JD.com search-result page for {@code keyword} and returns the products
 * found on it.
 *
 * <p>Requires network access. Only data present in the served HTML is visible;
 * content loaded later via AJAX is not. Checked exceptions are rethrown unchecked by
 * {@code @SneakyThrows}.
 *
 * @param keyword search keyword; it is URL-encoded as UTF-8 before being sent
 * @return one {@code Content} per {@code <li>} element carrying the {@code gl-item}
 *         class, in document order; empty if none are found
 */
@SneakyThrows
public List<Content> paresJD(String keyword){
    String urlKeywords = URLEncoder.encode(keyword, "UTF-8");
    String url ="https://search.jd.com/Search?keyword=" + urlKeywords + "&enc=utf-8";

    // Fetch and parse the page (30 s timeout); the returned Document mirrors the
    // browser's DOM document object.
    Document document = Jsoup.parse(new URL(url),30000);

    List<Content> goodsList = new ArrayList<>();
    for (Element item : document.getElementsByTag("li")) {
        // hasClass matches the class token (case-insensitively) even when the element
        // carries additional classes, which an exact attribute comparison would miss.
        if (!item.hasClass("gl-item")) {
            continue;
        }
        // Images are lazy-loaded, so the real URL is in data-lazy-img rather than src.
        String img = item.getElementsByTag("img").eq(0).attr("data-lazy-img");
        String price = item.getElementsByClass("p-price").eq(0).text();
        String title = item.getElementsByClass("p-name").eq(0).text();

        Content content = new Content();
        content.setTitle(title);
        content.setPrice(price);
        content.setImg(img);
        goodsList.add(content);
    }
    return goodsList;
}
编写 POJO 类
复制代码
/**
 * Data holder for one product: its title, image URL and price, all kept as strings.
 * Lombok generates the accessors plus an all-args and a no-args constructor; the
 * all-args constructor takes the fields in declaration order.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor

public class Content {
    // Add further attributes here as business requirements dictate.
    private  String title;
    private String img;
    private String price;

}
解析数据并写入 Elasticsearch
复制代码
  @Autowired
//  @Autowired cannot be used on its own: the enclosing class must be managed by the Spring container.
  private RestHighLevelClient restHighLevelClient;


  /**
   * Scrapes JD search results for {@code keywords} and bulk-indexes them into the
   * {@code jd_goods} index.
   *
   * @param keywords search keyword handed to {@code HtmlParseUtil.paresJD}
   * @return {@code true} if every scraped document was indexed without failure;
   *         {@code false} if nothing was scraped or any bulk item failed
   * @throws IOException if the bulk request cannot be executed
   */
  public Boolean  parseContent(String keywords) throws IOException {
      List<Content> contents = new HtmlParseUtil().paresJD(keywords);
      if (contents == null || contents.isEmpty()) {
          // A BulkRequest with no actions fails validation ("no requests added"),
          // so report "nothing indexed" instead of sending it.
          return false;
      }

      BulkRequest bulkRequest = new BulkRequest();
      bulkRequest.timeout("2m");
      for (Content content : contents) {
          // Serialize once and reuse the JSON for both the log line and the request.
          String json = JSON.toJSONString(content);
          System.out.println(json);
          bulkRequest.add(new IndexRequest("jd_goods").source(json, XContentType.JSON));
      }

      BulkResponse bulk = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
      // Success means no item in the bulk response reported a failure.
      return  !bulk.hasFailures();

  }} // the second brace closes the enclosing class, whose header is not shown in this snippet
对应的controller接口
复制代码
// Service the controller endpoints delegate to; injected by Spring.
@Autowired
private  ContentService contentService;


/**
 * Scrapes products for the given keyword and stores them in Elasticsearch.
 *
 * @param keyword search keyword taken from the request path
 * @return the result reported by {@code contentService.parseContent}
 * @throws IOException propagated from the service call
 */
@GetMapping("/pares/{keyword}")
public Boolean pares(@PathVariable("keyword") String  keyword) throws IOException{
    Boolean indexed = contentService.parseContent(keyword);
    return indexed;
}
二:前后端分离进行搜索实现

搜索实现和搜索高亮实现

新建前端模板,编写调用搜索接口的请求代码

复制代码
// Root Vue instance for the search page: holds the typed keyword and the hits
// returned by the backend.
new Vue({
    el: "#app",
    data: {
        keyword: '',
        results: []
    },
    methods: {
        // Requests page 1 (10 hits) of highlighted results for the current keyword
        // and binds them to `results`.
        searchKey() {
            var keyword = this.keyword;
            console.log(keyword);

            // An empty keyword would produce a path with an empty segment that
            // matches no backend route, so there is nothing to request.
            if (!keyword) {
                return;
            }

            // The keyword is a URL path segment: encode it so spaces, '/', '?', '#'
            // and non-ASCII text cannot break or alter the path.
            var encodedKeyword = encodeURIComponent(keyword);

            // For plain paged search without highlighting, request
            // "/search/" + encodedKeyword + "/1/10" instead.
            axios.get("/HighlightBuilder/" + encodedKeyword + "/1/10").then(response => {
                console.log(response);
                // Bind the returned hits to the view.
                this.results = response.data;
            }).catch(error => {
                // Surface failed requests instead of leaving the rejection unhandled.
                console.error(error);
            })
        }
    }
})
编写service层
复制代码
  //2. 获取这些数据实现搜索功能
    /**
     * Runs a paged, exact-term query against the {@code title} field of the
     * {@code jd_goods} index.
     *
     * @param keyword  term to match in {@code title}
     * @param pageNo   1-based page number; values below 1 are treated as 1
     * @param pageSize number of hits per page
     * @return the source map of each hit on the requested page, in response order
     * @throws IOException if the search request cannot be executed
     */
    public List<Map<String ,Object>> searchPage(String keyword ,int  pageNo,int pageSize) throws IOException {
        if (pageNo < 1) {
            pageNo = 1;
        }

        SearchRequest searchRequest = new SearchRequest("jd_goods");
        SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();

        // Paging: "from" is a zero-based hit offset, not a page number, so page 1
        // must start at offset 0 and each further page advances by pageSize.
        sourceBuilder.from((pageNo - 1) * pageSize);
        sourceBuilder.size(pageSize);

        // Exact term match: the keyword is compared as-is, without analysis.
        TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyword);
        sourceBuilder.query(termQueryBuilder);
        sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));

        // Execute the search.
        searchRequest.source(sourceBuilder);
        SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);

        // Collect the source document of every hit.
        List<Map<String,Object>> list = new ArrayList<>();
        for (SearchHit hit : searchResponse.getHits().getHits()) {
            list.add(hit.getSourceAsMap());
        }
        return list;
    }

    /**
     * Runs a paged, exact-term query against the {@code title} field of the
     * {@code jd_goods} index and returns the hits with the matched part of the title
     * wrapped in a red {@code <span>}.
     *
     * @param keyword  term to match in {@code title}
     * @param pageNo   1-based page number; values below 1 are treated as 1
     * @param pageSize number of hits per page
     * @return the source map of each hit on the requested page; when the response
     *         carries a highlight for {@code title}, the map's {@code title} entry is
     *         replaced by the concatenated highlight fragments
     * @throws IOException if the search request cannot be executed
     */
    public List<Map<String ,Object>> searchHighlightBuilder(String keyword ,int  pageNo,int pageSize) throws IOException {
        if (pageNo < 1) {
            pageNo = 1;
        }

        SearchRequest searchRequest = new SearchRequest("jd_goods");
        SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();

        // Paging: "from" is a zero-based hit offset, not a page number, so page 1
        // must start at offset 0 and each further page advances by pageSize.
        sourceBuilder.from((pageNo - 1) * pageSize);
        sourceBuilder.size(pageSize);

        // Exact term match: the keyword is compared as-is, without analysis.
        TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyword);
        sourceBuilder.query(termQueryBuilder);
        sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));

        // Highlight matches in the title, wrapping each one in a red <span>.
        HighlightBuilder highlightBuilder = new HighlightBuilder();
        highlightBuilder.field("title");
        highlightBuilder.preTags("<span style='color:red'>");
        highlightBuilder.postTags("</span>");
        sourceBuilder.highlighter(highlightBuilder);

        // Execute the search.
        searchRequest.source(sourceBuilder);
        SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);

        List<Map<String,Object>> list = new ArrayList<>();
        for (SearchHit hit : searchResponse.getHits().getHits()) {
            // The stored document as returned by Elasticsearch.
            Map<String, Object> sourceAsMap = hit.getSourceAsMap();

            // Replace the plain title with its highlighted form when one is present.
            HighlightField title = hit.getHighlightFields().get("title");
            if (title != null) {
                StringBuilder highlightedTitle = new StringBuilder();
                for (Text fragment : title.fragments()) {
                    highlightedTitle.append(fragment);
                }
                sourceAsMap.put("title", highlightedTitle.toString());
            }

            list.add(sourceAsMap);
        }
        return list;
    }
对应的 controller 接口
复制代码
/**
 * Paged search endpoint.
 *
 * @param keyword  search keyword taken from the request path
 * @param pageNo   page number taken from the request path
 * @param pageSize page size taken from the request path
 * @return the hits returned by {@code contentService.searchPage}
 * @throws IOException propagated from the service call
 */
@GetMapping("/search/{keyword}/{pageNo}/{pageSize}")
public List<Map<String,Object>> search(@PathVariable("keyword") String keyword,
                                       @PathVariable("pageNo") int pageNo,
                                       @PathVariable("pageSize")int pageSize) throws IOException {
    List<Map<String,Object>> hits = contentService.searchPage(keyword, pageNo, pageSize);
    return hits;
}

/**
 * Paged search endpoint that returns hits with highlighting applied.
 *
 * @param keyword  search keyword taken from the request path
 * @param pageNo   page number taken from the request path
 * @param pageSize page size taken from the request path
 * @return the hits returned by {@code contentService.searchHighlightBuilder}
 * @throws IOException propagated from the service call
 */
@GetMapping("HighlightBuilder/{keyword}/{pageNo}/{pageSize}")
public List<Map<String,Object>> searchHighlightBuilder(@PathVariable("keyword") String keyword,
                                       @PathVariable("pageNo") int pageNo,
                                       @PathVariable("pageSize")int pageSize) throws IOException {
    List<Map<String,Object>> highlightedHits = contentService.searchHighlightBuilder(keyword, pageNo, pageSize);
    return highlightedHits;
}

null){

Text[] fragments = title.fragments();

StringBuilder n_title = new StringBuilder();

for (Text text : fragments) {

n_title.append(text);

}

sourceAsMap.put("title", n_title.toString());

复制代码
        }

// if (title!= null){

// Text[] fragments = title.fragments();

// String n_title = "";

// for (Text text : fragments) {

// n_title+= text;

// }

// sourceAsMap.put("title",n_title);

//

//

// }

复制代码
        //把所有结果遍历出来然后封装到list集合里面
        list.add(sourceAsMap);

    }


    return  list;
}


#### 实现的接口controller

//对数据进行分页

@GetMapping("/search/{keyword}/{pageNo}/{pageSize}")

public List<Map<String,Object>> search(@PathVariable("keyword") String keyword,

@PathVariable("pageNo") int pageNo,

@PathVariable("pageSize")int pageSize) throws IOException {

复制代码
return  contentService.searchPage(keyword, pageNo, pageSize);

}

//高亮

@GetMapping("HighlightBuilder/{keyword}/{pageNo}/{pageSize}")

public List<Map<String,Object>> searchHighlightBuilder(@PathVariable("keyword") String keyword,

@PathVariable("pageNo") int pageNo,

@PathVariable("pageSize")int pageSize) throws IOException {

复制代码
return  contentService.searchHighlightBuilder(keyword, pageNo, pageSize);

}

复制代码
相关推荐
MacroZheng11 小时前
横空出世!MyBatis-Plus 同款 ES ORM 框架,用起来够优雅!
java·后端·elasticsearch
武子康11 小时前
大数据-100 Spark DStream 转换操作全面总结:map、reduceByKey 到 transform 的实战案例
大数据·后端·spark
expect7g12 小时前
Flink KeySelector
大数据·后端·flink
极限实验室1 天前
搜索百科(1):Lucene —— 打开现代搜索世界的第一扇门
搜索引擎·lucene
阿里云大数据AI技术1 天前
StarRocks 助力数禾科技构建实时数仓:从数据孤岛到智能决策
大数据
Lx3521 天前
Hadoop数据处理优化:减少Shuffle阶段的性能损耗
大数据·hadoop
Elasticsearch1 天前
平衡尺度:利用权重使倒数排序融合 (RRF) 更加智能
elasticsearch
武子康2 天前
大数据-99 Spark Streaming 数据源全面总结:原理、应用 文件流、Socket、RDD队列流
大数据·后端·spark
阿里云大数据AI技术2 天前
大数据公有云市场第一,阿里云占比47%!
大数据
Lx3522 天前
Hadoop容错机制深度解析:保障作业稳定运行
大数据·hadoop