当前位置:   article > 正文

ElasticSearch篇——实现京东搜索(含爬虫操作/解析数据到ES/精准查询/对接前端/高亮查询)一篇文章带你速通~~~~~~~~_elasticsearch加爬虫搜集

elasticsearch加爬虫搜集

话不多说,兄弟们坐好。马上开冲!

爬取解析数据

1、项目导入jsoup依赖

<!-- Jsoup: HTML parser used to fetch and scrape the JD search result page -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.15.4</version>
</dependency>

2、编写工具类解析网页

  1. package com.guohui.util;
  2. import com.guohui.po.Content;
  3. import org.jsoup.Jsoup;
  4. import org.jsoup.nodes.Document;
  5. import org.jsoup.nodes.Element;
  6. import org.jsoup.select.Elements;
  7. import org.springframework.stereotype.Component;
  8. import java.net.URL;
  9. import java.util.ArrayList;
  10. import java.util.List;
  11. @Component
  12. public class HtmlParseUtil {
  13. public static void main(String[] args) throws Exception {
  14. new HtmlParseUtil().parseJD("java").forEach(System.out::println);
  15. }
  16. //抽成方法
  17. public List<Content> parseJD(String keywords) throws Exception {
  18. //获取请求:这里注意ajax异步请求是获取不到的
  19. String url = "https://search.jd.com/Search?keyword=" + keywords;
  20. //解析网页(Jsoup返回Document就是浏览器Document对象)
  21. Document document = Jsoup.parse(new URL(url), 30000);
  22. //所有你在js中可以使用的方法,这里都能用
  23. Element element = document.getElementById("J_goodsList");
  24. //获取所有的li元素
  25. Elements elements = element.getElementsByTag("li");
  26. ArrayList<Content> goodsList = new ArrayList<>();
  27. //获取元素中的内容,这里el 就是每一个li标签了
  28. for (Element el : elements) {
  29. //关于这种图片特别多的网站,所有的图片都是延迟加载的
  30. String img = el.getElementsByTag("img").eq(0).attr("data-lazy-img");
  31. String price = el.getElementsByClass("p-price").eq(0).text();
  32. String title = el.getElementsByClass("p-name").eq(0).text();
  33. Content content = new Content();
  34. content.setTitle(title);
  35. content.setImg(img);
  36. content.setPrice(price);
  37. goodsList.add(content);
  38. }
  39. return goodsList;
  40. }
  41. }

编写业务层

业务层代码希望大家能够认真阅读!

  1. import com.alibaba.fastjson2.JSON;
  2. import com.guohui.po.Content;
  3. import com.guohui.service.ContentService;
  4. import com.guohui.util.HtmlParseUtil;
  5. import lombok.extern.slf4j.Slf4j;
  6. import org.elasticsearch.action.bulk.BulkRequest;
  7. import org.elasticsearch.action.bulk.BulkResponse;
  8. import org.elasticsearch.action.index.IndexRequest;
  9. import org.elasticsearch.action.search.SearchRequest;
  10. import org.elasticsearch.action.search.SearchResponse;
  11. import org.elasticsearch.client.RequestOptions;
  12. import org.elasticsearch.client.RestHighLevelClient;
  13. import org.elasticsearch.common.text.Text;
  14. import org.elasticsearch.common.unit.TimeValue;
  15. import org.elasticsearch.common.xcontent.XContentType;
  16. import org.elasticsearch.index.query.QueryBuilders;
  17. import org.elasticsearch.index.query.TermQueryBuilder;
  18. import org.elasticsearch.search.SearchHit;
  19. import org.elasticsearch.search.builder.SearchSourceBuilder;
  20. import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
  21. import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
  22. import org.springframework.beans.factory.annotation.Autowired;
  23. import org.springframework.stereotype.Service;
  24. import java.io.IOException;
  25. import java.util.ArrayList;
  26. import java.util.List;
  27. import java.util.Map;
  28. import java.util.concurrent.TimeUnit;
  29. @Service
  30. @Slf4j
  31. public class ContentServiceImpl implements ContentService {
  32. @Autowired
  33. private HtmlParseUtil htmlParseUtil;
  34. @Autowired
  35. private RestHighLevelClient restHighLevelClient;
  36. /**
  37. * @Author xgh
  38. * @Description 解析数据放入ES
  39. * @Date 2023/7/23 16:09
  40. * @Return
  41. **/
  42. @Override
  43. public boolean parseContent(String keyWorld) throws Exception {
  44. //调用封装的解析页面的工具类,获得页面上的li集合
  45. List<Content> contents = htmlParseUtil.parseJD(keyWorld);
  46. log.info("解析的数据为:{}", contents.toString());
  47. //批量添加文档
  48. BulkRequest bulkRequest = new BulkRequest();
  49. for (int i = 0; i < contents.size(); i++) {
  50. log.info("数据为:{}", contents.get(i));
  51. bulkRequest.add(new IndexRequest("jd_goods")
  52. .source(JSON.toJSONString(contents.get(i)), XContentType.JSON));
  53. }
  54. //批量执行请求
  55. BulkResponse responses = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
  56. return !responses.hasFailures();
  57. }
  58. /**
  59. * @Author xgh
  60. * @Description 获取数据后搜索数据
  61. * @Date 2023/7/23 16:39
  62. * @Return
  63. **/
  64. @Override
  65. public List<Map<String, Object>> searchData(String keyWord, int pageNo, int pageSize) throws IOException {
  66. //分页的判断
  67. if (pageNo <= 1) {
  68. pageNo = 1;
  69. }
  70. //创建查询的请求
  71. SearchRequest searchRequest = new SearchRequest("jd_goods");
  72. //构建查询条件
  73. SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
  74. //通过工具来构建构建查询条件
  75. TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyWord);
  76. //调用query方法实现精准查询
  77. searchSourceBuilder.query(termQueryBuilder);
  78. //分页
  79. searchSourceBuilder.from(pageNo);
  80. searchSourceBuilder.size(pageSize);
  81. //设置查询的超时时间
  82. searchSourceBuilder.timeout(new TimeValue(66, TimeUnit.SECONDS));
  83. //封装搜索
  84. searchRequest.source(searchSourceBuilder);
  85. //调用客户端来查询
  86. SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
  87. //解析查询的结果
  88. List<Map<String, Object>> mapList = new ArrayList<>();
  89. for (SearchHit documentFields : searchResponse.getHits().getHits()) {
  90. //调用getSourceAsMap方法将查询出的结果转换成map,然后将map添加到List集合中!
  91. mapList.add(documentFields.getSourceAsMap());
  92. }
  93. return mapList;
  94. }
  95. /**
  96. * @Author xgh
  97. * @Description 高亮查询
  98. * @Date 2023/7/23 17:13
  99. * @Return
  100. **/
  101. @Override
  102. public List<Map<String, Object>> searchDataHighLight(String keyWord, int pageNo, int pageSize) throws IOException {
  103. //分页的判断
  104. if (pageNo <= 1) {
  105. pageNo = 1;
  106. }
  107. //创建查询的请求
  108. SearchRequest searchRequest = new SearchRequest("jd_goods");
  109. //构建查询条件
  110. SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
  111. //通过工具来构建构建查询条件
  112. TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyWord);
  113. //调用query方法实现精准查询
  114. searchSourceBuilder.query(termQueryBuilder);
  115. //配置高亮!!!!!!
  116. HighlightBuilder highlightBuilder = new HighlightBuilder();
  117. //当存在多个相同的字段的时候,关闭多个高亮,只高亮一个地方即可
  118. highlightBuilder.requireFieldMatch(false);
  119. //设置高亮的字段
  120. highlightBuilder.field("title");
  121. //设置高亮的颜色
  122. highlightBuilder.preTags("<span style='color=red'>");
  123. highlightBuilder.postTags("</span>");
  124. searchSourceBuilder.highlighter(highlightBuilder);
  125. //分页
  126. searchSourceBuilder.from(pageNo);
  127. searchSourceBuilder.size(pageSize);
  128. //设置查询的超时时间
  129. searchSourceBuilder.timeout(new TimeValue(66, TimeUnit.SECONDS));
  130. //封装搜索
  131. searchRequest.source(searchSourceBuilder);
  132. //调用客户端来查询
  133. SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
  134. //解析查询的结果
  135. List<Map<String, Object>> mapList = new ArrayList<>();
  136. for (SearchHit documentFields : searchResponse.getHits().getHits()) {
  137. //解析高亮的字段
  138. Map<String, HighlightField> highlightFields = documentFields.getHighlightFields();
  139. HighlightField title = highlightFields.get("title");
  140. Map<String, Object> sourceAsMap = documentFields.getSourceAsMap(); //这是原来的查询结果
  141. //将高亮的字段把原来结果的字段替换掉即可!!!!!!
  142. if (title != null){
  143. Text[] fragments = title.fragments();
  144. String new_title = "";
  145. for (Text text : fragments) {
  146. new_title += text;
  147. }
  148. //将高亮的字段替换掉原来的字段
  149. sourceAsMap.put("title",new_title);
  150. }
  151. mapList.add(sourceAsMap);
  152. }
  153. return mapList;
  154. }
  155. }

编写API层

  1. import com.guohui.service.ContentService;
  2. import org.springframework.beans.factory.annotation.Autowired;
  3. import org.springframework.web.bind.annotation.GetMapping;
  4. import org.springframework.web.bind.annotation.PathVariable;
  5. import org.springframework.web.bind.annotation.RestController;
  6. import java.io.IOException;
  7. import java.util.List;
  8. import java.util.Map;
  9. @RestController
  10. public class ContentApi {
  11. @Autowired
  12. private ContentService contentService;
  13. /**
  14. * @Author xgh
  15. * @Description 解析页面数据,存储到ES中
  16. * @Date 2023/7/23 16:58
  17. * @Return
  18. **/
  19. @GetMapping("/parse/{keyWord}")
  20. public Boolean parse(@PathVariable("keyWord") String keyWord) throws Exception {
  21. return contentService.parseContent(keyWord);
  22. }
  23. /**
  24. * @Author xgh
  25. * @Description 查询ES数据
  26. * @Date 2023/7/23 16:58
  27. * @Return
  28. **/
  29. @GetMapping("/search/{keyWord}/{pageNo}/{pageSize}")
  30. public List<Map<String,Object>> searchData(@PathVariable("keyWord") String keyWord,
  31. @PathVariable("pageNo") int pageNo,
  32. @PathVariable("pageSize") int pageSize) throws IOException {
  33. return contentService.searchData(keyWord,pageNo,pageSize);
  34. }
  35. /**
  36. * @Author xgh
  37. * @Description 高亮查询ES数据
  38. * @Date 2023/7/23 16:58
  39. * @Return
  40. **/
  41. @GetMapping("/searchHigh/{keyWord}/{pageNo}/{pageSize}")
  42. public List<Map<String,Object>> searchDataHighLight(@PathVariable("keyWord") String keyWord,
  43. @PathVariable("pageNo") int pageNo,
  44. @PathVariable("pageSize") int pageSize) throws IOException {
  45. return contentService.searchDataHighLight(keyWord,pageNo,pageSize);
  46. }
  47. }

调用API响应结果

1、调用爬取数据并存储到ES的方法

在 ES 的 head 可视化工具中观察 jd_goods 索引下是否已存入爬取到的商品数据

2、直接查询数据

3、调用高亮查询

最后,前端需要使用能够解析 HTML 的方式(例如 Vue 的 v-html 指令)来渲染高亮字段返回的 HTML 片段即可!

至此,关于ES中爬取页面数据实现文章检索功能已经介绍完毕,详细的使用希望大家能够认真阅读各个层的代码,注释也很详细的哈!

后续还会持续更新,希望大家可以继续关注~

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小蓝xlanll/article/detail/543495
推荐阅读
相关标签
  

闽ICP备14008679号