赞
踩
话不多说,兄弟们坐好。马上开冲!
1、项目导入jsoup依赖
<!-- Jsoup: fetches and parses HTML pages (used by HtmlParseUtil below) -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.15.4</version>
</dependency>
2、编写工具类解析网页
package com.guohui.util;

import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;

import com.guohui.po.Content;
-
- @Component
- public class HtmlParseUtil {
- public static void main(String[] args) throws Exception {
- new HtmlParseUtil().parseJD("java").forEach(System.out::println);
- }
-
- //抽成方法
- public List<Content> parseJD(String keywords) throws Exception {
- //获取请求:这里注意ajax异步请求是获取不到的
- String url = "https://search.jd.com/Search?keyword=" + keywords;
-
- //解析网页(Jsoup返回Document就是浏览器Document对象)
- Document document = Jsoup.parse(new URL(url), 30000);
-
- //所有你在js中可以使用的方法,这里都能用
- Element element = document.getElementById("J_goodsList");
- //获取所有的li元素
- Elements elements = element.getElementsByTag("li");
- ArrayList<Content> goodsList = new ArrayList<>();
- //获取元素中的内容,这里el 就是每一个li标签了
- for (Element el : elements) {
- //关于这种图片特别多的网站,所有的图片都是延迟加载的
- String img = el.getElementsByTag("img").eq(0).attr("data-lazy-img");
- String price = el.getElementsByClass("p-price").eq(0).text();
- String title = el.getElementsByClass("p-name").eq(0).text();
- Content content = new Content();
- content.setTitle(title);
- content.setImg(img);
- content.setPrice(price);
- goodsList.add(content);
- }
- return goodsList;
- }
- }
业务层代码希望大家能够认真阅读!
- import com.alibaba.fastjson2.JSON;
- import com.guohui.po.Content;
- import com.guohui.service.ContentService;
- import com.guohui.util.HtmlParseUtil;
- import lombok.extern.slf4j.Slf4j;
- import org.elasticsearch.action.bulk.BulkRequest;
- import org.elasticsearch.action.bulk.BulkResponse;
- import org.elasticsearch.action.index.IndexRequest;
- import org.elasticsearch.action.search.SearchRequest;
- import org.elasticsearch.action.search.SearchResponse;
- import org.elasticsearch.client.RequestOptions;
- import org.elasticsearch.client.RestHighLevelClient;
- import org.elasticsearch.common.text.Text;
- import org.elasticsearch.common.unit.TimeValue;
- import org.elasticsearch.common.xcontent.XContentType;
- import org.elasticsearch.index.query.QueryBuilders;
- import org.elasticsearch.index.query.TermQueryBuilder;
- import org.elasticsearch.search.SearchHit;
- import org.elasticsearch.search.builder.SearchSourceBuilder;
- import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
- import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
- import org.springframework.beans.factory.annotation.Autowired;
- import org.springframework.stereotype.Service;
-
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.Map;
- import java.util.concurrent.TimeUnit;
-
- @Service
- @Slf4j
- public class ContentServiceImpl implements ContentService {
-
- @Autowired
- private HtmlParseUtil htmlParseUtil;
-
- @Autowired
- private RestHighLevelClient restHighLevelClient;
-
- /**
- * @Author xgh
- * @Description 解析数据放入ES
- * @Date 2023/7/23 16:09
- * @Return
- **/
- @Override
- public boolean parseContent(String keyWorld) throws Exception {
- //调用封装的解析页面的工具类,获得页面上的li集合
- List<Content> contents = htmlParseUtil.parseJD(keyWorld);
- log.info("解析的数据为:{}", contents.toString());
-
- //批量添加文档
- BulkRequest bulkRequest = new BulkRequest();
- for (int i = 0; i < contents.size(); i++) {
- log.info("数据为:{}", contents.get(i));
- bulkRequest.add(new IndexRequest("jd_goods")
- .source(JSON.toJSONString(contents.get(i)), XContentType.JSON));
- }
-
- //批量执行请求
- BulkResponse responses = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
- return !responses.hasFailures();
- }
-
- /**
- * @Author xgh
- * @Description 获取数据后搜索数据
- * @Date 2023/7/23 16:39
- * @Return
- **/
- @Override
- public List<Map<String, Object>> searchData(String keyWord, int pageNo, int pageSize) throws IOException {
- //分页的判断
- if (pageNo <= 1) {
- pageNo = 1;
- }
-
- //创建查询的请求
- SearchRequest searchRequest = new SearchRequest("jd_goods");
-
- //构建查询条件
- SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
-
- //通过工具来构建构建查询条件
- TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyWord);
- //调用query方法实现精准查询
- searchSourceBuilder.query(termQueryBuilder);
-
- //分页
- searchSourceBuilder.from(pageNo);
- searchSourceBuilder.size(pageSize);
-
- //设置查询的超时时间
- searchSourceBuilder.timeout(new TimeValue(66, TimeUnit.SECONDS));
-
- //封装搜索
- searchRequest.source(searchSourceBuilder);
-
- //调用客户端来查询
- SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
-
- //解析查询的结果
- List<Map<String, Object>> mapList = new ArrayList<>();
- for (SearchHit documentFields : searchResponse.getHits().getHits()) {
- //调用getSourceAsMap方法将查询出的结果转换成map,然后将map添加到List集合中!
- mapList.add(documentFields.getSourceAsMap());
- }
- return mapList;
- }
-
- /**
- * @Author xgh
- * @Description 高亮查询
- * @Date 2023/7/23 17:13
- * @Return
- **/
- @Override
- public List<Map<String, Object>> searchDataHighLight(String keyWord, int pageNo, int pageSize) throws IOException {
- //分页的判断
- if (pageNo <= 1) {
- pageNo = 1;
- }
-
- //创建查询的请求
- SearchRequest searchRequest = new SearchRequest("jd_goods");
-
- //构建查询条件
- SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
-
- //通过工具来构建构建查询条件
- TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyWord);
- //调用query方法实现精准查询
- searchSourceBuilder.query(termQueryBuilder);
-
- //配置高亮!!!!!!
- HighlightBuilder highlightBuilder = new HighlightBuilder();
- //当存在多个相同的字段的时候,关闭多个高亮,只高亮一个地方即可
- highlightBuilder.requireFieldMatch(false);
-
- //设置高亮的字段
- highlightBuilder.field("title");
- //设置高亮的颜色
- highlightBuilder.preTags("<span style='color=red'>");
- highlightBuilder.postTags("</span>");
- searchSourceBuilder.highlighter(highlightBuilder);
-
- //分页
- searchSourceBuilder.from(pageNo);
- searchSourceBuilder.size(pageSize);
-
- //设置查询的超时时间
- searchSourceBuilder.timeout(new TimeValue(66, TimeUnit.SECONDS));
-
- //封装搜索
- searchRequest.source(searchSourceBuilder);
-
- //调用客户端来查询
- SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
-
- //解析查询的结果
- List<Map<String, Object>> mapList = new ArrayList<>();
- for (SearchHit documentFields : searchResponse.getHits().getHits()) {
- //解析高亮的字段
- Map<String, HighlightField> highlightFields = documentFields.getHighlightFields();
- HighlightField title = highlightFields.get("title");
- Map<String, Object> sourceAsMap = documentFields.getSourceAsMap(); //这是原来的查询结果
- //将高亮的字段把原来结果的字段替换掉即可!!!!!!
- if (title != null){
- Text[] fragments = title.fragments();
- String new_title = "";
- for (Text text : fragments) {
- new_title += text;
- }
- //将高亮的字段替换掉原来的字段
- sourceAsMap.put("title",new_title);
- }
- mapList.add(sourceAsMap);
- }
- return mapList;
- }
-
-
- }
- import com.guohui.service.ContentService;
- import org.springframework.beans.factory.annotation.Autowired;
- import org.springframework.web.bind.annotation.GetMapping;
- import org.springframework.web.bind.annotation.PathVariable;
- import org.springframework.web.bind.annotation.RestController;
-
- import java.io.IOException;
- import java.util.List;
- import java.util.Map;
-
- @RestController
- public class ContentApi {
- @Autowired
- private ContentService contentService;
-
- /**
- * @Author xgh
- * @Description 解析页面数据,存储到ES中
- * @Date 2023/7/23 16:58
- * @Return
- **/
- @GetMapping("/parse/{keyWord}")
- public Boolean parse(@PathVariable("keyWord") String keyWord) throws Exception {
-
- return contentService.parseContent(keyWord);
- }
-
- /**
- * @Author xgh
- * @Description 查询ES数据
- * @Date 2023/7/23 16:58
- * @Return
- **/
- @GetMapping("/search/{keyWord}/{pageNo}/{pageSize}")
- public List<Map<String,Object>> searchData(@PathVariable("keyWord") String keyWord,
- @PathVariable("pageNo") int pageNo,
- @PathVariable("pageSize") int pageSize) throws IOException {
- return contentService.searchData(keyWord,pageNo,pageSize);
- }
-
- /**
- * @Author xgh
- * @Description 高亮查询ES数据
- * @Date 2023/7/23 16:58
- * @Return
- **/
- @GetMapping("/searchHigh/{keyWord}/{pageNo}/{pageSize}")
- public List<Map<String,Object>> searchDataHighLight(@PathVariable("keyWord") String keyWord,
- @PathVariable("pageNo") int pageNo,
- @PathVariable("pageSize") int pageSize) throws IOException {
- return contentService.searchDataHighLight(keyWord,pageNo,pageSize);
- }
- }
-
1、调用爬取数据并存储到ES的方法
观察ES的head可视化工具中是否存入了跟ES相关的数据
2、直接查询数据
3、调用高亮查询
最后,前端需要用能够解析 HTML 的方式(例如 Vue 的 v-html 指令)来渲染返回的高亮字段,这样其中的 <span> 高亮标签才会真正生效!
至此,关于ES中爬取页面数据实现文章检索功能已经介绍完毕,详细的使用希望大家能够认真阅读各个层的代码,注释也很详细的哈!
后续还会持续更新,希望大家可以继续关注~
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。