当前位置:   article > 正文

Java调用Elasticsearch API实现全文检索,搭配MinIO文件存储_java如何使用es ingest-attachment插件去读文件内容

java如何使用es ingest-attachment插件去读文件内容

应用背景:
对存储在MinIO服务器的文件实现全文检索。也可以是其他服务器或本地文件,本文仅详细介绍MinIO文件的读取及转换。通过Elasticsearch的Ingest-Attachment插件抽取文件内容,支持Word、Excel、PDF、TXT等格式文件,无需手动解析文件内容。

下面直接上代码,详细解释请阅读代码中的注释。

1.引入依赖

Spring Boot 已经管理好了依赖版本,只需引入 spring-boot-starter-data-elasticsearch 即可。

  1. <dependency>
  2. <groupId>org.springframework.boot</groupId>
  3. <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
  4. </dependency>

 

2.配置文件

  1. elasticsearch:
  2.   host: 192.168.2.154
  3.   port: 9200

3.配置类

  1. @Setter
  2. @ConfigurationProperties(prefix = "elasticsearch")
  3. @Configuration
  4. public class ElasticSearchConfig {
  5. private String host;
  6. private Integer port;
  7. @Bean
  8. public RestHighLevelClient restHighLevelClient(){
  9. RestHighLevelClient client = new RestHighLevelClient(RestClient.builder(new HttpHost(this.host, this.port)));
  10. return client;
  11. }
  12. }

4.实现类

  1. package com.dmp.document.service.impl;
  2. import com.alibaba.fastjson2.JSONObject;
  3. import com.dmp.common.constant.HttpStatus;
  4. import com.dmp.common.core.page.PageDomain;
  5. import com.dmp.common.core.page.TableDataInfo;
  6. import com.dmp.common.core.page.TableSupport;
  7. import com.dmp.document.domain.dto.DocElasticsearchDto;
  8. import com.dmp.document.domain.entity.DocDocument;
  9. import com.dmp.document.service.DocDocumentService;
  10. import com.dmp.document.service.ElasticsearchService;
  11. import com.dmp.document.service.MinioClientService;
  12. import com.dmp.system.service.ISysConfigService;
  13. import lombok.extern.slf4j.Slf4j;
  14. import org.apache.commons.compress.utils.IOUtils;
  15. import org.elasticsearch.action.delete.DeleteRequest;
  16. import org.elasticsearch.action.index.IndexRequest;
  17. import org.elasticsearch.action.index.IndexResponse;
  18. import org.elasticsearch.action.search.SearchRequest;
  19. import org.elasticsearch.action.search.SearchResponse;
  20. import org.elasticsearch.client.RequestOptions;
  21. import org.elasticsearch.client.RestHighLevelClient;
  22. import org.elasticsearch.common.xcontent.XContentType;
  23. import org.elasticsearch.index.query.BoolQueryBuilder;
  24. import org.elasticsearch.index.query.MultiMatchQueryBuilder;
  25. import org.elasticsearch.index.query.QueryBuilders;
  26. import org.elasticsearch.search.SearchHit;
  27. import org.elasticsearch.search.builder.SearchSourceBuilder;
  28. import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
  29. import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
  30. import org.elasticsearch.common.text.Text;
  31. import org.springframework.beans.factory.annotation.Autowired;
  32. import org.springframework.stereotype.Service;
  33. import java.io.*;
  34. import java.net.HttpURLConnection;
  35. import java.net.URL;
  36. import java.util.*;
  37. import static com.sun.webkit.network.URLs.newURL;
  38. import static org.elasticsearch.client.RequestOptions.DEFAULT;
  39. /**
  40. * @author daixin
  41. * @version 1.0
  42. * @description: TODO
  43. * @date 2022/11/23 17:38
  44. */
  45. @Slf4j
  46. @Service
  47. public class ElasticsearchServiceImpl implements ElasticsearchService {
  48. @Autowired
  49. private RestHighLevelClient esClient;
  50. @Autowired
  51. private MinioClientService minioClientService;
  52. @Autowired
  53. private DocDocumentService docDocumentService;
  54. @Autowired
  55. private ISysConfigService sysConfigService;
  56. public String createFileIndex(String id, String projectId) throws Exception {
  57. String result = null;
  58. InputStream is = null;
  59. try{
  60. Date date1 = new Date();
  61. //查询系统内存储的文件key
  62. DocDocument docDocument = docDocumentService.getById(id);
  63. String path = docDocument.getPath();
  64. //获取minio下载签名
  65. String url = minioClientService.getDownloadLink("file-bucket",path);
  66. //请求minio获取文件流
  67. URL url2= newURL(url);
  68. HttpURLConnection conn=(HttpURLConnection) url2.openConnection();
  69. conn.setDoInput(true);
  70. conn.connect();
  71. is = conn.getInputStream();
  72. //转码base64
  73. byte[] fileByteStream = IOUtils.toByteArray(is);
  74. String base64String = new String(Base64.getEncoder().encodeToString(fileByteStream).getBytes(), "UTF-8");
  75. //封装ES请求
  76. IndexRequest request;
  77. Map attachmentMap = new HashMap();
  78. attachmentMap.put("data", base64String);
  79. attachmentMap.put("fileName", docDocument.getName());
  80. attachmentMap.put("projectId",projectId);
  81. //查询系统参数
  82. String esIndex = sysConfigService.selectConfigByKey("es_index");
  83. String esPipe = sysConfigService.selectConfigByKey("es_pipe");
  84. //配置查询请求参数
  85. request = new IndexRequest(esIndex);
  86. request.id(String.valueOf(docDocument.getId()));
  87. request.setPipeline(esPipe);//文件抽取管道,需提前创建
  88. request.source(JSONObject.toJSONString(attachmentMap), XContentType.JSON);
  89. IndexResponse response = esClient.index(request, RequestOptions.DEFAULT);
  90. response.status().toString();
  91. Date date2 = new Date();
  92. log.info("创建索引-----耗时:{}ms" , (date2.getTime() - date1.getTime()));
  93. }catch(Exception e){
  94. throw e;
  95. }finally {
  96. is.close();
  97. }
  98. return result;
  99. }
  100. @Override
  101. public TableDataInfo matchContent(String content, String projectId) {
  102. //此处为若依框架提供的分页,可改为你自己的分页
  103. PageDomain pageDomain = TableSupport.buildPageRequest();
  104. Integer pageNum = pageDomain.getPageNum();
  105. Integer pageSize = pageDomain.getPageSize();
  106. TableDataInfo rspData = new TableDataInfo();
  107. rspData.setCode(HttpStatus.SUCCESS);
  108. rspData.setMsg("查询成功");
  109. //查询系统参数
  110. String esIndex = sysConfigService.selectConfigByKey("es_index");
  111. SearchRequest searchRequest = new SearchRequest(esIndex);
  112. //布尔查询,检索标题和内容,过滤项目id
  113. SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
  114. MultiMatchQueryBuilder matchQueryBuilder = QueryBuilders.multiMatchQuery(content, "attachment.content","fileName");
  115. BoolQueryBuilder boolQueryBuilder = QueryBuilders.boolQuery();
  116. boolQueryBuilder.must(matchQueryBuilder);
  117. if(projectId != null){
  118. boolQueryBuilder.filter(QueryBuilders.termQuery("projectId", projectId));
  119. }
  120. sourceBuilder.query(boolQueryBuilder);
  121. //配置高亮
  122. HighlightBuilder highlightBuilder = new HighlightBuilder();
  123. highlightBuilder.field("attachment.content"); //content字段高亮
  124. highlightBuilder.field("fileName");//fileName字段高亮
  125. highlightBuilder.preTags("<span style='color:red'>"); //高亮前缀
  126. highlightBuilder.postTags("</span>"); //高亮后缀
  127. sourceBuilder.highlighter(highlightBuilder);
  128. //分页查询
  129. sourceBuilder.from((pageNum-1)*pageSize).size(pageSize);
  130. searchRequest.source(sourceBuilder);
  131. SearchResponse searchResponse = null;
  132. try {
  133. searchResponse = esClient.search(searchRequest, DEFAULT);
  134. } catch (Throwable e) {
  135. //捕捉最高级别异常,确保打印详细信息
  136. e.printStackTrace();
  137. }
  138. if(searchResponse.getHits() == null){
  139. rspData.setTotal(0);
  140. rspData.setRows(null);
  141. return rspData;
  142. }
  143. List<DocElasticsearchDto> docElasticsearchList = new ArrayList<>();
  144. Long totalHits = searchResponse.getHits().getTotalHits().value;//匹配总条数,用于分页显示
  145. for (SearchHit hit : searchResponse.getHits()){
  146. //查询结果
  147. String source = hit.getSourceAsString();
  148. DocElasticsearchDto docElasticsearchDto = JSONObject.parseObject(source, DocElasticsearchDto.class);
  149. docElasticsearchDto.setId(hit.getId());
  150. //处理高亮字段
  151. Map<String, HighlightField> map = hit.getHighlightFields();
  152. if(map.containsKey("attachment.content")) {
  153. StringBuilder matchContent = new StringBuilder();
  154. for(Text t : map.get("attachment.content").fragments()){
  155. matchContent.append(t.toString());
  156. }
  157. docElasticsearchDto.getAttachment().put("content",matchContent.toString());
  158. }
  159. if(map.containsKey("fileName")) {
  160. StringBuilder matchFileName = new StringBuilder();
  161. for(Text t : map.get("fileName").fragments()){
  162. matchFileName.append(t.toString());
  163. }
  164. docElasticsearchDto.setFileName(matchFileName.toString());
  165. }
  166. docElasticsearchList.add(docElasticsearchDto);
  167. }
  168. rspData.setTotal(totalHits);
  169. rspData.setRows(docElasticsearchList);
  170. return rspData;
  171. }
  172. @Override
  173. public void deleteFileIndex(String id) throws IOException {
  174. //查询系统参数,ES索引名称
  175. String esIndex = sysConfigService.selectConfigByKey("es_index");
  176. //删除索引
  177. DeleteRequest deleteRequest = new DeleteRequest(esIndex,id);
  178. esClient.delete(deleteRequest, RequestOptions.DEFAULT);
  179. }
  180. }

示例的实现是先从数据库查询到保存的文件信息,然后从MinIO文件存储服务器获取文件流。由于MinIO支持通过带签名的下载链接获取文件流,这里就直接使用了;你也可以换成其他服务器,或者直接读取本地文件对象。创建索引时,把文件内容进行Base64编码后放入请求的data字段,并指定提前创建好的抽取管道,Ingest-Attachment插件会自动完成文件内容的解析和转换。Ingest-Attachment的安装可参考Docker安装Elasticsearch及相关插件详细步骤,全程亲测避坑_冰糖码奇朵的博客-CSDN博客

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小桥流水78/article/detail/862329
推荐阅读
相关标签
  

闽ICP备14008679号