当前位置:   article > 正文

Elasticsearch-高级搜索(拼音|首字母|简繁|二级搜索)_java整合elasticsearch 7.17.5 实现中文+拼音+简繁搜索

java整合elasticsearch 7.17.5 实现中文+拼音+简繁搜索

需求:

  1. 中文搜索、英文搜索、中英混搜
  2. 全拼搜索、首字母搜索、中文+全拼、中文+首字母混搜
  3. 简繁搜索
  4. 二级搜索(对第一次搜索结果,再进行搜索)

一、ES相关插件

IK分词

GitHub - medcl/elasticsearch-analysis-ik: The IK Analysis plugin integrates Lucene IK analyzer into elasticsearch, support customized dictionary. https://github.com/medcl/elasticsearch-analysis-ik

拼音:

https://github.com/medcl/elasticsearch-analysis-pinyin

简繁体:

https://github.com/medcl/elasticsearch-analysis-stconvert


二、什么是 analysis

        analysis(分析)是 Elasticsearch 在把文档写入倒排索引(inverted index)之前,对文档正文执行的处理过程。在将文档添加到索引之前,Elasticsearch 会对每个需要分析的字段依次执行以下步骤:

  •     Character filtering (字符过滤器): 使用字符过滤器转换字符
  •     Breaking text into tokens (把文字转化为标记): 将文本分成一组一个或多个标记
  •     Token filtering:使用标记过滤器转换每个标记
  •     Token indexing:把这些标记存于索引中

详细介绍:Elasticsearch: analyzer_Elastic 中国社区官方博客的博客-CSDN博客_elasticsearch analyzer —— 如果大家之前看过我写的文章“开始使用Elasticsearch (3)”,在文章的最后部分有关于analyzer的介绍。在今天的文章中,我们来进一步了解analyzer。analyzer执行将输入字符流分解为token的过程,它一般发生在两个场合:(1)在indexing的时候,也即在建立索引的时候;(2)在searching的时候,也即在搜索时,分析需要搜索的词语。什么是analysis... https://blog.csdn.net/UbuntuTouch/article/details/100392478

三、索引模板

  1. PUT /_template/test_template
  2. {
  3. "index_patterns": [
  4. "test-*"
  5. ],
  6. "aliases": {
  7. "test_read": {}
  8. },
  9. "settings": {
  10. "index": {
  11. "max_result_window": "100000",
  12. "refresh_interval": "5s",
  13. "number_of_shards": "5",
  14. "translog": {
  15. "flush_threshold_size": "1024mb",
  16. "sync_interval": "30s",
  17. "durability": "async"
  18. },
  19. "number_of_replicas": "1"
  20. },
  21. "analysis": {
  22. "char_filter": {
  23. "tsconvert": {
  24. "type": "stconvert",
  25. "convert_type": "t2s"
  26. }
  27. },
  28. "analyzer": {
  29. "ik_t2s_pinyin_analyzer": {
  30. "type": "custom",
  31. "char_filter": [
  32. "tsconvert"
  33. ],
  34. "tokenizer": "ik_max_word",
  35. "filter": [
  36. "pinyin_filter",
  37. "lowercase"
  38. ]
  39. },
  40. "stand_t2s_pinyin_analyzer": {
  41. "type": "custom",
  42. "char_filter": [
  43. "tsconvert"
  44. ],
  45. "tokenizer": "standard",
  46. "filter": [
  47. "pinyin_filter",
  48. "lowercase"
  49. ]
  50. },
  51. "ik_t2s_analyzer": {
  52. "type": "custom",
  53. "char_filter": [
  54. "tsconvert"
  55. ],
  56. "tokenizer": "ik_max_word",
  57. "filter": [
  58. "lowercase"
  59. ]
  60. },
  61. "stand_t2s_analyzer": {
  62. "type": "custom",
  63. "char_filter": [
  64. "tsconvert"
  65. ],
  66. "tokenizer": "standard",
  67. "filter": [
  68. "lowercase"
  69. ]
  70. },
  71. "ik_pinyin_analyzer": {
  72. "type": "custom",
  73. "tokenizer": "ik_max_word",
  74. "filter": [
  75. "pinyin_filter",
  76. "lowercase"
  77. ]
  78. },
  79. "stand_pinyin_analyzer": {
  80. "type": "custom",
  81. "tokenizer": "standard",
  82. "filter": [
  83. "pinyin_filter",
  84. "lowercase"
  85. ]
  86. },
  87. "keyword_t2s_pinyin_analyzer": {
  88. "filter": [
  89. "pinyin_filter",
  90. "lowercase"
  91. ],
  92. "char_filter": [
  93. "tsconvert"
  94. ],
  95. "type": "custom",
  96. "tokenizer": "keyword"
  97. },
  98. "keyword_pinyin_analyzer": {
  99. "filter": [
  100. "pinyin_filter",
  101. "lowercase"
  102. ],
  103. "type": "custom",
  104. "tokenizer": "keyword"
  105. }
  106. },
  107. "filter": {
  108. "pinyin_filter": {
  109. "type": "pinyin",
  110. "keep_first_letter": true,
  111. "keep_separate_first_letter": false,
  112. "keep_full_pinyin": false,
  113. "keep_joined_full_pinyin": true,
  114. "keep_none_chinese": true,
  115. "none_chinese_pinyin_tokenize": false,
  116. "keep_none_chinese_in_joined_full_pinyin": true,
  117. "keep_original": false,
  118. "limit_first_letter_length": 1000,
  119. "lowercase": true,
  120. "trim_whitespace": true,
  121. "remove_duplicated_term": true
  122. }
  123. }
  124. }
  125. },
  126. "mappings": {
  127. "properties": {
  128. "name": {
  129. "index_phrases": true,
  130. "analyzer": "ik_max_word",
  131. "index": true,
  132. "type": "text",
  133. "fields": {
  134. "keyword": {
  135. "ignore_above": 256,
  136. "type": "keyword"
  137. },
  138. "stand": {
  139. "analyzer": "standard",
  140. "type": "text"
  141. },
  142. "STPA": {
  143. "type": "text",
  144. "analyzer": "stand_t2s_pinyin_analyzer"
  145. },
  146. "ITPA": {
  147. "type": "text",
  148. "analyzer": "ik_t2s_pinyin_analyzer"
  149. }
  150. }
  151. },
  152. "desc": {
  153. "index_phrases": true,
  154. "analyzer": "ik_max_word",
  155. "index": true,
  156. "type": "text",
  157. "fields": {
  158. "keyword": {
  159. "ignore_above": 256,
  160. "type": "keyword"
  161. },
  162. "stand": {
  163. "analyzer": "standard",
  164. "type": "text"
  165. },
  166. "STPA": {
  167. "type": "text",
  168. "analyzer": "stand_t2s_pinyin_analyzer"
  169. },
  170. "ITPA": {
  171. "type": "text",
  172. "analyzer": "ik_t2s_pinyin_analyzer"
  173. }
  174. }
  175. },
  176. "abstr": {
  177. "index_phrases": true,
  178. "analyzer": "ik_max_word",
  179. "index": true,
  180. "type": "text",
  181. "fields": {
  182. "keyword": {
  183. "ignore_above": 256,
  184. "type": "keyword"
  185. },
  186. "stand": {
  187. "analyzer": "standard",
  188. "type": "text"
  189. },
  190. "STPA": {
  191. "type": "text",
  192. "analyzer": "stand_t2s_pinyin_analyzer"
  193. },
  194. "ITPA": {
  195. "type": "text",
  196. "analyzer": "ik_t2s_pinyin_analyzer"
  197. }
  198. }
  199. }
  200. }
  201. }
  202. }

四、DSL语句

  1. GET /test_read/_search
  2. {
  3. "from": 0,
  4. "size": 10,
  5. "terminate_after": 100000,
  6. "query": {
  7. "bool": {
  8. "must": [
  9. {
  10. "query_string": {
  11. "query": "bj天安门 OR 测试",
  12. "fields": [
  13. "name.ITPA"
  14. ],
  15. "type": "phrase",
  16. "default_operator": "and"
  17. }
  18. }
  19. ],
  20. "adjust_pure_negative": true,
  21. "boost": 1
  22. }
  23. },
  24. "post_filter": {
  25. "bool": {
  26. "must": [
  27. {
  28. "match": {
  29. "name": "天安门"
  30. }
  31. }
  32. ]
  33. }
  34. },
  35. "highlight": {
  36. "fragment_size": 1000,
  37. "pre_tags": [
  38. "<span style=\"color:red;background:yellow;\">"
  39. ],
  40. "post_tags": [
  41. "</span>"
  42. ],
  43. "fields": {
  44. "name.stand": {},
  45. "desc.stand": {},
  46. "abstr.stand": {},
  47. "name.STPA": {},
  48. "desc.STPA": {},
  49. "abstr.STPA": {},
  50. "name.ITPA": {},
  51. "desc.ITPA": {},
  52. "abstr.ITPA": {}
  53. }
  54. }
  55. }

post_filter:后过滤器 | Elasticsearch: 权威指南 | Elastic

PS:这里用 post_filter 实现二级搜索功能;但 ES 的高亮默认只针对 query 部分,post_filter 中的条件不会被高亮,二级搜索的关键词需要自己通过代码手动标记高亮。根据上面的DSL语句,即可写出对应的代码啦~

拼音插件配置:

  • keep_first_letter:这个参数会将词的第一个字母全部拼起来.例如:刘德华->ldh.默认为:true
  • keep_separate_first_letter:这个会将第一个字母一个个分开.例如:刘德华->l,d,h.默认为:false.如果开启,可能导致查询结果太过于模糊,准确率太低.
  • limit_first_letter_length:设置keep_first_letter结果的最大长度,默认为:16
  • keep_full_pinyin:如果打开,它将保存词的全拼,并按字分开保存.例如:刘德华 -> [liu,de,hua],默认为:true
  • keep_joined_full_pinyin:如果打开,将保存连在一起的全拼.例如:刘德华 -> [liudehua],默认为:false
  • keep_none_chinese:将非中文字母或数字保留在结果中.默认为:true
  • keep_none_chinese_together:保证非中文在一起.默认为: true, 例如: DJ音乐家 -> DJ,yin,yue,jia, 如果设置为:false, 例如: DJ音乐家 -> D,J,yin,yue,jia, 注意: keep_none_chinese应该先开启.
  • keep_none_chinese_in_first_letter:将非中文字母保留在首字母中.例如: 刘德华AT2016->ldhat2016, 默认为:true
  • keep_none_chinese_in_joined_full_pinyin:将非中文字母保留为完整拼音. 例如: 刘德华2016->liudehua2016, 默认为: false
  • none_chinese_pinyin_tokenize:如果非中文字母本身是拼音,则将其切分成单独的拼音项. 默认为:true,例如: liudehuaalibaba13zhuanghan -> liu,de,hua,a,li,ba,ba,13,zhuang,han, 注意: keep_none_chinese和keep_none_chinese_together需要先开启.
  • keep_original:是否保持原词.默认为:false
  • lowercase:小写非中文字母.默认为:true
  • trim_whitespace:去掉空格.默认为:true
  • remove_duplicated_term:保存索引时删除重复的词语.例如: de的 -> de, 默认为: false, 注意:开启可能会影响位置相关的查询.
  • ignore_pinyin_offset:6.0之后偏移量(offset)受到严格限制,不允许出现重叠的词元(token);开启此参数后会忽略偏移量,从而允许重叠的词元.请注意,此时所有与位置相关的查询或高亮结果都会变得不准确,应使用多字段(multi fields),并针对不同的查询目的为各字段指定不同的设置.如果需要偏移量,请将其设置为false.默认值:true
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/代码探险家/article/detail/835944
推荐阅读
相关标签
  

闽ICP备14008679号