当前位置:   article > 正文

ElasticSearch中全文搜索(单词搜索、多词搜索、组合搜索和权重搜索)

全文搜索

全文搜索两个最重要的方面是:

  • 相关性(Relevance) 它是评价查询与其结果间的相关程度,并根据这种相关程度对结果排名的一种能力,这种计算方式可以是 TF/IDF 方法、地理位置邻近、模糊相似,或其他的某些算法。
  • 分词(Analysis) 它是将文本块转换为有区别的、规范化的 token 的一个过程,目的是为了创建倒排索引以及查询倒排索引。

一、构造数据

1、数据库中当前数据

 2、构建索引

  1. PUT http://127.0.0.1:9200/study
  2. # 请求数据
  3. {
  4. "settings": {
  5. "index": {
  6. "number_of_shards": "1",
  7. "number_of_replicas": "0"
  8. }
  9. },
  10. "mappings": {
  11. "properties": {
  12. "name": {
  13. "type": "text"
  14. },
  15. "age": {
  16. "type": "integer"
  17. },
  18. "mail": {
  19. "type": "keyword"
  20. },
  21. "hobby": {
  22. "type": "text",
  23. "analyzer": "ik_max_word"
  24. }
  25. }
  26. }
  27. }
  28. # 响应数据
  29. {
  30. "acknowledged": true,
  31. "shards_acknowledged": true,
  32. "index": "study"
  33. }

3、添加数据

  1. POST http://127.0.0.1:9200/study/_bulk
  2. # 请求数据
  3. {"index":{"_index":"study"}}
  4. {"name":"张三","age": 20,"mail": "111@qq.com","hobby":"羽毛球、乒乓球、足球"}
  5. {"index":{"_index":"study"}}
  6. {"name":"李四","age": 21,"mail": "222@qq.com","hobby":"羽毛球、乒乓球、足球、篮球"}
  7. {"index":{"_index":"study"}}
  8. {"name":"王五","age": 22,"mail": "333@qq.com","hobby":"羽毛球、篮球、游泳、听音乐"}
  9. {"index":{"_index":"study"}}
  10. {"name":"赵六","age": 23,"mail": "444@qq.com","hobby":"跑步、游泳"}
  11. {"index":{"_index":"study"}}
  12. {"name":"孙七","age": 24,"mail": "555@qq.com","hobby":"听音乐、看电影"}
  13. # 响应数据
  14. {
  15. "took": 16,
  16. "errors": false,
  17. "items": [
  18. {
  19. "index": {
  20. "_index": "study",
  21. "_type": "_doc",
  22. "_id": "i6jJdoIBU4c5cKp3GGKx",
  23. "_version": 1,
  24. "result": "created",
  25. "_shards": {
  26. "total": 1,
  27. "successful": 1,
  28. "failed": 0
  29. },
  30. "_seq_no": 0,
  31. "_primary_term": 1,
  32. "status": 201
  33. }
  34. },
  35. {
  36. "index": {
  37. "_index": "study",
  38. "_type": "_doc",
  39. "_id": "jKjJdoIBU4c5cKp3GGKx",
  40. "_version": 1,
  41. "result": "created",
  42. "_shards": {
  43. "total": 1,
  44. "successful": 1,
  45. "failed": 0
  46. },
  47. "_seq_no": 1,
  48. "_primary_term": 1,
  49. "status": 201
  50. }
  51. },
  52. {
  53. "index": {
  54. "_index": "study",
  55. "_type": "_doc",
  56. "_id": "jajJdoIBU4c5cKp3GGKx",
  57. "_version": 1,
  58. "result": "created",
  59. "_shards": {
  60. "total": 1,
  61. "successful": 1,
  62. "failed": 0
  63. },
  64. "_seq_no": 2,
  65. "_primary_term": 1,
  66. "status": 201
  67. }
  68. },
  69. {
  70. "index": {
  71. "_index": "study",
  72. "_type": "_doc",
  73. "_id": "jqjJdoIBU4c5cKp3GGKx",
  74. "_version": 1,
  75. "result": "created",
  76. "_shards": {
  77. "total": 1,
  78. "successful": 1,
  79. "failed": 0
  80. },
  81. "_seq_no": 3,
  82. "_primary_term": 1,
  83. "status": 201
  84. }
  85. },
  86. {
  87. "index": {
  88. "_index": "study",
  89. "_type": "_doc",
  90. "_id": "j6jJdoIBU4c5cKp3GGKx",
  91. "_version": 1,
  92. "result": "created",
  93. "_shards": {
  94. "total": 1,
  95. "successful": 1,
  96. "failed": 0
  97. },
  98. "_seq_no": 4,
  99. "_primary_term": 1,
  100. "status": 201
  101. }
  102. }
  103. ]
  104. }

二、全文搜索

2.1、单词搜索

  1. POST http://127.0.0.1:9200/study/_search
  2. # 请求数据
  3. {
  4. "query": {
  5. "match": {
  6. "hobby": "音乐"
  7. }
  8. },
  9. "highlight": {
  10. "fields": {
  11. "hobby": {}
  12. }
  13. }
  14. }
  15. # 响应数据
  16. {
  17. "took": 67,
  18. "timed_out": false,
  19. "_shards": {
  20. "total": 1,
  21. "successful": 1,
  22. "skipped": 0,
  23. "failed": 0
  24. },
  25. "hits": {
  26. "total": {
  27. "value": 2,
  28. "relation": "eq"
  29. },
  30. "max_score": 0.9395274,
  31. "hits": [
  32. {
  33. "_index": "study",
  34. "_type": "_doc",
  35. "_id": "j6jJdoIBU4c5cKp3GGKx",
  36. "_score": 0.9395274,
  37. "_source": {
  38. "name": "孙七",
  39. "age": 24,
  40. "mail": "555@qq.com",
  41. "hobby": "听音乐、看电影"
  42. },
  43. "highlight": {
  44. "hobby": [
  45. "听<em>音乐</em>、看电影"
  46. ]
  47. }
  48. },
  49. {
  50. "_index": "study",
  51. "_type": "_doc",
  52. "_id": "jajJdoIBU4c5cKp3GGKx",
  53. "_score": 0.77041256,
  54. "_source": {
  55. "name": "王五",
  56. "age": 22,
  57. "mail": "333@qq.com",
  58. "hobby": "羽毛球、篮球、游泳、听音乐"
  59. },
  60. "highlight": {
  61. "hobby": [
  62. "羽毛球、篮球、游泳、听<em>音乐</em>"
  63. ]
  64. }
  65. }
  66. ]
  67. }
  68. }

过程说明:

1. 检查字段类型

        爱好 hobby 字段是一个 text 类型( 指定了IK分词器),这意味着查询字符串本身也应该被分词。

2. 分析查询字符串 。

        将查询的字符串 “音乐” 传入IK分词器中,输出的结果是单个项 音乐。因为只有一个单词项,所以 match 查询执行的是单个底层 term 查询。

3. 查找匹配文档 。

        用 term 查询在倒排索引中查找 “音乐” 然后获取一组包含该项的文档,本例的结果是文档:3 、5 。

4. 为每个文档评分 。

        用 term 查询计算每个文档相关度评分 _score ,这是种将 词频(term frequency,即词 “音乐” 在相关文档的 hobby 字段中出现的频率)和 反向文档频率(inverse document frequency,即词 “音乐” 在所有文档的 hobby 字段中出现的频率),以及字段的长度(即字段越短相关度越高)相结合的计算方式。

2.2、多词搜索

  1. POST http://127.0.0.1:9200/study/_search
  2. # 请求数据
  3. {
  4. "query": {
  5. "match": {
  6. "hobby": "音乐 篮球"
  7. }
  8. },
  9. "highlight": {
  10. "fields": {
  11. "hobby": {}
  12. }
  13. }
  14. }
  15. # 响应数据
  16. {
  17. "took": 22,
  18. "timed_out": false,
  19. "_shards": {
  20. "total": 1,
  21. "successful": 1,
  22. "skipped": 0,
  23. "failed": 0
  24. },
  25. "hits": {
  26. "total": {
  27. "value": 3,
  28. "relation": "eq"
  29. },
  30. "max_score": 1.5408251,
  31. "hits": [
  32. {
  33. "_index": "study",
  34. "_type": "_doc",
  35. "_id": "jajJdoIBU4c5cKp3GGKx",
  36. "_score": 1.5408251,
  37. "_source": {
  38. "name": "王五",
  39. "age": 22,
  40. "mail": "333@qq.com",
  41. "hobby": "羽毛球、篮球、游泳、听音乐"
  42. },
  43. "highlight": {
  44. "hobby": [
  45. "羽毛球、<em>篮球</em>、游泳、听<em>音乐</em>"
  46. ]
  47. }
  48. },
  49. {
  50. "_index": "study",
  51. "_type": "_doc",
  52. "_id": "j6jJdoIBU4c5cKp3GGKx",
  53. "_score": 0.9395274,
  54. "_source": {
  55. "name": "孙七",
  56. "age": 24,
  57. "mail": "555@qq.com",
  58. "hobby": "听音乐、看电影"
  59. },
  60. "highlight": {
  61. "hobby": [
  62. "听<em>音乐</em>、看电影"
  63. ]
  64. }
  65. },
  66. {
  67. "_index": "study",
  68. "_type": "_doc",
  69. "_id": "jKjJdoIBU4c5cKp3GGKx",
  70. "_score": 0.77041256,
  71. "_source": {
  72. "name": "李四",
  73. "age": 21,
  74. "mail": "222@qq.com",
  75. "hobby": "羽毛球、乒乓球、足球、篮球"
  76. },
  77. "highlight": {
  78. "hobby": [
  79. "羽毛球、乒乓球、足球、<em>篮球</em>"
  80. ]
  81. }
  82. }
  83. ]
  84. }
  85. }

        上面查询中只要是包含篮球或音乐其中之一的文档都被查询出来了(match 查询默认的逻辑关系是"or")。但是这有时候不能达到我们的要求,我们大部分时候都是希望两个词是同时包含的。这时候可以使用elasticsearch中指定词之间逻辑关系的参数 operator:"and"

  1. POST http://127.0.0.1:9200/study/_search
  2. # 请求数据
  3. {
  4. "query": {
  5. "match": {
  6. "hobby": {
  7. "query": "音乐 篮球",
  8. "operator": "and"
  9. }
  10. }
  11. },
  12. "highlight": {
  13. "fields": {
  14. "hobby": {}
  15. }
  16. }
  17. }
  18. # 响应结果
  19. {
  20. "took": 6,
  21. "timed_out": false,
  22. "_shards": {
  23. "total": 1,
  24. "successful": 1,
  25. "skipped": 0,
  26. "failed": 0
  27. },
  28. "hits": {
  29. "total": {
  30. "value": 1,
  31. "relation": "eq"
  32. },
  33. "max_score": 1.5408251,
  34. "hits": [
  35. {
  36. "_index": "study",
  37. "_type": "_doc",
  38. "_id": "jajJdoIBU4c5cKp3GGKx",
  39. "_score": 1.5408251,
  40. "_source": {
  41. "name": "王五",
  42. "age": 22,
  43. "mail": "333@qq.com",
  44. "hobby": "羽毛球、篮球、游泳、听音乐"
  45. },
  46. "highlight": {
  47. "hobby": [
  48. "羽毛球、<em>篮球</em>、游泳、听<em>音乐</em>"
  49. ]
  50. }
  51. }
  52. ]
  53. }
  54. }

        上面的测试分别选择了"or"和"and"两种极端情况,但是在真正的搜索中,我们往往不会使用这两种极端情况,这样就需要另外一种查询方式,即只需要符合一定的相似度就可以查询到数据。elasticsearch 支持这种查询方式,可以使用minimum_should_match来指定匹配度,如80%。

  1. POST http://127.0.0.1:9200/study/_search
  2. # 请求数据
  3. {
  4. "query": {
  5. "match": {
  6. "hobby": {
  7. "query": "游泳 羽毛球",
  8. "minimum_should_match": "80%"
  9. }
  10. }
  11. },
  12. "highlight": {
  13. "fields": {
  14. "hobby": {}
  15. }
  16. }
  17. }
  18. # 响应数据
  19. {
  20. "took": 6,
  21. "timed_out": false,
  22. "_shards": {
  23. "total": 1,
  24. "successful": 1,
  25. "skipped": 0,
  26. "failed": 0
  27. },
  28. "hits": {
  29. "total": {
  30. "value": 3,
  31. "relation": "eq"
  32. },
  33. "max_score": 2.1933634,
  34. "hits": [
  35. {
  36. "_index": "study",
  37. "_type": "_doc",
  38. "_id": "jajJdoIBU4c5cKp3GGKx",
  39. "_score": 2.1933634,
  40. "_source": {
  41. "name": "王五",
  42. "age": 22,
  43. "mail": "333@qq.com",
  44. "hobby": "羽毛球、篮球、游泳、听音乐"
  45. },
  46. "highlight": {
  47. "hobby": [
  48. "<em>羽毛球</em>、篮球、<em>游泳</em>、听音乐"
  49. ]
  50. }
  51. },
  52. {
  53. "_index": "study",
  54. "_type": "_doc",
  55. "_id": "i6jJdoIBU4c5cKp3GGKx",
  56. "_score": 1.7171206,
  57. "_source": {
  58. "name": "张三",
  59. "age": 20,
  60. "mail": "111@qq.com",
  61. "hobby": "羽毛球、乒乓球、足球"
  62. },
  63. "highlight": {
  64. "hobby": [
  65. "<em>羽毛球</em>、乒乓<em>球</em>、足球"
  66. ]
  67. }
  68. },
  69. {
  70. "_index": "study",
  71. "_type": "_doc",
  72. "_id": "jKjJdoIBU4c5cKp3GGKx",
  73. "_score": 1.6262295,
  74. "_source": {
  75. "name": "李四",
  76. "age": 21,
  77. "mail": "222@qq.com",
  78. "hobby": "羽毛球、乒乓球、足球、篮球"
  79. },
  80. "highlight": {
  81. "hobby": [
  82. "<em>羽毛球</em>、乒乓<em>球</em>、足球、篮球"
  83. ]
  84. }
  85. }
  86. ]
  87. }
  88. }

2.3、组合搜索

        在搜索时除了上面的方法外,还可以使用过滤器中的bool组合搜索。

  1. POST http://127.0.0.1:9200/study/_search
  2. # 请求数据
  3. {
  4. "query": {
  5. "bool": {
  6. "must": {
  7. "match": {
  8. "hobby": "篮球"
  9. }
  10. },
  11. "must_not": {
  12. "match": {
  13. "hobby": "音乐"
  14. }
  15. },
  16. "should": [
  17. {
  18. "match": {
  19. "hobby": "游泳"
  20. }
  21. }
  22. ]
  23. }
  24. },
  25. "highlight": {
  26. "fields": {
  27. "hobby": {}
  28. }
  29. }
  30. }
  31. # 响应数据
  32. {
  33. "took": 6,
  34. "timed_out": false,
  35. "_shards": {
  36. "total": 1,
  37. "successful": 1,
  38. "skipped": 0,
  39. "failed": 0
  40. },
  41. "hits": {
  42. "total": {
  43. "value": 1,
  44. "relation": "eq"
  45. },
  46. "max_score": 0.77041256,
  47. "hits": [
  48. {
  49. "_index": "study",
  50. "_type": "_doc",
  51. "_id": "jKjJdoIBU4c5cKp3GGKx",
  52. "_score": 0.77041256,
  53. "_source": {
  54. "name": "李四",
  55. "age": 21,
  56. "mail": "222@qq.com",
  57. "hobby": "羽毛球、乒乓球、足球、篮球"
  58. },
  59. "highlight": {
  60. "hobby": [
  61. "羽毛球、乒乓球、足球、<em>篮球</em>"
  62. ]
  63. }
  64. }
  65. ]
  66. }
  67. }

注意:上面示例中在搜索结果中必须包含篮球,不能包含音乐,如果包含了游泳,那么它的相似度更高。

评分的计算规则

  • bool 查询会为每个文档计算相关度评分 _score:将所有匹配的 must 和 should 语句的分数 _score 求和。旧版本的 Elasticsearch 还会再乘以协调因子(匹配的语句数除以 must 和 should 语句的总数),该因子自 6.0 起已被移除;本文示例中的得分即为各匹配语句得分之和(例如 2.3112378 ≈ 0.77041256 × 3)。
  • must_not 语句不会影响评分; 它的作用只是将不相关的文档排除。

注意:默认情况下,should中的内容不是必须匹配的,如果查询语句中没有must,那么就会至少匹配其中一个。当然了,也可以通过minimum_should_match参数进行控制,该值可以是数字也可以是百分比。

  1. POST http://127.0.0.1:9200/study/_search
  2. # 请求数据
  3. {
  4. "query": {
  5. "bool": {
  6. "should": [
  7. {
  8. "match": {
  9. "hobby": "游泳"
  10. }
  11. },
  12. {
  13. "match": {
  14. "hobby": "篮球"
  15. }
  16. },
  17. {
  18. "match": {
  19. "hobby": "音乐"
  20. }
  21. }
  22. ],
  23. "minimum_should_match": 2
  24. }
  25. },
  26. "highlight": {
  27. "fields": {
  28. "hobby": {}
  29. }
  30. }
  31. }
  32. # 响应数据
  33. {
  34. "took": 4,
  35. "timed_out": false,
  36. "_shards": {
  37. "total": 1,
  38. "successful": 1,
  39. "skipped": 0,
  40. "failed": 0
  41. },
  42. "hits": {
  43. "total": {
  44. "value": 1,
  45. "relation": "eq"
  46. },
  47. "max_score": 2.3112378,
  48. "hits": [
  49. {
  50. "_index": "study",
  51. "_type": "_doc",
  52. "_id": "jajJdoIBU4c5cKp3GGKx",
  53. "_score": 2.3112378,
  54. "_source": {
  55. "name": "王五",
  56. "age": 22,
  57. "mail": "333@qq.com",
  58. "hobby": "羽毛球、篮球、游泳、听音乐"
  59. },
  60. "highlight": {
  61. "hobby": [
  62. "羽毛球、<em>篮球</em>、<em>游泳</em>、听<em>音乐</em>"
  63. ]
  64. }
  65. }
  66. ]
  67. }
  68. }

2.4、权重搜索

        在一些情况下,可能会对某些词增加权重来影响该条数据的得分。

  1. POST http://127.0.0.1:9200/study/_search
  2. # 请求数据
  3. {
  4. "query": {
  5. "bool": {
  6. "must": {
  7. "match": {
  8. "hobby": {
  9. "query": "游泳篮球",
  10. "operator": "and"
  11. }
  12. }
  13. },
  14. "should": [
  15. {
  16. "match": {
  17. "hobby": {
  18. "query": "音乐",
  19. "boost": 10
  20. }
  21. }
  22. },
  23. {
  24. "match": {
  25. "hobby": {
  26. "query": "跑步",
  27. "boost": 2
  28. }
  29. }
  30. }
  31. ]
  32. }
  33. },
  34. "highlight": {
  35. "fields": {
  36. "hobby": {}
  37. }
  38. }
  39. }
  40. # 响应数据
  41. {
  42. "took": 5,
  43. "timed_out": false,
  44. "_shards": {
  45. "total": 1,
  46. "successful": 1,
  47. "skipped": 0,
  48. "failed": 0
  49. },
  50. "hits": {
  51. "total": {
  52. "value": 1,
  53. "relation": "eq"
  54. },
  55. "max_score": 9.24495,
  56. "hits": [
  57. {
  58. "_index": "study",
  59. "_type": "_doc",
  60. "_id": "jajJdoIBU4c5cKp3GGKx",
  61. "_score": 9.24495,
  62. "_source": {
  63. "name": "王五",
  64. "age": 22,
  65. "mail": "333@qq.com",
  66. "hobby": "羽毛球、篮球、游泳、听音乐"
  67. },
  68. "highlight": {
  69. "hobby": [
  70. "羽毛球、<em>篮球</em>、<em>游泳</em>、听<em>音乐</em>"
  71. ]
  72. }
  73. }
  74. ]
  75. }
  76. }

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/繁依Fanyi0/article/detail/305755
推荐阅读
相关标签
  

闽ICP备14008679号