当前位置:   article > 正文

ElasticSearch单字段查询去重详解_es去重查询

es去重查询

1、SQL去重

在SQL中,用DISTINCT关键字进行去重:

  • 获取去重后的结果:SELECT DISTINCT name, sex FROM person;
  • 统计去重后的数量:SELECT COUNT(DISTINCT name, sex) FROM person;

2、ES数据构建

2.1 创建索引

  1. from elasticsearch import Elasticsearch
  2. # 连接es
  3. es = Elasticsearch(hosts=["192.168.124.49:9200"], sniffer_timeout=60, timeout=30)
  4. body = {
  5. "mappings": {
  6. "properties": {
  7. "id": {
  8. "type": "integer"
  9. },
  10. "name": {
  11. "type": "text",
  12. "fields": {
  13. "keyword": {
  14. "type": "keyword",
  15. "ignore_above": 256
  16. }
  17. }
  18. },
  19. "age": {
  20. "type": "integer"
  21. },
  22. "gender": {
  23. "type": "keyword"
  24. },
  25. "email": {
  26. "type": "text"
  27. },
  28. "province": {
  29. "type": "text",
  30. "fields": {
  31. "keyword": {
  32. "type": "keyword",
  33. "ignore_above": 256
  34. }
  35. }
  36. },
  37. "address": {
  38. "type": "text",
  39. "fields": {
  40. "keyword": {
  41. "type": "keyword",
  42. "ignore_above": 256
  43. }
  44. }
  45. },
  46. "state": {
  47. "type": "text",
  48. "fields": {
  49. "keyword": {
  50. "type": "keyword",
  51. "ignore_above": 256
  52. }
  53. }
  54. }
  55. }
  56. }
  57. }
  58. # 创建 index
  59. es.indices.create(index="person_info", body=body)

2.2 查看索引

2.3 使用kibana批量生成数据

  1. POST person_info/_bulk
  2. {"index": {"_index": "person_info"}}
  3. {"id": 1, "name": "刘一", "age": 25, "gender": "男", "email": "111@qq.com", "provience": "北京", "address": "北京市朝阳区", "status": "正常"}
  4. {"index": {"_index": "person_info"}}
  5. {"id": 1, "name": "陈二", "age": 26, "gender": "女", "email": "111@qq.com", "provience": "山东", "address": "山东省青岛市", "status": "正常"}
  6. {"index": {"_index": "person_info"}}
  7. {"id": 1, "name": "张三", "age": 27, "gender": "男", "email": "111@qq.com", "provience": "北京", "address": "北京市朝阳区", "status": "正常"}
  8. {"index": {"_index": "person_info"}}
  9. {"id": 1, "name": "李四", "age": 28, "gender": "男", "email": "111@qq.com", "provience": "山东", "address": "山东省济南市", "status": "正常"}
  10. {"index": {"_index": "person_info"}}
  11. {"id": 1, "name": "王五", "age": 25, "gender": "男", "email": "111@qq.com", "provience": "北京", "address": "北京市朝阳区", "status": "正常"}
  12. {"index": {"_index": "person_info"}}
  13. {"id": 1, "name": "刘一", "age": 26, "gender": "男", "email": "111@qq.com", "provience": "山东", "address": "山东省青岛市", "status": "正常"}
  14. {"index": {"_index": "person_info"}}
  15. {"id": 1, "name": "陈二", "age": 26, "gender": "女", "email": "111@qq.com", "provience": "北京", "address": "北京市朝阳区", "status": "正常"}

2.4 查看生成的数据

3、ES获取去重结果

3.1 collapse折叠功能(ES5.3之后支持)

  • 推荐。原因:性能高,占内存小

注意:去重的字段不能是text类型。如果xxxfield是text类型,则其mapping中要有keyword子字段,并通过xxxfield.keyword去重。

注意:如果去重字段是其他可以直接去重的类型,比如:数字类型、keyword、日期等,则直接用字段名就可以。即:如果本处xxxfield是keyword,则xxxfield.keyword处写成xxxfield就行。

查询province为北京的信息(注意:示例数据中该字段名实际写作provience,与mapping中定义的province不一致,因此查询时使用的是provience.keyword):

  1. GET person_info/_search
  2. {
  3. "query": {
  4. "match": {
  5. "provience.keyword": "北京"
  6. }
  7. }
  8. }

运行结果:

  1. {
  2. "took" : 15,
  3. "timed_out" : false,
  4. "_shards" : {
  5. "total" : 1,
  6. "successful" : 1,
  7. "skipped" : 0,
  8. "failed" : 0
  9. },
  10. "hits" : {
  11. "total" : {
  12. "value" : 4,
  13. "relation" : "eq"
  14. },
  15. "max_score" : 0.5753642,
  16. "hits" : [
  17. {
  18. "_index" : "person_info",
  19. "_type" : "_doc",
  20. "_id" : "hFHKl4YBPv2uoOpTcHMg",
  21. "_score" : 0.5753642,
  22. "_source" : {
  23. "id" : 1,
  24. "name" : "刘一",
  25. "age" : 25,
  26. "gender" : "男",
  27. "email" : "111@qq.com",
  28. "provience" : "北京",
  29. "address" : "北京市朝阳区",
  30. "status" : "正常"
  31. }
  32. },
  33. {
  34. "_index" : "person_info",
  35. "_type" : "_doc",
  36. "_id" : "hlHKl4YBPv2uoOpTcHMi",
  37. "_score" : 0.5753642,
  38. "_source" : {
  39. "id" : 1,
  40. "name" : "张三",
  41. "age" : 27,
  42. "gender" : "男",
  43. "email" : "111@qq.com",
  44. "provience" : "北京",
  45. "address" : "北京市朝阳区",
  46. "status" : "正常"
  47. }
  48. },
  49. {
  50. "_index" : "person_info",
  51. "_type" : "_doc",
  52. "_id" : "iFHKl4YBPv2uoOpTcHMi",
  53. "_score" : 0.5753642,
  54. "_source" : {
  55. "id" : 1,
  56. "name" : "王五",
  57. "age" : 25,
  58. "gender" : "男",
  59. "email" : "111@qq.com",
  60. "provience" : "北京",
  61. "address" : "北京市朝阳区",
  62. "status" : "正常"
  63. }
  64. },
  65. {
  66. "_index" : "person_info",
  67. "_type" : "_doc",
  68. "_id" : "ilHKl4YBPv2uoOpTcHMi",
  69. "_score" : 0.5753642,
  70. "_source" : {
  71. "id" : 1,
  72. "name" : "陈二",
  73. "age" : 26,
  74. "gender" : "女",
  75. "email" : "111@qq.com",
  76. "provience" : "北京",
  77. "address" : "北京市朝阳区",
  78. "status" : "正常"
  79. }
  80. }
  81. ]
  82. }
  83. }

查询province为北京,且根据年龄去重的信息:

  1. # collapse获取去重结果
  2. GET person_info/_search
  3. {
  4. "query": {
  5. "match": {
  6. "provience.keyword": "北京"
  7. }
  8. },
  9. "collapse": {
  10. "field": "age"
  11. }
  12. }

运行结果:

  1. {
  2. "took" : 14,
  3. "timed_out" : false,
  4. "_shards" : {
  5. "total" : 1,
  6. "successful" : 1,
  7. "skipped" : 0,
  8. "failed" : 0
  9. },
  10. "hits" : {
  11. "total" : {
  12. "value" : 4,
  13. "relation" : "eq"
  14. },
  15. "max_score" : null,
  16. "hits" : [
  17. {
  18. "_index" : "person_info",
  19. "_type" : "_doc",
  20. "_id" : "hFHKl4YBPv2uoOpTcHMg",
  21. "_score" : 0.5753642,
  22. "_source" : {
  23. "id" : 1,
  24. "name" : "刘一",
  25. "age" : 25,
  26. "gender" : "男",
  27. "email" : "111@qq.com",
  28. "provience" : "北京",
  29. "address" : "北京市朝阳区",
  30. "status" : "正常"
  31. },
  32. "fields" : {
  33. "age" : [
  34. 25
  35. ]
  36. }
  37. },
  38. {
  39. "_index" : "person_info",
  40. "_type" : "_doc",
  41. "_id" : "hlHKl4YBPv2uoOpTcHMi",
  42. "_score" : 0.5753642,
  43. "_source" : {
  44. "id" : 1,
  45. "name" : "张三",
  46. "age" : 27,
  47. "gender" : "男",
  48. "email" : "111@qq.com",
  49. "provience" : "北京",
  50. "address" : "北京市朝阳区",
  51. "status" : "正常"
  52. },
  53. "fields" : {
  54. "age" : [
  55. 27
  56. ]
  57. }
  58. },
  59. {
  60. "_index" : "person_info",
  61. "_type" : "_doc",
  62. "_id" : "ilHKl4YBPv2uoOpTcHMi",
  63. "_score" : 0.5753642,
  64. "_source" : {
  65. "id" : 1,
  66. "name" : "陈二",
  67. "age" : 26,
  68. "gender" : "女",
  69. "email" : "111@qq.com",
  70. "provience" : "北京",
  71. "address" : "北京市朝阳区",
  72. "status" : "正常"
  73. },
  74. "fields" : {
  75. "age" : [
  76. 26
  77. ]
  78. }
  79. }
  80. ]
  81. }
  82. }

3.2 字段聚合+top_hits聚合

  • 不推荐。原因:性能差,占内存大

查询province为北京,且根据年龄去重的信息:

  1. # 聚合获取去重结果
  2. GET person_info/_search
  3. {
  4. "query": {
  5. "match": {
  6. "provience.keyword": "北京"
  7. }
  8. },
  9. "size": 0,
  10. "aggs": {
  11. "age_aggs": {
  12. "terms": {
  13. "field": "age",
  14. "size": 10
  15. },
  16. "aggs": {
  17. "age_top": {
  18. "top_hits": {
  19. "sort": [{
  20. "age": {
  21. "order": "desc"
  22. }
  23. }],
  24. "size": 1
  25. }
  26. }
  27. }
  28. }
  29. }
  30. }

运行结果:

  1. {
  2. "took" : 230,
  3. "timed_out" : false,
  4. "_shards" : {
  5. "total" : 1,
  6. "successful" : 1,
  7. "skipped" : 0,
  8. "failed" : 0
  9. },
  10. "hits" : {
  11. "total" : {
  12. "value" : 4,
  13. "relation" : "eq"
  14. },
  15. "max_score" : null,
  16. "hits" : [ ]
  17. },
  18. "aggregations" : {
  19. "age_aggs" : {
  20. "doc_count_error_upper_bound" : 0,
  21. "sum_other_doc_count" : 0,
  22. "buckets" : [
  23. {
  24. "key" : 25,
  25. "doc_count" : 2,
  26. "age_top" : {
  27. "hits" : {
  28. "total" : {
  29. "value" : 2,
  30. "relation" : "eq"
  31. },
  32. "max_score" : null,
  33. "hits" : [
  34. {
  35. "_index" : "person_info",
  36. "_type" : "_doc",
  37. "_id" : "hFHKl4YBPv2uoOpTcHMg",
  38. "_score" : null,
  39. "_source" : {
  40. "id" : 1,
  41. "name" : "刘一",
  42. "age" : 25,
  43. "gender" : "男",
  44. "email" : "111@qq.com",
  45. "provience" : "北京",
  46. "address" : "北京市朝阳区",
  47. "status" : "正常"
  48. },
  49. "sort" : [
  50. 25
  51. ]
  52. }
  53. ]
  54. }
  55. }
  56. },
  57. {
  58. "key" : 26,
  59. "doc_count" : 1,
  60. "age_top" : {
  61. "hits" : {
  62. "total" : {
  63. "value" : 1,
  64. "relation" : "eq"
  65. },
  66. "max_score" : null,
  67. "hits" : [
  68. {
  69. "_index" : "person_info",
  70. "_type" : "_doc",
  71. "_id" : "ilHKl4YBPv2uoOpTcHMi",
  72. "_score" : null,
  73. "_source" : {
  74. "id" : 1,
  75. "name" : "陈二",
  76. "age" : 26,
  77. "gender" : "女",
  78. "email" : "111@qq.com",
  79. "provience" : "北京",
  80. "address" : "北京市朝阳区",
  81. "status" : "正常"
  82. },
  83. "sort" : [
  84. 26
  85. ]
  86. }
  87. ]
  88. }
  89. }
  90. },
  91. {
  92. "key" : 27,
  93. "doc_count" : 1,
  94. "age_top" : {
  95. "hits" : {
  96. "total" : {
  97. "value" : 1,
  98. "relation" : "eq"
  99. },
  100. "max_score" : null,
  101. "hits" : [
  102. {
  103. "_index" : "person_info",
  104. "_type" : "_doc",
  105. "_id" : "hlHKl4YBPv2uoOpTcHMi",
  106. "_score" : null,
  107. "_source" : {
  108. "id" : 1,
  109. "name" : "张三",
  110. "age" : 27,
  111. "gender" : "男",
  112. "email" : "111@qq.com",
  113. "provience" : "北京",
  114. "address" : "北京市朝阳区",
  115. "status" : "正常"
  116. },
  117. "sort" : [
  118. 27
  119. ]
  120. }
  121. ]
  122. }
  123. }
  124. }
  125. ]
  126. }
  127. }
  128. }

4、ES统计去重后的数量

  • 聚合+cardinality聚合函数

查询province为北京,且根据年龄去重的数量:

  1. # 聚合获取去重数量
  2. GET person_info/_search
  3. {
  4. "query": {
  5. "match": {
  6. "provience.keyword": "北京"
  7. }
  8. },
  9. "size": 0,
  10. "aggs": {
  11. "age_aggs": {
  12. "cardinality": {
  13. "field": "age"
  14. }
  15. }
  16. }
  17. }

运行结果:

  1. {
  2. "took" : 68,
  3. "timed_out" : false,
  4. "_shards" : {
  5. "total" : 1,
  6. "successful" : 1,
  7. "skipped" : 0,
  8. "failed" : 0
  9. },
  10. "hits" : {
  11. "total" : {
  12. "value" : 4,
  13. "relation" : "eq"
  14. },
  15. "max_score" : null,
  16. "hits" : [ ]
  17. },
  18. "aggregations" : {
  19. "age_aggs" : {
  20. "value" : 3
  21. }
  22. }
  23. }

参考博文:

ElasticSearch--去重查询/根据字段去重--方法/实例_IT利刃出鞘的博客-CSDN博客_elasticsearch统计去重后的数量准确值

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Guff_9hys/article/detail/777432
推荐阅读
相关标签
  

闽ICP备14008679号