当前位置:   article > 正文

java简单实现布谷鸟过滤器_布谷鸟过滤器java实现代码

布谷鸟过滤器java实现代码

布谷鸟过滤器,一种增强版的布隆过滤器,不同于布隆过滤器的是,存放一段hash的地方会多个位置,用于增加空间率用率,布谷鸟过滤器会有两个hash,异或算法,两个hash能找到相互的位置,用于其中一个被布谷鸟的蛋T走时的容错。

  1. /**
  2. * 布谷鸟过滤器
  3. * @Description: CuckooFilter
  4. * @Author: GeorgeYang
  5. * @Date: 2022-06-07
  6. * @Version:v1.0
  7. */
  8. public class CuckooFilter {
  9. /**
  10. * 预期处理的数据量大小
  11. */
  12. private int dataCount;
  13. /**
  14. * 位数组的数量大小
  15. */
  16. private int bitCount;
  17. private int bitSpace;
  18. /**
  19. * 鸟巢位置2
  20. * 位数组。数组中的元素只能是 0 或者 1
  21. */
  22. private BitSet nestPosition1;
  23. /**
  24. * 鸟巢位置2
  25. * 位数组。数组中的元素只能是 0 或者 1
  26. */
  27. private BitSet nestPosition2;
  28. /**
  29. * 存放包含 hash 函数的类的数组
  30. */
  31. private SliceHash[] func;
  32. /**
  33. * 静态内部类。hash 函数
  34. * 块hash
  35. */
  36. public static class SliceHash {
  37. private int bitIndex;//bitIndex
  38. private int fp;
  39. private int intSliceSpace;//hash成整数,每段长多少
  40. public SliceHash(int bitSpace, int bitIndex) {
  41. this.bitIndex = bitIndex;
  42. this.fp = bitSpace * bitIndex;
  43. this.intSliceSpace = Integer.MAX_VALUE / bitSpace;
  44. }
  45. /**
  46. * 计算 hash 值
  47. */
  48. public int hash1(Object value) {
  49. return Math.abs(bitIndex * intSliceSpace + (value.hashCode() << bitIndex) % intSliceSpace);
  50. }
  51. public int hash2(Object value) {
  52. int hash1 = this.hash1(value);
  53. int hash2 = hash1 ^ fp;
  54. return Math.abs(hash2);
  55. }
  56. }
  57. /**
  58. *
  59. * @param dataCount 预计存放总数量
  60. * @param bitSpace 每个内容,可以被拆分分散在多少个位置里面选择存放,
  61. */
  62. public CuckooFilter(int dataCount, int bitSpace) {
  63. this.dataCount = dataCount;
  64. this.bitSpace = bitSpace;
  65. //每个字符分配32个位
  66. this.bitCount = dataCount * bitSpace;
  67. func = new SliceHash[bitSpace];//16个
  68. for (int i = 0; i < bitSpace; i++) {
  69. func[i] = new SliceHash(bitSpace, i);
  70. }
  71. this.nestPosition1 = new BitSet(bitCount);
  72. this.nestPosition2 = new BitSet(bitCount);
  73. }
  74. public boolean add(Object value) {
  75. if (contains(value))
  76. return false;
  77. for (SliceHash f : func) {
  78. int hash1 = f.hash1(value);
  79. nestPosition1.set(hash1, true);
  80. nestPosition2.set(hash1, false);//T走同巢隔壁位置的蛋
  81. int hash2 = f.hash2(value);
  82. nestPosition1.set(hash2, false);//T走同巢隔壁位置的蛋
  83. nestPosition2.set(hash2, true);
  84. }
  85. return true;
  86. }
  87. public boolean contains(Object value) {
  88. for (SliceHash f : func) {
  89. int hash1 = f.hash1(value);
  90. int hash2 = f.hash2(value);
  91. boolean inPosition1 = nestPosition1.get(hash1);
  92. boolean inPosition2 = nestPosition2.get(hash2);
  93. boolean existInOne = inPosition1 || inPosition2;
  94. if (!existInOne)
  95. //两个巢位都被T走了
  96. return false;
  97. }
  98. return true;
  99. }
  100. public static void main(String[] args) throws Exception {
  101. int dataCount = 2 << 24;
  102. // int dataCount = 9000000;
  103. System.out.println(dataCount);
  104. CuckooFilter filter = new CuckooFilter(dataCount, 5);
  105. int wrongCount = 0;
  106. int addCount = 9999999;//1亿数据
  107. for (int i = 0; i < addCount; i++) {
  108. String value = "" + i;
  109. boolean exist = filter.contains(value);
  110. if (exist) {
  111. wrongCount++;
  112. }
  113. boolean addSuccess = filter.add(value);
  114. exist = filter.contains(value);
  115. if (!exist)
  116. throw new InterruptedException("添加后被认为不存在:" + value);
  117. }
  118. System.out.println("误判的数量:" + wrongCount);
  119. System.out.println("误判率:" + (wrongCount * 100d / addCount) + "%");
  120. int lostCount = 0;
  121. int existCount = 0;
  122. for (int i = 0; i < addCount; i++) {
  123. String value = "" + i;
  124. boolean exist = filter.contains(value);
  125. if (exist) {
  126. existCount++;
  127. } else {
  128. lostCount ++;
  129. }
  130. }
  131. System.out.println("存留的数量:" + (existCount - wrongCount));
  132. System.out.println("存留率:" + ((existCount - wrongCount) * 100d / addCount) + "%");
  133. System.out.println("被T除的数量:" + lostCount);
  134. System.out.println("丢失率:" + (lostCount * 100d / addCount) + "%");
  135. }
  136. }

测试添加1亿数据,运行结果:

  1. 误判的数量:876
  2. 误判率:0.008760000876000087%
  3. 存留的数量:9932122
  4. 存留率:99.32122993212299%
  5. 被T除的数量:67001
  6. 丢失率:0.6700100670010067%

还没有做扩容等操作,这里只是简单的实现。

参考:

https://zhuanlan.zhihu.com/p/462813998
https://blog.csdn.net/aaa_bbb_ccc_123_456/article/details/106055033

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/2023面试高手/article/detail/501579
推荐阅读
相关标签
  

闽ICP备14008679号