赞
踩
布谷鸟过滤器,一种增强版的布隆过滤器,不同于布隆过滤器的是,存放一段hash的地方会多个位置,用于增加空间率用率,布谷鸟过滤器会有两个hash,异或算法,两个hash能找到相互的位置,用于其中一个被布谷鸟的蛋T走时的容错。
-
- /**
- * 布谷鸟过滤器
- * @Description: CuckooFilter
- * @Author: GeorgeYang
- * @Date: 2022-06-07
- * @Version:v1.0
- */
- public class CuckooFilter {
- /**
- * 预期处理的数据量大小
- */
- private int dataCount;
- /**
- * 位数组的数量大小
- */
- private int bitCount;
-
- private int bitSpace;
- /**
- * 鸟巢位置2
- * 位数组。数组中的元素只能是 0 或者 1
- */
- private BitSet nestPosition1;
- /**
- * 鸟巢位置2
- * 位数组。数组中的元素只能是 0 或者 1
- */
- private BitSet nestPosition2;
- /**
- * 存放包含 hash 函数的类的数组
- */
- private SliceHash[] func;
-
- /**
- * 静态内部类。hash 函数
- * 块hash
- */
- public static class SliceHash {
- private int bitIndex;//bitIndex
- private int fp;
- private int intSliceSpace;//hash成整数,每段长多少
-
- public SliceHash(int bitSpace, int bitIndex) {
- this.bitIndex = bitIndex;
- this.fp = bitSpace * bitIndex;
- this.intSliceSpace = Integer.MAX_VALUE / bitSpace;
- }
-
- /**
- * 计算 hash 值
- */
- public int hash1(Object value) {
- return Math.abs(bitIndex * intSliceSpace + (value.hashCode() << bitIndex) % intSliceSpace);
- }
-
- public int hash2(Object value) {
- int hash1 = this.hash1(value);
- int hash2 = hash1 ^ fp;
- return Math.abs(hash2);
- }
-
- }
-
- /**
- *
- * @param dataCount 预计存放总数量
- * @param bitSpace 每个内容,可以被拆分分散在多少个位置里面选择存放,
- */
- public CuckooFilter(int dataCount, int bitSpace) {
- this.dataCount = dataCount;
- this.bitSpace = bitSpace;
- //每个字符分配32个位
- this.bitCount = dataCount * bitSpace;
- func = new SliceHash[bitSpace];//16个
- for (int i = 0; i < bitSpace; i++) {
- func[i] = new SliceHash(bitSpace, i);
- }
- this.nestPosition1 = new BitSet(bitCount);
- this.nestPosition2 = new BitSet(bitCount);
- }
-
- public boolean add(Object value) {
- if (contains(value))
- return false;
- for (SliceHash f : func) {
- int hash1 = f.hash1(value);
- nestPosition1.set(hash1, true);
- nestPosition2.set(hash1, false);//T走同巢隔壁位置的蛋
- int hash2 = f.hash2(value);
- nestPosition1.set(hash2, false);//T走同巢隔壁位置的蛋
- nestPosition2.set(hash2, true);
- }
- return true;
- }
-
- public boolean contains(Object value) {
- for (SliceHash f : func) {
- int hash1 = f.hash1(value);
- int hash2 = f.hash2(value);
- boolean inPosition1 = nestPosition1.get(hash1);
- boolean inPosition2 = nestPosition2.get(hash2);
- boolean existInOne = inPosition1 || inPosition2;
- if (!existInOne)
- //两个巢位都被T走了
- return false;
- }
- return true;
- }
-
- public static void main(String[] args) throws Exception {
- int dataCount = 2 << 24;
- // int dataCount = 9000000;
- System.out.println(dataCount);
- CuckooFilter filter = new CuckooFilter(dataCount, 5);
- int wrongCount = 0;
- int addCount = 9999999;//1亿数据
- for (int i = 0; i < addCount; i++) {
- String value = "" + i;
- boolean exist = filter.contains(value);
- if (exist) {
- wrongCount++;
- }
- boolean addSuccess = filter.add(value);
- exist = filter.contains(value);
- if (!exist)
- throw new InterruptedException("添加后被认为不存在:" + value);
- }
- System.out.println("误判的数量:" + wrongCount);
- System.out.println("误判率:" + (wrongCount * 100d / addCount) + "%");
- int lostCount = 0;
- int existCount = 0;
- for (int i = 0; i < addCount; i++) {
- String value = "" + i;
- boolean exist = filter.contains(value);
- if (exist) {
- existCount++;
- } else {
- lostCount ++;
- }
- }
- System.out.println("存留的数量:" + (existCount - wrongCount));
- System.out.println("存留率:" + ((existCount - wrongCount) * 100d / addCount) + "%");
- System.out.println("被T除的数量:" + lostCount);
- System.out.println("丢失率:" + (lostCount * 100d / addCount) + "%");
- }
-
- }
测试添加1亿数据,运行结果:
- 误判的数量:876
- 误判率:0.008760000876000087%
- 存留的数量:9932122
- 存留率:99.32122993212299%
- 被T除的数量:67001
- 丢失率:0.6700100670010067%
还没有做扩容等操作,这里只是简单的实现。
参考:
https://zhuanlan.zhihu.com/p/462813998
https://blog.csdn.net/aaa_bbb_ccc_123_456/article/details/106055033
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。