当前位置:   article > 正文

c++实现布隆过滤器_sdbmhash c++

sdbmhash c++

hashfuntion.h(常用字符串Hash函数

unsigned int SDBMHash(const char *str);
unsigned int RSHash(const char *str);
unsigned int JSHash(const char *str);
unsigned int PJWHash(const char *str);
unsigned int APHash(const char *str);
unsigned int DJBHash(const char *str);
unsigned int ELFHash(const char *str);
unsigned int BKDRHash(const char *str);
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8

hashfuntion.cpp

#include "hashfunction.h"

unsigned int SDBMHash(const char *str)
{
    unsigned int hash = 0;
    while (*str)
    {
        // equivalent to: hash = 65599*hash + (*str++);
        hash = (*str++) + (hash << 6) + (hash << 16) - hash;
    }
    return (hash & 0x7FFFFFFF);
}

//RS Hash Function
unsigned int RSHash(const char *str)
{
    unsigned int b = 378551;
    unsigned int a = 63689;
    unsigned int hash = 0;
    while (*str)
    {
        hash = hash * a + (*str++);
        a *= b;
    }
    return (hash & 0x7FFFFFFF);
}

//JS Hash Function
unsigned int JSHash(const char *str)
{
    unsigned int hash = 1315423911;
    while (*str)
    {
        hash ^= ((hash << 5) + (*str++) + (hash >> 2));
    }
    return (hash & 0x7FFFFFFF);
}

//P. J. Weinberger Hash Function
unsigned int PJWHash(const char *str)
{
    unsigned int BitsInUnignedInt = (unsigned int)(sizeof(unsigned int) * 8);
    unsigned int ThreeQuarters    = (unsigned int)((BitsInUnignedInt  * 3) / 4);
    unsigned int OneEighth        = (unsigned int)(BitsInUnignedInt / 8);
    unsigned int HighBits         = (unsigned int)(0xFFFFFFFF) << (BitsInUnignedInt - OneEighth);
    unsigned int hash             = 0;
    unsigned int test             = 0;
    while (*str)
    {
        hash = (hash << OneEighth) + (*str++);
        if ((test = hash & HighBits) != 0)
        {
            hash = ((hash ^ (test >> ThreeQuarters)) & (~HighBits));
        }
    }
    return (hash & 0x7FFFFFFF);
}

//AP Hash Function
unsigned int APHash(const char *str)
{
    unsigned int hash = 0;
    int i;
    for (i=0; *str; i++)
    {
        if ((i & 1) == 0)
        {
            hash ^= ((hash << 7) ^ (*str++) ^ (hash >> 3));
        }
        else
        {
            hash ^= (~((hash << 11) ^ (*str++) ^ (hash >> 5)));
        }
    }
    return (hash & 0x7FFFFFFF);
}

//DJB Hash Function
unsigned int DJBHash(const char *str)
{
    unsigned int hash = 5381;
    while (*str)
    {
        hash += (hash << 5) + (*str++);
    }
    return (hash & 0x7FFFFFFF);
}

// ELF Hash Function
unsigned int ELFHash(const char *str)
{
    unsigned int hash = 0;
    unsigned int x    = 0;
    while (*str)
    {
        hash = (hash << 4) + (*str++);
        if ((x = hash & 0xF0000000L) != 0)
        {
            hash ^= (x >> 24);
            hash &= ~x;
        }
    }
    return (hash & 0x7FFFFFFF);
}

//BKDR Hash Function
unsigned int BKDRHash(const char *str)
{
    unsigned int seed = 131; // 31 131 1313 13131 131313 etc..
    unsigned int hash = 0;
    while (*str)
    {
        hash = hash * seed + (*str++);
    }
    return (hash & 0x7FFFFFFF);
}
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116

bloomfilter.h

#ifndef BLOOMFILTER_H
#define BLOOMFILTER_H

#include <stdio.h>
#include <iostream>
#include <string>
#include <math.h>
#include <vector>
#include <fstream>
#include "hashfunction.h"

#define INT_SIZE 32
#define BUFFER_SIZE 1024

class BloomFilter{
public:
    //样本个数n,期望的失误率p,传入样本的文档路径
    BloomFilter(double errRate,int sampleNum);
    ~BloomFilter();
    void filterInit();                      //初始化布隆过滤器
    void arrayGenerate(std::string path);   //打开path路径的文档,计算每一行样本到array中
    bool isContain(const char* str);        //查看字符串是否在样本中存在
    void storeArray(std::string path);      //把array存储在指定路径文件path里
    void restoreArray(std::string path);    //把array从指定路径文件path里恢复载入
    int getHashFunNum();                    //返回需要的哈希函数的个数k
    int getIntNum();                        //返回需要的内存的int数
private:
    int hashtableInit();    //把几个哈希函数加入到hastable中
    std::string path;       //传入样本的文档路径
    double errRate;         //样本失误率p
    int sampleNum;          //样本个数n
    int bitNum;             //需要的二进制位数m
    int intNum;             //需要申请内存的int数
    int hashFunNum;         //需要的哈希函数的个数k,注意计算得到的哈希函数个数k应该<=hashtable.size();
    int *array;             //内存
    std::vector<unsigned int (*)(const char*)> hashtable;    //存放计算字符串哈希值的哈希函数
};

#endif // BLOOMFILTER_H
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39

bloomfilter.cpp

#include "bloomfilter.h"

BloomFilter::BloomFilter(double errRate,int sampleNum)
{
    this->errRate=errRate;
    this->sampleNum=sampleNum;
    this->bitNum=-((this->sampleNum*log(this->errRate))/(log(2)*log(2)));
    this->hashFunNum=0.7*(this->bitNum/this->sampleNum);
    this->intNum=this->bitNum/INT_SIZE+1;
    array=new int[this->intNum];
}

BloomFilter::~BloomFilter()
{
    delete []array;
}

void BloomFilter::filterInit()
{
    hashtableInit();
    if(this->hashFunNum>(int)hashtable.size())
    {
        std::cout<<"哈希函数不足,请添加"<<std::endl;
        return;
    }
}

int BloomFilter::hashtableInit()
{
    hashtable.push_back(*PJWHash);
    hashtable.push_back(*JSHash);
    hashtable.push_back(*RSHash);
    hashtable.push_back(*SDBMHash);
    hashtable.push_back(*APHash);
    hashtable.push_back(*DJBHash);
    hashtable.push_back(*BKDRHash);
    hashtable.push_back(*ELFHash);
    return hashtable.size();
}

void BloomFilter::arrayGenerate(std::string path)
{
    int hashval;
    std::fstream fs;
    fs.open(path.c_str(),std::ios::in|std::ios::out);
    if(!fs)
    {
        perror("open file error");
        return;
    }
    fs.seekp(std::ios::beg);
    char buf[BUFFER_SIZE]={0};
    fs.getline(buf,BUFFER_SIZE);
    while(!fs.eof())
    {
        for(int i=0;i!=this->hashFunNum;i++)
        {
            hashval=hashtable[i](buf);
            hashval=hashval%(this->intNum*INT_SIZE);
            this->array[hashval/INT_SIZE]|=(0x1<<(hashval%INT_SIZE));
        }
        fs.getline(buf,BUFFER_SIZE);
    }
    fs.clear();
    fs.close();
}

bool BloomFilter::isContain(const char* str)
{
    int hashval;
    for(int i=0;i!=this->hashFunNum;i++)
    {
        hashval=hashtable[i](str);
        hashval=hashval%(this->intNum*INT_SIZE);
        if(array[hashval/INT_SIZE]&(0x1<<(hashval%INT_SIZE)))
            continue;
        else
            return false;
    }
    return true;
}

void BloomFilter::storeArray(std::string path)
{
    std::fstream fs;
    fs.open(path.c_str(),std::ios::in|std::ios::out);
    if(!fs)
    {
        fs.open(path.c_str(),std::ios::app|std::ios::in|std::ios::out);
        for(int k=0;k<this->intNum;k++)
            fs<<array[k]<<std::endl;
    }
    else
    {
        fs.seekp(std::ios::beg);
        for(int k=0;k<this->intNum;k++)
            fs<<array[k]<<std::endl;
    }
    fs.clear();
    fs.close();
}

void BloomFilter::restoreArray(std::string path)
{
    std::fstream fs;
    fs.open(path.c_str(),std::ios::in|std::ios::out);
    if(!fs)
    {
        perror("open file error");
        return;
    }
    fs.seekp(std::ios::beg);
    char buf[INT_SIZE]={0};
    fs.getline(buf,INT_SIZE);
    int i=0;
    while(!fs.eof())
    {
        sscanf(buf,"%d\n",&this->array[i++]);
        fs.getline(buf,INT_SIZE);
    }
    fs.clear();
    fs.close();
}

int BloomFilter::getHashFunNum()
{
    return this->hashFunNum;
}

int BloomFilter::getIntNum()
{
    return this->intNum;
}
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133

测试:

main.cpp

加载文件生成array并保存:

#include "tool/bloomfilter.h"
using namespace std;

int main(int argc, char *argv[])
{
    BloomFilter mybloom(0.01,100);
    mybloom.filterInit();
    mybloom.arrayGenerate("../tool/redlist.txt");
    cout<<"intNum:"<<mybloom.getIntNum()<<",hashFunNum:"<<mybloom.getHashFunNum()<<endl;
    cout<<"www.dubai.com在我的集合中吗:"<<(mybloom.isContain("www.dubai.com")?"在":"不在")<<endl;
    cout<<"www.qq.com在我的集合中吗:"<<(mybloom.isContain("www.qq.com")?"在":"不在")<<endl;
    mybloom.storeArray("../tool/bloom_array.txt");
    return 0;
}
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14

从文件中读取恢复array:

#include "tool/bloomfilter.h"
using namespace std;

int main(int argc, char *argv[])
{
    BloomFilter mybloom(0.01,100);
    mybloom.filterInit();
    mybloom.restoreArray("../tool/bloom_array.txt");
    cout<<"www.dubai.com在我的集合中吗:"<<(mybloom.isContain("www.dubai.com")?"在":"不在")<<endl;
    cout<<"www.qq.com在我的集合中吗:"<<(mybloom.isContain("www.qq.com")?"在":"不在")<<endl;
    return 0;
}
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/我家自动化/article/detail/382677
推荐阅读
相关标签
  

闽ICP备14008679号