Bloom filter concept and code analysis

  • 2020-04-02 01:40:59
  • OfStack

A brief introduction.
1. What is bloom filter?
Bloom filter is made by Howard Bloom in 1970 binary vector data structure, it has the very good space and time efficiency, be used to detect an element is a member of the collection, the test will only mistake within the collection of data, but not to mistake of not within a collection of data, so that each test request returns have "within the collection (may)" and "not in the collection (absolute) is not set" two situations, visible Bloom filter is sacrificed for the correct time and space.

2. How to calculate bloom filter?
Such as the need to figure out whether an element in a collection, our usual practice is to put all the elements, and then by comparing know if it is within the collection, linked lists, trees, are based on this thinking, the number of elements in the collection, we need the space and time is linear, the retrieval speed is also more and more slow. Bloom filter USES the method of hash function to map an element to a point on an m-length array. When the point is 1, the element is in the set, otherwise it is not in the set. The disadvantage of this method is that there may be conflicts when there are many elements to detect. The solution is to use k hash functions for k points. If all the points are 1, then the element is in the set, and if there are 0, then the element is not in the set.

3. Characteristics of bloom filter?
The advantage of Bloom filter is that its insertion and query time are constant, in addition, it queries elements but does not save the elements themselves, so it has good security. Its drawback is obvious, when inserted into the element, the more the greater the probability of mistake "in the collection", also Bloom filter can remove an element, because the result of multiple elements hash in Bloom filter structure is occupied in the same place, if you remove a bit, may affect the detection of multiple elements.

Two. Code implementation
Now the function code of bloom filter is implemented under Linux:


// bloom.h:
#ifndef __BLOOM_H__
#define __BLOOM_H__
#include<stdlib.h>
typedef unsigned int (*hashfunc_t)(const char *);
typedef struct {
size_t asize;
unsigned char *a;
size_t nfuncs;
hashfunc_t *funcs;
} BLOOM;
BLOOM *bloom_create(size_t size, size_t nfuncs, ...);
int bloom_destroy(BLOOM *bloom);
int bloom_add(BLOOM *bloom, const char *s);
int bloom_check(BLOOM *bloom, const char *s);
#endif
// bloom.c:
#include<limits.h>
#include<stdarg.h>
#include"bloom.h"
#define SETBIT(a, n) (a[n/CHAR_BIT] |= (1<<(n%CHAR_BIT)))
#define GETBIT(a, n) (a[n/CHAR_BIT] & (1<<(n%CHAR_BIT)))
BLOOM *bloom_create(size_t size, size_t nfuncs, ...)
{
BLOOM *bloom;
va_list l;
int n;
if(!(bloom=malloc(sizeof(BLOOM)))) return NULL;
if(!(bloom->a=calloc((size+CHAR_BIT-1)/CHAR_BIT, sizeof(char)))) {
free(bloom);
return NULL;
}
if(!(bloom->funcs=(hashfunc_t*)malloc(nfuncs*sizeof(hashfunc_t)))) {
free(bloom->a);
free(bloom);
return NULL;
}
va_start(l, nfuncs);
for(n=0; n<nfuncs; ++n) {
bloom->funcs[n]=va_arg(l, hashfunc_t);
}
va_end(l);
bloom->nfuncs=nfuncs;
bloom->asize=size;
return bloom;
}
int bloom_destroy(BLOOM *bloom)
{
free(bloom->a);
free(bloom->funcs);
free(bloom);
return 0;
}
int bloom_add(BLOOM *bloom, const char *s)
{
size_t n;
for(n=0; n<bloom->nfuncs; ++n) {
SETBIT(bloom->a, bloom->funcs[n](s)%bloom->asize);
}
return 0;
}
int bloom_check(BLOOM *bloom, const char *s)
{
size_t n;
for(n=0; n<bloom->nfuncs; ++n) {
if(!(GETBIT(bloom->a, bloom->funcs[n](s)%bloom->asize))) return 0;
}
return 1;
}   

// test.c:
#include<stdio.h>
#include<string.h>
#include"bloom.h"
//The following are two hash algorithm functions
unsigned int sax_hash(const char *key)
{
unsigned int h=0;
while(*key) h^=(h<<5)+(h>>2)+(unsigned char)*key++;
return h;
}

unsigned int sdbm_hash(const char *key)
{
unsigned int h=0;
while(*key) h=(unsigned char)*key++ + (h<<6) + (h<<16) - h;
return h;
}
int main(int argc, char *argv[])
{
FILE *fp;
char line[1024];
char *p;
BLOOM *bloom;
if(argc<2) {
fprintf(stderr, "ERROR: No word file specifiedn");
return EXIT_FAILURE;
}
if(!(bloom=bloom_create(2500000, 2, sax_hash, sdbm_hash))) {
fprintf(stderr, "ERROR: Could not create bloom filtern");
return EXIT_FAILURE;
}
if(!(fp=fopen(argv[1], "r"))) {
fprintf(stderr, "ERROR: Could not open file %sn", argv[1]);
return EXIT_FAILURE;
}
while(fgets(line, 1024, fp)) {
if((p=strchr(line, 'r'))) *p='0';//enter
if((p=strchr(line, 'n'))) *p='0';//A newline
bloom_add(bloom, line);
}
fclose(fp);
while(fgets(line, 1024, stdin)) {
if((p=strchr(line, 'r'))) *p='0';
if((p=strchr(line, 'n'))) *p='0';
p=strtok(line, " t,.;:rn?!-/()");
while(p) {
if(!bloom_check(bloom, p)) {
printf("No match for ford "%s"n", p);
}
                    else
                      printf("match for ford "%s"n",p);
p=strtok(NULL, " t,.;:rn?!-/()");
}
}
bloom_destroy(bloom);
return EXIT_SUCCESS;
}   

// Makefile:
   all: bloom
bloom: bloom.o test.o
           cc -o bloom -Wall -pedantic bloom.o test.o
bloom.o: bloom.c bloom.h
           cc -o bloom.o -Wall -pedantic -ansi -c bloom.c
test.o: test.c bloom.h
           cc -o test.o -Wall -pedantic -ansi -c test.c


Related articles: