from setuptools import setup
from setuptools.extension import Extension
from Cython.Distutils import build_ext
from os.path import join
import functools
import os

# Resolve include/library paths relative to the current working directory
# (the build is expected to run from the project root).
rel = functools.partial(join, os.getcwd())

ext_modules = [
    Extension(
        "_pybloof",
        extra_compile_args=['-std=gnu99', '-O2', '-D_LARGEFILE64_SOURCE'],
        sources=["src/_pybloof.pyx",
                 'src/MurmurHash3.c'],

        include_dirs=[rel('src')],
        library_dirs=[rel('src')]
    )
]

setup(
    name='pybloof',
    version='0.1',
    author='Jake Heinz',
    author_email='me@jh.gg',
    url="http://github.com/jhgg/pybloof",
    description='A high performance python bloom filter thing.',
    license='MIT License',
    cmdclass={'build_ext': build_ext},
    zip_safe=False,
    package_dir={'': 'src'},
    # BUGFIX: was 'pyblouf' — a typo matching no file, so the pure-Python
    # helper module would silently never be installed.
    # NOTE(review): the module in this patch is created as src/bybloof.py
    # (itself a typo); that file must be renamed to pybloof.py to match —
    # verify before release.
    py_modules=['pybloof'],
    ext_modules=ext_modules,
    test_suite='nose.collector'
)
+ +#include "MurmurHash3.h" + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define FORCE_INLINE __forceinline + +#include + +#define ROTL32(x,y) _rotl(x,y) +#define ROTL64(x,y) _rotl64(x,y) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#define FORCE_INLINE inline __attribute__((always_inline)) + +inline uint32_t rotl32 ( uint32_t x, int8_t r ) +{ + return (x << r) | (x >> (32 - r)); +} + +inline uint64_t rotl64 ( uint64_t x, int8_t r ) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i ) +{ + return p[i]; +} + +FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) +{ + return p[i]; +} + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +FORCE_INLINE uint32_t fmix32 ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +FORCE_INLINE uint64_t fmix64 ( uint64_t k ) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + const uint32_t c1 = 
0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock32(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix32(h1); + + *(uint32_t*)out = h1; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_128 ( const void * key, const int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + const uint32_t c1 = 0x239b961b; + const uint32_t c2 = 0xab0e9789; + const uint32_t c3 = 0x38b34ae5; + const uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock32(blocks,i*4+0); + uint32_t k2 = getblock32(blocks,i*4+1); + uint32_t k3 = getblock32(blocks,i*4+2); + uint32_t k4 = getblock32(blocks,i*4+3); + + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + + h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; + + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; + + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; + + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = (const 
uint8_t*)(data + nblocks*16); + + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch(len & 15) + { + case 15: k4 ^= tail[14] << 16; + case 14: k4 ^= tail[13] << 8; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + case 12: k3 ^= tail[11] << 24; + case 11: k3 ^= tail[10] << 16; + case 10: k3 ^= tail[ 9] << 8; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + case 8: k2 ^= tail[ 7] << 24; + case 7: k2 ^= tail[ 6] << 16; + case 6: k2 ^= tail[ 5] << 8; + case 5: k2 ^= tail[ 4] << 0; + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + case 4: k1 ^= tail[ 3] << 24; + case 3: k1 ^= tail[ 2] << 16; + case 2: k1 ^= tail[ 1] << 8; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = fmix32(h1); + h2 = fmix32(h2); + h3 = fmix32(h3); + h4 = fmix32(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + ((uint32_t*)out)[0] = h1; + ((uint32_t*)out)[1] = h2; + ((uint32_t*)out)[2] = h3; + ((uint32_t*)out)[3] = h4; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128 ( const void * key, const int len, + const uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *)(data); + + for(int i = 0; i < nblocks; i++) + { + uint64_t k1 = getblock64(blocks,i*2+0); + uint64_t k2 = getblock64(blocks,i*2+1); + + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + + h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + + k2 *= c2; k2 = 
ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) + { + case 15: k2 ^= ((uint64_t)tail[14]) << 48; + case 14: k2 ^= ((uint64_t)tail[13]) << 40; + case 13: k2 ^= ((uint64_t)tail[12]) << 32; + case 12: k2 ^= ((uint64_t)tail[11]) << 24; + case 11: k2 ^= ((uint64_t)tail[10]) << 16; + case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; + case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; + case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; + case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; + case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; + case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; + case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; + case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; + case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + ((uint64_t*)out)[0] = h1; + ((uint64_t*)out)[1] = h2; +} + + +void MurmurHash3_x64_128_long( const long long key, const uint32_t seed, void * out ) { + MurmurHash3_x64_128(&key, sizeof(long long), seed, out); +} + +//----------------------------------------------------------------------------- \ No newline at end of file diff --git a/src/MurmurHash3.h b/src/MurmurHash3.h new file mode 100644 index 0000000..81e799f --- /dev/null +++ b/src/MurmurHash3.h @@ -0,0 +1,39 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. 
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.

#ifndef _MURMURHASH3_H_
#define _MURMURHASH3_H_

//-----------------------------------------------------------------------------
// Platform-specific functions and macros

// Microsoft Visual Studio (pre-2010 has no <stdint.h>)

#if defined(_MSC_VER) && (_MSC_VER < 1600)

typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned __int64 uint64_t;

// Other compilers

#else // defined(_MSC_VER)

// BUGFIX: the include target was stripped in transit; restore <stdint.h>
// for the fixed-width integer types used in the prototypes below.
#include <stdint.h>

#endif // !defined(_MSC_VER)

//-----------------------------------------------------------------------------

void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out );

void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );

void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );

void MurmurHash3_x64_128_long( const long long key, const uint32_t seed, void * out );

//-----------------------------------------------------------------------------

#endif // _MURMURHASH3_H_
# _pybloof.pyx — bloom filters backed by a bitarray and MurmurHash3.
# Bucket indexes use Kirsch–Mitzenmacher double hashing: two 64-bit hashes
# h1, h2 combined as (h1 + i*h2) % size stand in for k independent hashes.
from cpython cimport array
cimport cpython
cimport cython

import bitarray
import struct
import base64

from libc.string cimport memcpy

cdef array.array char_array_template = array.array('b', [])

cdef extern from "MurmurHash3.h" nogil:
    void MurmurHash3_x64_128(void *key, int len, unsigned int seed, void *out)
    # BUGFIX: was declared "long key"; the C prototype takes "long long",
    # which has a different width on 32-bit and LLP64 (Windows) platforms.
    void MurmurHash3_x64_128_long(long long key, unsigned int seed, void *out)

cdef extern from "stdlib.h" nogil:
    long long int llabs(long long int j)


def hash(key, int seed=0):
    """Return the 128-bit MurmurHash3 (x64 variant) of *key* as a Python long.

    Unicode keys are UTF-8 encoded first; the two 64-bit halves are packed
    into a single signed 128-bit integer.
    """
    cdef long long result[2]
    if isinstance(key, unicode):
        key = key.encode('utf8')

    MurmurHash3_x64_128(key, len(key), seed, result)
    return long(result[0]) << 64 | (long(result[1]) & 0xFFFFFFFFFFFFFFFF)


def hash_long(long long key, int seed=0):
    """Return the 128-bit MurmurHash3 of an 8-byte integer key as a Python long."""
    cdef long long result[2]
    MurmurHash3_x64_128_long(key, seed, result)
    return long(result[0]) << 64 | (long(result[1]) & 0xFFFFFFFFFFFFFFFF)


@cython.boundscheck(False)
cdef void _get_hash_buckets(key, unsigned long long * _bucket_indexes,
                            unsigned int hash_count, unsigned long size):
    """Fill _bucket_indexes[0..hash_count) with bit positions (< size) for *key*."""
    # BUGFIX: the result slots must be 64-bit.  "unsigned long" is only
    # 32 bits on LLP64 platforms, so MurmurHash3_x64_128's 16-byte write
    # overflowed the array there.
    cdef unsigned long long result[2]
    cdef unsigned long long hash1, hash2
    cdef unsigned long i

    if isinstance(key, unicode):
        key = key.encode('utf8')

    MurmurHash3_x64_128(key, len(key), 0, result)
    hash1 = result[0]
    # Re-seed with the low 32 bits of the first hash to get a second,
    # independent 64-bit hash.
    MurmurHash3_x64_128(key, len(key), result[1] & 0xFFFFFFFF, result)
    hash2 = result[0]

    for i in range(hash_count):
        # BUGFIX: the original had a stray trailing backslash here, gluing
        # this statement to the following decorator (a syntax error).
        # (Parameter renamed from "max", which shadowed the builtin.)
        _bucket_indexes[i] = llabs((hash1 + i * hash2) % size)


@cython.boundscheck(False)
cdef void _get_hash_buckets_for_long(long long key, unsigned long long * _bucket_indexes,
                                     unsigned int hash_count, unsigned long size):
    """Same as _get_hash_buckets, for an 8-byte integer key."""
    cdef unsigned long long result[2]
    cdef unsigned long long hash1, hash2
    cdef unsigned long i

    # BUGFIX: was "&result" — a pointer-to-array, not the expected
    # pointer-to-first-element; the array itself decays correctly.
    MurmurHash3_x64_128_long(key, 0, result)
    hash1 = result[0]
    MurmurHash3_x64_128_long(key, result[1] & 0xFFFFFFFF, result)
    hash2 = result[0]

    for i in range(hash_count):
        _bucket_indexes[i] = llabs((hash1 + i * hash2) % size)


# Serialization header: (buffer byte length, bit size, hash count), big-endian.
cdef char* fmt = '!III'
cdef ssize_t header_size = sizeof(unsigned int) * 3
DEF MAX_HASHES = 32


cdef class _BloomFilter:
    """Base filter: a bitarray of _size bits, probed at _hashes positions per item."""
    cdef unsigned int _size    # number of bits in the filter
    cdef unsigned int _hashes  # probes per item, capped at MAX_HASHES
    cdef object _bitarray      # bitarray.bitarray bit storage

    def __cinit__(self, unsigned long size, unsigned int hashes, cpython.bool _clear=True):
        self._size = size
        self._hashes = min(hashes, MAX_HASHES)
        self._bitarray = bitarray.bitarray(size)

        # _clear=False is used by from_byte_array, which overwrites the
        # buffer immediately — skipping the redundant zero-fill.
        if _clear:
            self._bitarray.setall(False)

    cpdef _from_byte_array(self, array.array byte_array):
        # Copy the serialized bits (everything past the header) straight
        # into the bitarray's internal buffer.
        (address, size, endianness, unused, allocated) = self._bitarray.buffer_info()
        memcpy(cpython.PyLong_AsVoidPtr(address), byte_array.data.as_chars + header_size,
               byte_array.ob_size - header_size)

    @classmethod
    def from_byte_array(cls, array.array byte_array):
        """Deserialize a filter from the header + raw-bits layout of to_byte_array."""
        assert byte_array.ob_size > header_size
        array_size, bit_size, hashes = struct.unpack_from(fmt, byte_array)
        assert bit_size / 8 <= array_size
        assert array_size == byte_array.ob_size - header_size
        cdef bf = cls(bit_size, hashes, _clear=False)
        bf._from_byte_array(byte_array)
        return bf

    def to_byte_array(self):
        """Serialize as a '!III' header followed by the raw bit buffer."""
        (address, size, endianness, unused, allocated) = self._bitarray.buffer_info()
        cdef unsigned int length = size + header_size
        cdef array.array byte_array = array.clone(char_array_template, length, False)
        struct.pack_into(fmt, byte_array, 0, size, self._size, self._hashes)
        memcpy(byte_array.data.as_chars + header_size, cpython.PyLong_AsVoidPtr(address), size)
        return byte_array

    def to_base64(self):
        """Serialize to a base64 string (wraps to_byte_array)."""
        return base64.b64encode(self.to_byte_array())

    @classmethod
    def from_base64(cls, bytes s):
        """Deserialize from a base64 string produced by to_base64."""
        return cls.from_byte_array(array.array('b', base64.b64decode(s)))

    property hashes:
        def __get__(self):
            return self._hashes

    property size:
        def __get__(self):
            return self._size

    cpdef clear(self):
        """Reset every bit — the filter becomes empty."""
        self._bitarray.setall(False)


cdef class LongBloomFilter(_BloomFilter):
    """Bloom filter over 64-bit signed integer keys."""

    @cython.boundscheck(False)
    cpdef add(self, long long item):
        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        cdef unsigned int i
        _get_hash_buckets_for_long(item, _bucket_indexes, self._hashes, self._size)
        for i in range(self._hashes):
            self._bitarray[_bucket_indexes[i]] = 1

    @cython.boundscheck(False)
    cpdef extend(self, items):
        """Add every item of an arbitrary Python iterable."""
        cdef unsigned int i

        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        for item in items:
            _get_hash_buckets_for_long(item, _bucket_indexes, self._hashes, self._size)
            for i in range(self._hashes):
                self._bitarray[_bucket_indexes[i]] = 1

    @cython.boundscheck(False)
    cpdef extend_array(self, long long[:] items):
        """Add every item of a typed memoryview (e.g. array('l', ...))."""
        cdef unsigned int i

        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        for item in items:
            _get_hash_buckets_for_long(item, _bucket_indexes, self._hashes, self._size)
            for i in range(self._hashes):
                self._bitarray[_bucket_indexes[i]] = 1

    @cython.boundscheck(False)
    cpdef contains(self, long long item):
        """Probabilistic membership: False is definite, True may be a false positive."""
        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        cdef unsigned int i
        _get_hash_buckets_for_long(item, _bucket_indexes, self._hashes, self._size)

        for i in range(self._hashes):
            if not self._bitarray[_bucket_indexes[i]]:
                return False

        return True

    def __contains__(self, long long item):
        return self.contains(item)


cdef class UIntBloomFilter(_BloomFilter):
    """Bloom filter over unsigned 32-bit integer keys."""

    @cython.boundscheck(False)
    cpdef add(self, unsigned int item):
        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        cdef unsigned int i
        _get_hash_buckets_for_long(item, _bucket_indexes, self._hashes, self._size)
        for i in range(self._hashes):
            self._bitarray[_bucket_indexes[i]] = 1

    @cython.boundscheck(False)
    cpdef extend(self, items):
        """Add every item of an arbitrary Python iterable."""
        cdef unsigned int i

        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        for item in items:
            _get_hash_buckets_for_long(item, _bucket_indexes, self._hashes, self._size)
            for i in range(self._hashes):
                self._bitarray[_bucket_indexes[i]] = 1

    @cython.boundscheck(False)
    cpdef extend_array(self, unsigned int[:] items):
        """Add every item of a typed memoryview (e.g. array('I', ...))."""
        cdef unsigned int i

        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        for item in items:
            _get_hash_buckets_for_long(item, _bucket_indexes, self._hashes, self._size)
            for i in range(self._hashes):
                self._bitarray[_bucket_indexes[i]] = 1

    @cython.boundscheck(False)
    cpdef contains(self, unsigned int item):
        """Probabilistic membership: False is definite, True may be a false positive."""
        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        cdef unsigned int i
        _get_hash_buckets_for_long(item, _bucket_indexes, self._hashes, self._size)
        for i in range(self._hashes):
            if not self._bitarray[_bucket_indexes[i]]:
                return False

        return True

    def __contains__(self, unsigned int item):
        return self.contains(item)


cdef class StringBloomFilter(_BloomFilter):
    """Bloom filter over str/unicode keys (unicode is UTF-8 encoded)."""

    cpdef add(self, item):
        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        cdef unsigned int i
        _get_hash_buckets(item, _bucket_indexes, self._hashes, self._size)

        for i in range(self._hashes):
            self._bitarray[_bucket_indexes[i]] = 1

    @cython.boundscheck(False)
    cpdef contains(self, item):
        """Probabilistic membership: False is definite, True may be a false positive."""
        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        cdef unsigned int i

        _get_hash_buckets(item, _bucket_indexes, self._hashes, self._size)
        for i in range(self._hashes):
            if not self._bitarray[_bucket_indexes[i]]:
                return False

        return True

    def __contains__(self, item):
        return self.contains(item)
+""" + + +class UnsupportedOperationException(Exception): + pass + + +class BloomSpecification(object): + """ + A wrapper class that holds two key parameters for a Bloom Filter: the + number of hash functions used, and the number of buckets per element used. + """ + + def __init__(self, k, buckets_per_element): + self.K = k + self.buckets_per_element = buckets_per_element + + def __eq__(self, other): + c1 = getattr(other, 'K', None) == self.K + c2 = getattr(other, 'buckets_per_element', None) == self.buckets_per_element + return c1 and c2 + + +class BloomCalculations(object): + """ + This calculation class is ported straight from Cassandra. + """ + min_buckets = 2 + min_k = 1 + + PROBS = [ + [1.0], # dummy row representing 0 buckets per element + [1.0, 1.0], # dummy row representing 1 buckets per element + [1.0, 0.393, 0.400], + [1.0, 0.283, 0.237, 0.253], + [1.0, 0.221, 0.155, 0.147, 0.160], + [1.0, 0.181, 0.109, 0.092, 0.092, 0.101], # 5 + [1.0, 0.154, 0.0804, 0.0609, 0.0561, 0.0578, 0.0638], + [1.0, 0.133, 0.0618, 0.0423, 0.0359, 0.0347, 0.0364], + [1.0, 0.118, 0.0489, 0.0306, 0.024, 0.0217, 0.0216, 0.0229], + [1.0, 0.105, 0.0397, 0.0228, 0.0166, 0.0141, 0.0133, 0.0135, 0.0145], + [1.0, 0.0952, 0.0329, 0.0174, 0.0118, 0.00943, 0.00844, 0.00819, 0.00846], # 10 + [1.0, 0.0869, 0.0276, 0.0136, 0.00864, 0.0065, 0.00552, 0.00513, 0.00509], + [1.0, 0.08, 0.0236, 0.0108, 0.00646, 0.00459, 0.00371, 0.00329, 0.00314], + [1.0, 0.074, 0.0203, 0.00875, 0.00492, 0.00332, 0.00255, 0.00217, 0.00199, 0.00194], + [1.0, 0.0689, 0.0177, 0.00718, 0.00381, 0.00244, 0.00179, 0.00146, 0.00129, 0.00121, 0.0012], + [1.0, 0.0645, 0.0156, 0.00596, 0.003, 0.00183, 0.00128, 0.001, 0.000852, 0.000775, 0.000744], # 15 + [1.0, 0.0606, 0.0138, 0.005, 0.00239, 0.00139, 0.000935, 0.000702, 0.000574, 0.000505, 0.00047, 0.000459], + [1.0, 0.0571, 0.0123, 0.00423, 0.00193, 0.00107, 0.000692, 0.000499, 0.000394, 0.000335, 0.000302, 0.000287, + 0.000284], + [1.0, 0.054, 0.0111, 0.00362, 0.00158, 
0.000839, 0.000519, 0.00036, 0.000275, 0.000226, 0.000198, 0.000183, + 0.000176], + [1.0, 0.0513, 0.00998, 0.00312, 0.0013, 0.000663, 0.000394, 0.000264, 0.000194, 0.000155, 0.000132, 0.000118, + 0.000111, 0.000109], + [1.0, 0.0488, 0.00906, 0.0027, 0.00108, 0.00053, 0.000303, 0.000196, 0.00014, 0.000108, 8.89e-05, 7.77e-05, + 7.12e-05, 6.79e-05, 6.71e-05] # 20 + ] + + opt_k_per_buckets = [max(1, min(enumerate(probs), key=operator.itemgetter(1))[0]) for probs in PROBS] + + @classmethod + def computeBloomSpec1(cls, buckets_per_element): + """ + Given the number of buckets that can be used per element, return a + specification that minimizes the false positive rate. + @param buckets_per_element The number of buckets per element for the filter. + @return A spec that minimizes the false positive rate. + """ + assert buckets_per_element >= 1 + assert buckets_per_element <= len(BloomCalculations.PROBS) - 1 + return BloomSpecification(cls.opt_k_per_buckets[buckets_per_element], buckets_per_element) + + @classmethod + def computeBloomSpec2(cls, max_buckets_per_element, max_false_positive_probability): + """ + Given a maximum tolerable false positive probability, compute a Bloom + specification which will give less than the specified false positive rate, + but minimize the number of buckets per element and the number of hash + functions used. Because bandwidth (and therefore total bitvector size) + is considered more expensive than computing power, preference is given + to minimizing buckets per element rather than number of hash functions. + @param max_buckets_per_element The maximum number of buckets available for the filter. + @param max_false_positive_probability The maximum tolerable false positive rate. 
+ @return A Bloom Specification which would result in a false positive rate + less than specified by the function call + @throws UnsupportedOperationException if a filter satisfying the parameters cannot be met + """ + assert max_buckets_per_element >= 1 + assert max_buckets_per_element <= len(BloomCalculations.PROBS) - 1 + maxK = len(BloomCalculations.PROBS[max_buckets_per_element]) - 1 + + # Handle the trivial cases + if max_false_positive_probability >= BloomCalculations.PROBS[cls.min_buckets][cls.min_k]: + return BloomSpecification(2, cls.opt_k_per_buckets[2]) + + if max_false_positive_probability < BloomCalculations.PROBS[max_buckets_per_element][maxK]: + msg = "Unable to satisfy %s with %s buckets per element" + raise UnsupportedOperationException(msg % (max_false_positive_probability, max_buckets_per_element)) + + # First find the minimal required number of buckets: + buckets_per_element = 2 + K = cls.opt_k_per_buckets[2] + while BloomCalculations.PROBS[buckets_per_element][K] > max_false_positive_probability: + buckets_per_element += 1 + K = cls.opt_k_per_buckets[buckets_per_element] + # Now that the number of buckets is sufficient, see if we can relax K + # without losing too much precision. 
from array import array
import _pybloof


def test_pybloof():
    # An added string key is reported present; an unrelated key is absent.
    dmc = _pybloof.StringBloomFilter(500, 9)
    dmc.add('test')

    assert 'test' in dmc
    assert 'duck' not in dmc


def test_long_pybloof():
    # Round-trips a LongBloomFilter through the byte-array and base64
    # serializations and checks membership/size/hashes survive each trip.
    # (Python 2 file: long literals and str.encode('hex') are used.)
    dmc = _pybloof.LongBloomFilter(500, 9)

    # Adding the same key twice must be idempotent.
    dmc.add(1015L)
    dmc.add(1015L)

    assert dmc.size == 500

    assert 1015L in dmc
    assert 1015 in dmc
    assert 2015 not in dmc

    # Serialize, deserialize, and compare the raw bytes of both filters.
    origin = dmc.to_byte_array().tostring().encode('hex')
    dmc_2 = _pybloof.LongBloomFilter.from_byte_array(dmc.to_byte_array())
    clone = dmc_2.to_byte_array().tostring().encode('hex')
    assert origin == clone

    assert 1015L in dmc_2
    assert 1015 in dmc_2
    assert 2015 not in dmc_2

    assert dmc.size == dmc_2.size
    assert dmc.hashes == dmc_2.hashes

    # Same round-trip via the base64 representation.
    base64 = dmc.to_base64()
    dmc_3 = _pybloof.LongBloomFilter.from_base64(base64)

    assert 1015L in dmc_3
    assert 1015 in dmc_3
    assert 2015 not in dmc_3

    assert dmc.size == dmc_3.size

    # clear() empties the filter but keeps its configuration.
    dmc.clear()

    assert 1015 not in dmc
    assert dmc.size == 500


def test_extend():
    # extend (any iterable) and extend_array (typed memoryview) must set
    # the same bits, so the serialized filters are identical.
    dmc = _pybloof.LongBloomFilter(500, 9)

    dmc.extend([10, 25, 35])

    dmc2 = _pybloof.LongBloomFilter(500, 9)
    dmc2.extend_array(array('l', [10, 25, 35]))

    assert dmc2.to_base64() == dmc.to_base64()


def test_extend_uint():
    # Same equivalence check for the unsigned-int filter variant.
    dmc = _pybloof.UIntBloomFilter(500, 9)

    dmc.extend([10, 25, 35])

    dmc2 = _pybloof.UIntBloomFilter(500, 9)
    dmc2.extend_array(array('I', [10, 25, 35]))

    assert dmc2.to_base64() == dmc.to_base64()
import _pybloof


def test__mhash3():
    # Regression constants for the 128-bit string hash; the second hash is
    # seeded with the low 32 bits of the first.  (Python 2 print statement.)
    h1 = _pybloof.hash('foo')
    h2 = _pybloof.hash('foo', h1 & 0xFFFFFFFF)
    print h1, h2
    assert (-39287385592190013122878999397579195001,
            -73964642705803263641983394469427790275) == (h1, h2)


def test_null_key():
    # Embedded NUL bytes must contribute to the hash — keys may not be
    # silently truncated at the first '\0'.
    h0 = _pybloof.hash('foo')
    h1 = _pybloof.hash('foo\0bar')
    h2 = _pybloof.hash('foo\0baz')
    assert h0 != h1, 'Hash collision for appended null'
    assert h0 != h2, 'Hash collision for appended null'
    assert h1 != h2, 'Hash collision for bytes after null'


def test_mhash3_long():
    # Regression constants for the 64-bit-integer hash entry point.
    h1 = _pybloof.hash_long(123)
    # NOTE(review): the mask below has seven F's (28 bits), not eight as in
    # test__mhash3 — probably a typo, but the expected constants were
    # generated with this seed, so fixing the mask requires regenerating them.
    h2 = _pybloof.hash_long(123, h1 & 0xFFFFFFF)

    assert (-121703982708902402444108248539236701464,
            30126007557438804793814493095132085929) == (h1, h2)