from setuptools import setup
from setuptools.extension import Extension
from Cython.Distutils import build_ext
from os.path import join
import functools
import os

# Resolve include/library paths relative to the current working directory
# (the build is expected to run from the project root).
rel = functools.partial(join, os.getcwd())

ext_modules = [
    Extension(
        "_pybloof",
        extra_compile_args=['-std=gnu99', '-O2', '-D_LARGEFILE64_SOURCE'],
        sources=["src/_pybloof.pyx",
                 'src/MurmurHash3.c'],

        include_dirs=[rel('src')],
        library_dirs=[rel('src')]
    )
]

setup(
    name='pybloof',
    version='0.1',
    author='Jake Heinz',
    author_email='me@jh.gg',
    url="http://github.com/jhgg/pybloof",
    description='A high performance python bloom filter thing.',
    license='MIT License',
    cmdclass={'build_ext': build_ext},
    zip_safe=False,
    package_dir={'': 'src'},
    # BUGFIX: was 'pyblouf' — a typo matching no file, so the pure-Python
    # helper module would silently never be installed.
    # NOTE(review): the module in this patch is created as src/bybloof.py
    # (itself a typo); that file must be renamed to pybloof.py to match —
    # verify before release.
    py_modules=['pybloof'],
    ext_modules=ext_modules,
    test_suite='nose.collector'
)
+ +#include "MurmurHash3.h" + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define FORCE_INLINE __forceinline + +#include + +#define ROTL32(x,y) _rotl(x,y) +#define ROTL64(x,y) _rotl64(x,y) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#define FORCE_INLINE inline __attribute__((always_inline)) + +inline uint32_t rotl32 ( uint32_t x, int8_t r ) +{ + return (x << r) | (x >> (32 - r)); +} + +inline uint64_t rotl64 ( uint64_t x, int8_t r ) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i ) +{ + return p[i]; +} + +FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) +{ + return p[i]; +} + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +FORCE_INLINE uint32_t fmix32 ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +FORCE_INLINE uint64_t fmix64 ( uint64_t k ) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + const uint32_t c1 = 
0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock32(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix32(h1); + + *(uint32_t*)out = h1; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_128 ( const void * key, const int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + const uint32_t c1 = 0x239b961b; + const uint32_t c2 = 0xab0e9789; + const uint32_t c3 = 0x38b34ae5; + const uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock32(blocks,i*4+0); + uint32_t k2 = getblock32(blocks,i*4+1); + uint32_t k3 = getblock32(blocks,i*4+2); + uint32_t k4 = getblock32(blocks,i*4+3); + + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + + h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; + + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; + + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; + + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = (const 
uint8_t*)(data + nblocks*16); + + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch(len & 15) + { + case 15: k4 ^= tail[14] << 16; + case 14: k4 ^= tail[13] << 8; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + case 12: k3 ^= tail[11] << 24; + case 11: k3 ^= tail[10] << 16; + case 10: k3 ^= tail[ 9] << 8; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + case 8: k2 ^= tail[ 7] << 24; + case 7: k2 ^= tail[ 6] << 16; + case 6: k2 ^= tail[ 5] << 8; + case 5: k2 ^= tail[ 4] << 0; + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + case 4: k1 ^= tail[ 3] << 24; + case 3: k1 ^= tail[ 2] << 16; + case 2: k1 ^= tail[ 1] << 8; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = fmix32(h1); + h2 = fmix32(h2); + h3 = fmix32(h3); + h4 = fmix32(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + ((uint32_t*)out)[0] = h1; + ((uint32_t*)out)[1] = h2; + ((uint32_t*)out)[2] = h3; + ((uint32_t*)out)[3] = h4; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128 ( const void * key, const int len, + const uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *)(data); + + for(int i = 0; i < nblocks; i++) + { + uint64_t k1 = getblock64(blocks,i*2+0); + uint64_t k2 = getblock64(blocks,i*2+1); + + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + + h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + + k2 *= c2; k2 = 
ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) + { + case 15: k2 ^= ((uint64_t)tail[14]) << 48; + case 14: k2 ^= ((uint64_t)tail[13]) << 40; + case 13: k2 ^= ((uint64_t)tail[12]) << 32; + case 12: k2 ^= ((uint64_t)tail[11]) << 24; + case 11: k2 ^= ((uint64_t)tail[10]) << 16; + case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; + case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; + case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; + case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; + case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; + case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; + case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; + case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; + case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + ((uint64_t*)out)[0] = h1; + ((uint64_t*)out)[1] = h2; +} + + +void MurmurHash3_x64_128_long( const long long key, const uint32_t seed, void * out ) { + MurmurHash3_x64_128(&key, sizeof(long long), seed, out); +} + +//----------------------------------------------------------------------------- \ No newline at end of file diff --git a/src/MurmurHash3.h b/src/MurmurHash3.h new file mode 100644 index 0000000..81e799f --- /dev/null +++ b/src/MurmurHash3.h @@ -0,0 +1,39 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. 
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.

#ifndef _MURMURHASH3_H_
#define _MURMURHASH3_H_

//-----------------------------------------------------------------------------
// Platform-specific functions and macros

// Microsoft Visual Studio (pre-2010 has no <stdint.h>)

#if defined(_MSC_VER) && (_MSC_VER < 1600)

typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned __int64 uint64_t;

// Other compilers

#else // defined(_MSC_VER)

// BUGFIX: the include target was stripped in transit; restore <stdint.h>
// for the fixed-width integer types used in the prototypes below.
#include <stdint.h>

#endif // !defined(_MSC_VER)

//-----------------------------------------------------------------------------

void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out );

void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );

void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );

void MurmurHash3_x64_128_long( const long long key, const uint32_t seed, void * out );

//-----------------------------------------------------------------------------

#endif // _MURMURHASH3_H_
# _pybloof.pyx — bloom filters backed by a bitarray and MurmurHash3.
# Bucket indexes use Kirsch–Mitzenmacher double hashing: two 64-bit hashes
# h1, h2 combined as (h1 + i*h2) % size stand in for k independent hashes.
from cpython cimport array
cimport cpython
cimport cython

import bitarray
import struct
import base64

from libc.string cimport memcpy

cdef array.array char_array_template = array.array('b', [])

cdef extern from "MurmurHash3.h" nogil:
    void MurmurHash3_x64_128(void *key, int len, unsigned int seed, void *out)
    # BUGFIX: was declared "long key"; the C prototype takes "long long",
    # which has a different width on 32-bit and LLP64 (Windows) platforms.
    void MurmurHash3_x64_128_long(long long key, unsigned int seed, void *out)

cdef extern from "stdlib.h" nogil:
    long long int llabs(long long int j)


def hash(key, int seed=0):
    """Return the 128-bit MurmurHash3 (x64 variant) of *key* as a Python long.

    Unicode keys are UTF-8 encoded first; the two 64-bit halves are packed
    into a single signed 128-bit integer.
    """
    cdef long long result[2]
    if isinstance(key, unicode):
        key = key.encode('utf8')

    MurmurHash3_x64_128(key, len(key), seed, result)
    return long(result[0]) << 64 | (long(result[1]) & 0xFFFFFFFFFFFFFFFF)


def hash_long(long long key, int seed=0):
    """Return the 128-bit MurmurHash3 of an 8-byte integer key as a Python long."""
    cdef long long result[2]
    MurmurHash3_x64_128_long(key, seed, result)
    return long(result[0]) << 64 | (long(result[1]) & 0xFFFFFFFFFFFFFFFF)


@cython.boundscheck(False)
cdef void _get_hash_buckets(key, unsigned long long * _bucket_indexes,
                            unsigned int hash_count, unsigned long size):
    """Fill _bucket_indexes[0..hash_count) with bit positions (< size) for *key*."""
    # BUGFIX: the result slots must be 64-bit.  "unsigned long" is only
    # 32 bits on LLP64 platforms, so MurmurHash3_x64_128's 16-byte write
    # overflowed the array there.
    cdef unsigned long long result[2]
    cdef unsigned long long hash1, hash2
    cdef unsigned long i

    if isinstance(key, unicode):
        key = key.encode('utf8')

    MurmurHash3_x64_128(key, len(key), 0, result)
    hash1 = result[0]
    # Re-seed with the low 32 bits of the first hash to get a second,
    # independent 64-bit hash.
    MurmurHash3_x64_128(key, len(key), result[1] & 0xFFFFFFFF, result)
    hash2 = result[0]

    for i in range(hash_count):
        # BUGFIX: the original had a stray trailing backslash here, gluing
        # this statement to the following decorator (a syntax error).
        # (Parameter renamed from "max", which shadowed the builtin.)
        _bucket_indexes[i] = llabs((hash1 + i * hash2) % size)


@cython.boundscheck(False)
cdef void _get_hash_buckets_for_long(long long key, unsigned long long * _bucket_indexes,
                                     unsigned int hash_count, unsigned long size):
    """Same as _get_hash_buckets, for an 8-byte integer key."""
    cdef unsigned long long result[2]
    cdef unsigned long long hash1, hash2
    cdef unsigned long i

    # BUGFIX: was "&result" — a pointer-to-array, not the expected
    # pointer-to-first-element; the array itself decays correctly.
    MurmurHash3_x64_128_long(key, 0, result)
    hash1 = result[0]
    MurmurHash3_x64_128_long(key, result[1] & 0xFFFFFFFF, result)
    hash2 = result[0]

    for i in range(hash_count):
        _bucket_indexes[i] = llabs((hash1 + i * hash2) % size)


# Serialization header: (buffer byte length, bit size, hash count), big-endian.
cdef char* fmt = '!III'
cdef ssize_t header_size = sizeof(unsigned int) * 3
DEF MAX_HASHES = 32


cdef class _BloomFilter:
    """Base filter: a bitarray of _size bits, probed at _hashes positions per item."""
    cdef unsigned int _size    # number of bits in the filter
    cdef unsigned int _hashes  # probes per item, capped at MAX_HASHES
    cdef object _bitarray      # bitarray.bitarray bit storage

    def __cinit__(self, unsigned long size, unsigned int hashes, cpython.bool _clear=True):
        self._size = size
        self._hashes = min(hashes, MAX_HASHES)
        self._bitarray = bitarray.bitarray(size)

        # _clear=False is used by from_byte_array, which overwrites the
        # buffer immediately — skipping the redundant zero-fill.
        if _clear:
            self._bitarray.setall(False)

    cpdef _from_byte_array(self, array.array byte_array):
        # Copy the serialized bits (everything past the header) straight
        # into the bitarray's internal buffer.
        (address, size, endianness, unused, allocated) = self._bitarray.buffer_info()
        memcpy(cpython.PyLong_AsVoidPtr(address), byte_array.data.as_chars + header_size,
               byte_array.ob_size - header_size)

    @classmethod
    def from_byte_array(cls, array.array byte_array):
        """Deserialize a filter from the header + raw-bits layout of to_byte_array."""
        assert byte_array.ob_size > header_size
        array_size, bit_size, hashes = struct.unpack_from(fmt, byte_array)
        assert bit_size / 8 <= array_size
        assert array_size == byte_array.ob_size - header_size
        cdef bf = cls(bit_size, hashes, _clear=False)
        bf._from_byte_array(byte_array)
        return bf

    def to_byte_array(self):
        """Serialize as a '!III' header followed by the raw bit buffer."""
        (address, size, endianness, unused, allocated) = self._bitarray.buffer_info()
        cdef unsigned int length = size + header_size
        cdef array.array byte_array = array.clone(char_array_template, length, False)
        struct.pack_into(fmt, byte_array, 0, size, self._size, self._hashes)
        memcpy(byte_array.data.as_chars + header_size, cpython.PyLong_AsVoidPtr(address), size)
        return byte_array

    def to_base64(self):
        """Serialize to a base64 string (wraps to_byte_array)."""
        return base64.b64encode(self.to_byte_array())

    @classmethod
    def from_base64(cls, bytes s):
        """Deserialize from a base64 string produced by to_base64."""
        return cls.from_byte_array(array.array('b', base64.b64decode(s)))

    property hashes:
        def __get__(self):
            return self._hashes

    property size:
        def __get__(self):
            return self._size

    cpdef clear(self):
        """Reset every bit — the filter becomes empty."""
        self._bitarray.setall(False)


cdef class LongBloomFilter(_BloomFilter):
    """Bloom filter over 64-bit signed integer keys."""

    @cython.boundscheck(False)
    cpdef add(self, long long item):
        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        cdef unsigned int i
        _get_hash_buckets_for_long(item, _bucket_indexes, self._hashes, self._size)
        for i in range(self._hashes):
            self._bitarray[_bucket_indexes[i]] = 1

    @cython.boundscheck(False)
    cpdef extend(self, items):
        """Add every item of an arbitrary Python iterable."""
        cdef unsigned int i

        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        for item in items:
            _get_hash_buckets_for_long(item, _bucket_indexes, self._hashes, self._size)
            for i in range(self._hashes):
                self._bitarray[_bucket_indexes[i]] = 1

    @cython.boundscheck(False)
    cpdef extend_array(self, long long[:] items):
        """Add every item of a typed memoryview (e.g. array('l', ...))."""
        cdef unsigned int i

        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        for item in items:
            _get_hash_buckets_for_long(item, _bucket_indexes, self._hashes, self._size)
            for i in range(self._hashes):
                self._bitarray[_bucket_indexes[i]] = 1

    @cython.boundscheck(False)
    cpdef contains(self, long long item):
        """Probabilistic membership: False is definite, True may be a false positive."""
        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        cdef unsigned int i
        _get_hash_buckets_for_long(item, _bucket_indexes, self._hashes, self._size)

        for i in range(self._hashes):
            if not self._bitarray[_bucket_indexes[i]]:
                return False

        return True

    def __contains__(self, long long item):
        return self.contains(item)


cdef class UIntBloomFilter(_BloomFilter):
    """Bloom filter over unsigned 32-bit integer keys."""

    @cython.boundscheck(False)
    cpdef add(self, unsigned int item):
        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        cdef unsigned int i
        _get_hash_buckets_for_long(item, _bucket_indexes, self._hashes, self._size)
        for i in range(self._hashes):
            self._bitarray[_bucket_indexes[i]] = 1

    @cython.boundscheck(False)
    cpdef extend(self, items):
        """Add every item of an arbitrary Python iterable."""
        cdef unsigned int i

        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        for item in items:
            _get_hash_buckets_for_long(item, _bucket_indexes, self._hashes, self._size)
            for i in range(self._hashes):
                self._bitarray[_bucket_indexes[i]] = 1

    @cython.boundscheck(False)
    cpdef extend_array(self, unsigned int[:] items):
        """Add every item of a typed memoryview (e.g. array('I', ...))."""
        cdef unsigned int i

        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        for item in items:
            _get_hash_buckets_for_long(item, _bucket_indexes, self._hashes, self._size)
            for i in range(self._hashes):
                self._bitarray[_bucket_indexes[i]] = 1

    @cython.boundscheck(False)
    cpdef contains(self, unsigned int item):
        """Probabilistic membership: False is definite, True may be a false positive."""
        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        cdef unsigned int i
        _get_hash_buckets_for_long(item, _bucket_indexes, self._hashes, self._size)
        for i in range(self._hashes):
            if not self._bitarray[_bucket_indexes[i]]:
                return False

        return True

    def __contains__(self, unsigned int item):
        return self.contains(item)


cdef class StringBloomFilter(_BloomFilter):
    """Bloom filter over str/unicode keys (unicode is UTF-8 encoded)."""

    cpdef add(self, item):
        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        cdef unsigned int i
        _get_hash_buckets(item, _bucket_indexes, self._hashes, self._size)

        for i in range(self._hashes):
            self._bitarray[_bucket_indexes[i]] = 1

    @cython.boundscheck(False)
    cpdef contains(self, item):
        """Probabilistic membership: False is definite, True may be a false positive."""
        cdef unsigned long long _bucket_indexes[MAX_HASHES]
        cdef unsigned int i

        _get_hash_buckets(item, _bucket_indexes, self._hashes, self._size)
        for i in range(self._hashes):
            if not self._bitarray[_bucket_indexes[i]]:
                return False

        return True

    def __contains__(self, item):
        return self.contains(item)
+""" + + +class UnsupportedOperationException(Exception): + pass + + +class BloomSpecification(object): + """ + A wrapper class that holds two key parameters for a Bloom Filter: the + number of hash functions used, and the number of buckets per element used. + """ + + def __init__(self, k, buckets_per_element): + self.K = k + self.buckets_per_element = buckets_per_element + + def __eq__(self, other): + c1 = getattr(other, 'K', None) == self.K + c2 = getattr(other, 'buckets_per_element', None) == self.buckets_per_element + return c1 and c2 + + +class BloomCalculations(object): + """ + This calculation class is ported straight from Cassandra. + """ + min_buckets = 2 + min_k = 1 + + PROBS = [ + [1.0], # dummy row representing 0 buckets per element + [1.0, 1.0], # dummy row representing 1 buckets per element + [1.0, 0.393, 0.400], + [1.0, 0.283, 0.237, 0.253], + [1.0, 0.221, 0.155, 0.147, 0.160], + [1.0, 0.181, 0.109, 0.092, 0.092, 0.101], # 5 + [1.0, 0.154, 0.0804, 0.0609, 0.0561, 0.0578, 0.0638], + [1.0, 0.133, 0.0618, 0.0423, 0.0359, 0.0347, 0.0364], + [1.0, 0.118, 0.0489, 0.0306, 0.024, 0.0217, 0.0216, 0.0229], + [1.0, 0.105, 0.0397, 0.0228, 0.0166, 0.0141, 0.0133, 0.0135, 0.0145], + [1.0, 0.0952, 0.0329, 0.0174, 0.0118, 0.00943, 0.00844, 0.00819, 0.00846], # 10 + [1.0, 0.0869, 0.0276, 0.0136, 0.00864, 0.0065, 0.00552, 0.00513, 0.00509], + [1.0, 0.08, 0.0236, 0.0108, 0.00646, 0.00459, 0.00371, 0.00329, 0.00314], + [1.0, 0.074, 0.0203, 0.00875, 0.00492, 0.00332, 0.00255, 0.00217, 0.00199, 0.00194], + [1.0, 0.0689, 0.0177, 0.00718, 0.00381, 0.00244, 0.00179, 0.00146, 0.00129, 0.00121, 0.0012], + [1.0, 0.0645, 0.0156, 0.00596, 0.003, 0.00183, 0.00128, 0.001, 0.000852, 0.000775, 0.000744], # 15 + [1.0, 0.0606, 0.0138, 0.005, 0.00239, 0.00139, 0.000935, 0.000702, 0.000574, 0.000505, 0.00047, 0.000459], + [1.0, 0.0571, 0.0123, 0.00423, 0.00193, 0.00107, 0.000692, 0.000499, 0.000394, 0.000335, 0.000302, 0.000287, + 0.000284], + [1.0, 0.054, 0.0111, 0.00362, 0.00158, 
0.000839, 0.000519, 0.00036, 0.000275, 0.000226, 0.000198, 0.000183, + 0.000176], + [1.0, 0.0513, 0.00998, 0.00312, 0.0013, 0.000663, 0.000394, 0.000264, 0.000194, 0.000155, 0.000132, 0.000118, + 0.000111, 0.000109], + [1.0, 0.0488, 0.00906, 0.0027, 0.00108, 0.00053, 0.000303, 0.000196, 0.00014, 0.000108, 8.89e-05, 7.77e-05, + 7.12e-05, 6.79e-05, 6.71e-05] # 20 + ] + + opt_k_per_buckets = [max(1, min(enumerate(probs), key=operator.itemgetter(1))[0]) for probs in PROBS] + + @classmethod + def computeBloomSpec1(cls, buckets_per_element): + """ + Given the number of buckets that can be used per element, return a + specification that minimizes the false positive rate. + @param buckets_per_element The number of buckets per element for the filter. + @return A spec that minimizes the false positive rate. + """ + assert buckets_per_element >= 1 + assert buckets_per_element <= len(BloomCalculations.PROBS) - 1 + return BloomSpecification(cls.opt_k_per_buckets[buckets_per_element], buckets_per_element) + + @classmethod + def computeBloomSpec2(cls, max_buckets_per_element, max_false_positive_probability): + """ + Given a maximum tolerable false positive probability, compute a Bloom + specification which will give less than the specified false positive rate, + but minimize the number of buckets per element and the number of hash + functions used. Because bandwidth (and therefore total bitvector size) + is considered more expensive than computing power, preference is given + to minimizing buckets per element rather than number of hash functions. + @param max_buckets_per_element The maximum number of buckets available for the filter. + @param max_false_positive_probability The maximum tolerable false positive rate. 
+ @return A Bloom Specification which would result in a false positive rate + less than specified by the function call + @throws UnsupportedOperationException if a filter satisfying the parameters cannot be met + """ + assert max_buckets_per_element >= 1 + assert max_buckets_per_element <= len(BloomCalculations.PROBS) - 1 + maxK = len(BloomCalculations.PROBS[max_buckets_per_element]) - 1 + + # Handle the trivial cases + if max_false_positive_probability >= BloomCalculations.PROBS[cls.min_buckets][cls.min_k]: + return BloomSpecification(2, cls.opt_k_per_buckets[2]) + + if max_false_positive_probability < BloomCalculations.PROBS[max_buckets_per_element][maxK]: + msg = "Unable to satisfy %s with %s buckets per element" + raise UnsupportedOperationException(msg % (max_false_positive_probability, max_buckets_per_element)) + + # First find the minimal required number of buckets: + buckets_per_element = 2 + K = cls.opt_k_per_buckets[2] + while BloomCalculations.PROBS[buckets_per_element][K] > max_false_positive_probability: + buckets_per_element += 1 + K = cls.opt_k_per_buckets[buckets_per_element] + # Now that the number of buckets is sufficient, see if we can relax K + # without losing too much precision. 
from array import array
import _pybloof


def test_pybloof():
    # An added string key is reported present; an unrelated key is absent.
    dmc = _pybloof.StringBloomFilter(500, 9)
    dmc.add('test')

    assert 'test' in dmc
    assert 'duck' not in dmc


def test_long_pybloof():
    # Round-trips a LongBloomFilter through the byte-array and base64
    # serializations and checks membership/size/hashes survive each trip.
    # (Python 2 file: long literals and str.encode('hex') are used.)
    dmc = _pybloof.LongBloomFilter(500, 9)

    # Adding the same key twice must be idempotent.
    dmc.add(1015L)
    dmc.add(1015L)

    assert dmc.size == 500

    assert 1015L in dmc
    assert 1015 in dmc
    assert 2015 not in dmc

    # Serialize, deserialize, and compare the raw bytes of both filters.
    origin = dmc.to_byte_array().tostring().encode('hex')
    dmc_2 = _pybloof.LongBloomFilter.from_byte_array(dmc.to_byte_array())
    clone = dmc_2.to_byte_array().tostring().encode('hex')
    assert origin == clone

    assert 1015L in dmc_2
    assert 1015 in dmc_2
    assert 2015 not in dmc_2

    assert dmc.size == dmc_2.size
    assert dmc.hashes == dmc_2.hashes

    # Same round-trip via the base64 representation.
    base64 = dmc.to_base64()
    dmc_3 = _pybloof.LongBloomFilter.from_base64(base64)

    assert 1015L in dmc_3
    assert 1015 in dmc_3
    assert 2015 not in dmc_3

    assert dmc.size == dmc_3.size

    # clear() empties the filter but keeps its configuration.
    dmc.clear()

    assert 1015 not in dmc
    assert dmc.size == 500


def test_extend():
    # extend (any iterable) and extend_array (typed memoryview) must set
    # the same bits, so the serialized filters are identical.
    dmc = _pybloof.LongBloomFilter(500, 9)

    dmc.extend([10, 25, 35])

    dmc2 = _pybloof.LongBloomFilter(500, 9)
    dmc2.extend_array(array('l', [10, 25, 35]))

    assert dmc2.to_base64() == dmc.to_base64()


def test_extend_uint():
    # Same equivalence check for the unsigned-int filter variant.
    dmc = _pybloof.UIntBloomFilter(500, 9)

    dmc.extend([10, 25, 35])

    dmc2 = _pybloof.UIntBloomFilter(500, 9)
    dmc2.extend_array(array('I', [10, 25, 35]))

    assert dmc2.to_base64() == dmc.to_base64()
import _pybloof


def test__mhash3():
    # Regression constants for the 128-bit string hash; the second hash is
    # seeded with the low 32 bits of the first.  (Python 2 print statement.)
    h1 = _pybloof.hash('foo')
    h2 = _pybloof.hash('foo', h1 & 0xFFFFFFFF)
    print h1, h2
    assert (-39287385592190013122878999397579195001,
            -73964642705803263641983394469427790275) == (h1, h2)


def test_null_key():
    # Embedded NUL bytes must contribute to the hash — keys may not be
    # silently truncated at the first '\0'.
    h0 = _pybloof.hash('foo')
    h1 = _pybloof.hash('foo\0bar')
    h2 = _pybloof.hash('foo\0baz')
    assert h0 != h1, 'Hash collision for appended null'
    assert h0 != h2, 'Hash collision for appended null'
    assert h1 != h2, 'Hash collision for bytes after null'


def test_mhash3_long():
    # Regression constants for the 64-bit-integer hash entry point.
    h1 = _pybloof.hash_long(123)
    # NOTE(review): the mask below has seven F's (28 bits), not eight as in
    # test__mhash3 — probably a typo, but the expected constants were
    # generated with this seed, so fixing the mask requires regenerating them.
    h2 = _pybloof.hash_long(123, h1 & 0xFFFFFFF)

    assert (-121703982708902402444108248539236701464,
            30126007557438804793814493095132085929) == (h1, h2)