Bloom filters and a simple spell checker in Python

Kragen Sitaker kragen at pobox.com
Sat Aug 12 03:37:01 EDT 2006


I thought I'd try implementing a Bloom filter in Python, just for fun,
since Python does have a nice abstraction for large bit vectors: the
long int, or bignum.  Turns out it's grossly inefficient to use a single
long int for a Bloom filter because there's no way to set or clear bits
in it.

I wrote this on Beatrice's Mac with Python 2.3, which sucked, but was
better than not programming.

#!/usr/bin/python
"""Bloom filters in Python, using SHA-1 and Python longs.

My first attempt stored the whole filter in a single arbitrary-size integer,
but for some reason that was 100x slower than storing it in a bunch of 256-bit
integers.
"""

import sha

def nbits_required(n):
    """Bits required to represent any integer in [0, n)."""
    n -= 1
    rv = 0
    while n:
	n >>= 1
	rv += 1
    return rv

class Bloom:
    """Bloom filter: compact hash table for membership tests with false pos."""
    # default bits per bucket is 256 to cut down on pickle overhead
    def __init__(self, size, nhashes, bucketbits=256):
    	"""size: number of bits.  Should be a power of 2.
	nhashes: number of separate hashes to use.

	Making nhashes larger will make it slower.  There are also tradeoffs
	between size, performance, and false-positive rate, which you can look
	up elsewhere."""
	self.bucketbits = bucketbits
    	self.filter = [0L] * int((size + bucketbits - 1) / bucketbits)
	self.size = size
	self.nhashes = nhashes
	self.hashbits = nbits_required(size)
	assert self.hashbits * nhashes <= 160  # 160's all we get with SHA1
    def add(self, astr):
    	"""Add a string to the membership of the filter."""
    	for offset in self._hashes(astr):
	    bucket, bit = divmod(offset, self.bucketbits)
	    self.filter[bucket] |= (1L << bit)
    def __contains__(self, astr):
    	"""Returns true if the string is in the filter or it feels like it."""
    	for offset in self._hashes(astr):
	    bucket, bit = divmod(offset, self.bucketbits)
	    if not self.filter[bucket] & (1L << bit): return 0
	return 1
    def _hashes(self, astr):
    	"""The hashes of a particular string."""
    	digest = sha.sha(astr).digest()
	# is there no better way to convert a byte string into a long?!
	hashlong = 0L
	for ch in digest: hashlong = (hashlong << 8) | ord(ch)
	rv = []
	mask = (1L << self.hashbits) - 1
	for ii in range(self.nhashes):
	    # Note that this will give substantially nonuniform results if
	    # self.size is not a power of 2, in order to avoid wasting hash
	    # bits or doing long division:
	    rv.append((hashlong & mask) % self.size)
	    hashlong >>= self.hashbits
	return rv

def test_bloom():
    """Very basic sanity test for Bloom filter implementation."""
    def ok(a, b): assert a == b, (a, b)
    ok(map(nbits_required, range(1, 10)), [0, 1, 2, 2, 3, 3, 3, 3, 4])
    ok(nbits_required(131072), 17)
    ok(nbits_required(131073), 18)

    b = Bloom(1024, 5)
    assert 'asdf' not in b
    assert 'fdsa' not in b
    b.add('asdf')
    assert 'asdf' in b
    assert 'fdsa' not in b

    # false positives (depends on hash function):
    x = Bloom(8, 3)
    x.add('asdf') # about a 5% chance of false positives
    assert 'asdf' in x
    assert 'fdsa' not in x
    ok(filter(x.__contains__, ['foo%d' % ii for ii in range(25)]), ['foo22'])

test_bloom()

def misspellings(infile):
    """Demo: spell check."""
    import re, cPickle, sys
    try: bf = cPickle.load(file('dict.pck', 'rb'))
    except IOError:
	# /usr/share/dict/words has 234936 words on this Mac and is 2.4 megs
	sys.stderr.write("reading dictionary...\n")
	words = file('/usr/share/dict/words')
	# 2^21 bits, 8.9 per word, would give us 1.5% false positives with 5
	# hashes or 1.7% with 6, so we use 4194304 = 2^22 bits, or 17.8 per
	# word, for 0.09% false positives; that's still only half a mebibyte,
	# although pickle overhead pushes it up to 559K, 22% of the dictionary.
	bf = Bloom(4194304, 5)  
	lineno = 0
	for line in words: 
	    bf.add(line[:-1].lower())
	    lineno += 1
	    if not lineno % 100:
	    	sys.stderr.write('%s%s\r' % (lineno, ' ' * 40))
		sys.stderr.flush()
	sys.stderr.write("done reading dictionary;\n")
	try: cPickle.dump(bf, file('dict.pck', 'wb'), 2)
	except: pass
    def candidates(word):
	"""Words you might find in the dictionary in English."""
	yield word
	for suffix in ['s', 'ing', 'ed', 'es', 'er', 's', 'ly']:
	    if word.endswith(suffix): yield word[:-len(suffix)]
	for suffix, repl in [('ed', 'e'), ('er', 'e'), ('ing', 'e'),
			     ('ies', 'y'), ('ied', 'y')]:
	    if word.endswith(suffix): yield word[:-len(suffix)] + repl
    for line in infile:
	for word in re.findall(r"['\w]+", line):
	    # we drop the "'" because our dictionary has "didnt" but not
	    # "didn't"
	    for chance in candidates(word.replace("'", '').lower()):
		if chance in bf: break
	    else:
		sys.stdout.write(word + ' ')
		sys.stdout.flush()
		# to prevent repeating it...
		bf.add(word)
    sys.stdout.write('\n')

if __name__ == '__main__': 
    import sys
    misspellings(file(sys.argv[1]))


More information about the Kragen-hacks mailing list