Skip to content

Commit d313ebc

Browse files
authored
Rotating bloom (#44)
1 parent 022ef22 commit d313ebc

File tree

8 files changed

+178
-23
lines changed

8 files changed

+178
-23
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
# PyProbables Changelog
22

3+
### Version 0.2.6
4+
* Bloom Filters:
5+
* Addition of a Rotating Bloom Filter
6+
7+
### Version 0.2.5
8+
* Bloom Filters:
9+
* Addition of an Expanding Bloom Filter
10+
311
### Version 0.2.0
412
* Use __slots__
513

docs/source/code.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,13 @@ ExpandingBloomFilter
4646
.. autoclass:: probables.ExpandingBloomFilter
4747
:members:
4848

49+
RotatingBloomFilter
50+
+++++++++++++++++++++++++++++++
51+
52+
.. autoclass:: probables.RotatingBloomFilter
53+
:members:
54+
:inherited-members:
55+
4956
CountingBloomFilter
5057
+++++++++++++++++++++++++++++++
5158

docs/source/quickstart.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,17 @@ determine the number of elements that will be added.
9393
At this time, it is not possible to import or export an **Expanding Bloom
9494
Filter** but that is a planned feature.
9595

96+
Rotating Bloom Filter
97+
"""""""""""""""""""""""""""""""""""""""""""""""
98+
99+
The **Rotating Bloom Filter** is a specialized version of the standard
100+
Bloom Filter that rolls off earlier entries from the filter as they become more
101+
stale. The popping of the queue can be done either programmatically or
102+
automatically.
103+
104+
At this time, it is not possible to import or export a **Rotating Bloom
105+
Filter** but that is a planned feature.
106+
96107

97108
Counting Bloom Filter
98109
"""""""""""""""""""""""""""""""""""""""""""""""

probables/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
''' pyprobables module '''
22
from __future__ import (unicode_literals, absolute_import, print_function)
33
from . blooms import (BloomFilter, BloomFilterOnDisk, CountingBloomFilter,
4-
ExpandingBloomFilter)
4+
ExpandingBloomFilter, RotatingBloomFilter)
55
from . countminsketch import (CountMinSketch, HeavyHitters, StreamThreshold,
66
CountMeanSketch, CountMeanMinSketch)
77
from . cuckoo import (CuckooFilter, CountingCuckooFilter)
@@ -12,7 +12,7 @@
1212
__maintainer__ = 'Tyler Barrus'
1313
__email__ = '[email protected]'
1414
__license__ = 'MIT'
15-
__version__ = '0.2.5'
15+
__version__ = '0.2.6'
1616
__credits__ = []
1717
__url__ = 'https://github.com/barrust/pyprobables'
1818
__bugtrack_url__ = 'https://github.com/barrust/pyprobables/issues'
@@ -22,4 +22,4 @@
2222
'HeavyHitters', 'StreamThreshold', 'CuckooFilter',
2323
'CountingCuckooFilter', 'InitializationError', 'NotSupportedError',
2424
'ProbablesBaseException', 'CuckooFilterFullError',
25-
'ExpandingBloomFilter']
25+
'ExpandingBloomFilter', 'RotatingBloomFilter']

probables/blooms/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
from . bloom import (BloomFilter, BloomFilterOnDisk)
55
from . countingbloom import (CountingBloomFilter)
6-
from . expandingbloom import (ExpandingBloomFilter)
6+
from . expandingbloom import (ExpandingBloomFilter, RotatingBloomFilter)
77

88
__all__ = ['BloomFilter', 'BloomFilterOnDisk', 'CountingBloomFilter',
9-
'ExpandingBloomFilter']
9+
'ExpandingBloomFilter', 'RotatingBloomFilter']

probables/blooms/expandingbloom.py

Lines changed: 99 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ class ExpandingBloomFilter(object):
2525
At this point, the expanding Bloom Filter does not support \
2626
`export` or `import` '''
2727

28+
__slots__ = ['_blooms', '__fpr', '__est_elements', '__hash_func',
29+
'__added_elements']
30+
2831
def __init__(self, est_elements=None, false_positive_rate=None,
2932
hash_function=None):
3033
''' initialize '''
@@ -62,16 +65,6 @@ def elements_added(self):
6265
''' int: The total number of elements added '''
6366
return self.__added_elements
6467

65-
def __add_bloom_filter(self):
66-
''' build a new bloom and add it on! '''
67-
blm = BloomFilter(self.__est_elements, self.__fpr, self.__hash_func)
68-
self._blooms.append(blm)
69-
70-
def __check_for_growth(self):
71-
''' detereming if the bloom filter should automatically grow '''
72-
if self._blooms[-1].elements_added >= self.__est_elements:
73-
self.__add_bloom_filter()
74-
7568
def check(self, key):
7669
''' Check to see if the key is in the Bloom Filter
7770
@@ -103,8 +96,8 @@ def add(self, key, force=False):
10396
Args:
10497
key (str): The element to be inserted
10598
force (bool): `True` will force it to be inserted, even if it \
106-
likely has been inserted before \
107-
`False` will only insert if not found in the Bloom Filter '''
99+
likely has been inserted before `False` will \
100+
only insert if not found in the Bloom Filter '''
108101
hashes = self._blooms[0].hashes(key)
109102
self.add_alt(hashes, force)
110103

@@ -115,8 +108,101 @@ def add_alt(self, hashes, force=False):
115108
hashes (list): A list of integers representing the key to insert
116109
force (bool): `True` will force it to be inserted, even if \
117110
it likely has been inserted before \
118-
`False` will only insert if not found in the Bloom Filter '''
111+
`False` will only insert if not found in the \
112+
Bloom Filter '''
119113
self.__added_elements += 1
120114
if force or not self.check_alt(hashes):
121115
self.__check_for_growth()
122116
self._blooms[-1].add_alt(hashes)
117+
118+
def __add_bloom_filter(self):
119+
''' build a new bloom and add it on! '''
120+
blm = BloomFilter(est_elements=self.__est_elements,
121+
false_positive_rate=self.__fpr,
122+
hash_function=self.__hash_func)
123+
self._blooms.append(blm)
124+
125+
def __check_for_growth(self):
126+
''' determine if the bloom filter should automatically grow '''
127+
if self._blooms[-1].elements_added >= self.__est_elements:
128+
self.__add_bloom_filter()
129+
130+
131+
class RotatingBloomFilter(ExpandingBloomFilter):
132+
''' Simple Rotating Bloom Filter implementation that allows for the "older"
133+
elements added to be removed, in chunks. As the queue fills up, those
134+
elements inserted earlier will be bulk removed. This also provides the
135+
user with the opportunity to force the removal instead of it being time
136+
based.
137+
138+
Args:
139+
est_elements (int): The number of estimated elements to be added
140+
false_positive_rate (float): The desired false positive rate
141+
max_queue_size (int): This is the number used to determine the \
142+
maximum number of Bloom Filters. Total elements added is based on \
143+
`max_queue_size * est_elements`
144+
hash_function (function): Hashing strategy function to use \
145+
`hf(key, number)`
146+
'''
147+
__slots__ = ['_blooms', '__fpr', '__est_elements', '__hash_func',
148+
'__added_elements', '_queue_size']
149+
150+
def __init__(self, est_elements=None, false_positive_rate=None,
151+
max_queue_size=10, hash_function=None):
152+
''' initialize '''
153+
super(RotatingBloomFilter,
154+
self).__init__(est_elements=est_elements,
155+
false_positive_rate=false_positive_rate,
156+
hash_function=hash_function)
157+
self.__fpr = false_positive_rate
158+
self.__est_elements = est_elements
159+
self.__hash_func = hash_function
160+
self._queue_size = max_queue_size
161+
self.__added_elements = 0
162+
163+
@property
164+
def max_queue_size(self):
165+
''' int: The maximum size for the queue '''
166+
return self._queue_size
167+
168+
@property
169+
def current_queue_size(self):
170+
''' int: The current size of the queue '''
171+
return len(self._blooms)
172+
173+
def add_alt(self, hashes, force=False):
174+
''' Add the element represented by hashes into the Bloom Filter
175+
176+
Args:
177+
hashes (list): A list of integers representing the key to insert
178+
force (bool): `True` will force it to be inserted, even if \
179+
it likely has been inserted before \
180+
`False` will only insert if not found in the \
181+
Bloom Filter '''
182+
self.__added_elements += 1
183+
if force or not self.check_alt(hashes):
184+
self.__rotate_bloom_filter()
185+
self._blooms[-1].add_alt(hashes)
186+
187+
def pop(self):
188+
''' Pop an element off of the queue '''
189+
self.__rotate_bloom_filter(force=True)
190+
191+
def __rotate_bloom_filter(self, force=False):
192+
''' handle determining if/when the Bloom Filter queue needs to be
193+
rotated '''
194+
blm = self._blooms[-1]
195+
ready_to_rotate = blm.elements_added == blm.estimated_elements
196+
neeeds_to_pop = self.current_queue_size < self._queue_size
197+
if force or (ready_to_rotate and neeeds_to_pop):
198+
self.__add_bloom_filter()
199+
elif force or ready_to_rotate:
200+
blm = self._blooms.pop(0)
201+
self.__add_bloom_filter()
202+
203+
def __add_bloom_filter(self):
204+
''' build a new bloom and add it on! '''
205+
blm = BloomFilter(est_elements=self.__est_elements,
206+
false_positive_rate=self.__fpr,
207+
hash_function=self.__hash_func)
208+
self._blooms.append(blm)

tests/cuckoo_test.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -114,23 +114,23 @@ def test_cuckoo_filter_fing_size(self):
114114
''' test bad fingerprint size < 1 '''
115115
def runner():
116116
''' runner '''
117-
cko = CuckooFilter(capacity=100, bucket_size=2, finger_size=0)
117+
CuckooFilter(capacity=100, bucket_size=2, finger_size=0)
118118

119119
self.assertRaises(ValueError, runner)
120120

121121
def test_cuckoo_filter_fing_size_2(self):
122122
''' test bad fingerprint size > 4 '''
123123
def runner():
124124
''' runner '''
125-
cko = CuckooFilter(capacity=100, bucket_size=2, finger_size=5)
125+
CuckooFilter(capacity=100, bucket_size=2, finger_size=5)
126126

127127
self.assertRaises(ValueError, runner)
128128

129129
def test_cuckoo_filter_fing_size_3(self):
130130
''' test valid fingerprint size '''
131131
try:
132-
cko = CuckooFilter(capacity=100, bucket_size=2, finger_size=1)
133-
except:
132+
CuckooFilter(capacity=100, bucket_size=2, finger_size=1)
133+
except ValueError:
134134
self.assertEqual(True, False)
135135
self.assertEqual(True, True)
136136

tests/expandingbloom_test.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
''' Unittest class '''
33
from __future__ import (unicode_literals, absolute_import, print_function)
44
import unittest
5-
from probables import (ExpandingBloomFilter)
5+
from probables import (ExpandingBloomFilter, RotatingBloomFilter)
66

77
class TestExpandingBloomFilter(unittest.TestCase):
88

@@ -54,3 +54,46 @@ def test_ebf_contains(self):
5454
self.assertEqual('this is another test' in blm, True)
5555
self.assertEqual('this is yet another test' in blm, False)
5656
self.assertEqual('this is not another test' in blm, False)
57+
58+
59+
class TestRotatingBloomFilter(unittest.TestCase):
60+
61+
def test_rbf_init(self):
62+
''' test the initialization of a rotating bloom filter '''
63+
blm = RotatingBloomFilter(est_elements=10, false_positive_rate=0.05,
64+
max_queue_size=10)
65+
self.assertEqual(blm.expansions, 0)
66+
self.assertEqual(blm.max_queue_size, 10)
67+
68+
def test_rfb_rotate(self):
69+
''' test that the bloom filter rotates the first bloom off the stack '''
70+
blm = RotatingBloomFilter(est_elements=10, false_positive_rate=0.05,
71+
max_queue_size=5)
72+
self.assertEqual(blm.expansions, 0)
73+
blm.add('test')
74+
self.assertEqual(blm.expansions, 0)
75+
for i in range(10):
76+
blm.add('{}'.format(i), force=True)
77+
self.assertEqual(blm.expansions, 1)
78+
self.assertEqual(blm.current_queue_size, 2)
79+
self.assertEqual(blm.check('test'), True)
80+
81+
for i in range(10, 20):
82+
blm.add('{}'.format(i), force=True)
83+
self.assertEqual(blm.check('test'), True)
84+
self.assertEqual(blm.current_queue_size, 3)
85+
86+
for i in range(20, 30):
87+
blm.add('{}'.format(i), force=True)
88+
self.assertEqual(blm.check('test'), True)
89+
self.assertEqual(blm.current_queue_size, 4)
90+
91+
for i in range(30, 40):
92+
blm.add('{}'.format(i), force=True)
93+
self.assertEqual(blm.check('test'), True)
94+
self.assertEqual(blm.current_queue_size, 5)
95+
96+
for i in range(40, 50):
97+
blm.add('{}'.format(i), force=True)
98+
self.assertEqual(blm.check('test'), False) # it should roll off
99+
self.assertEqual(blm.current_queue_size, 5)

0 commit comments

Comments
 (0)