Skip to content

Commit bd2fd2e

Browse files
authored
0.0.7 (#17)
* fix counting bloom hex
* added remove to counting bloom
* add some missing tests
* fix for overflow
1 parent 14abcd3 commit bd2fd2e

File tree

9 files changed

+376
-107
lines changed

9 files changed

+376
-107
lines changed

CHANGELOG.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,18 @@
11
# PyProbables Changelog
22

3+
### Version 0.0.7:
4+
* Counting Bloom Filter
5+
* Fix counting bloom hex export / import
6+
* Fix for overflow issue in counting bloom export
7+
* Added ability to remove from counting bloom
8+
* Count-Min Sketch
9+
* Fix for large numbers of inserts and deletions not being recorded correctly
10+
311
### Version 0.0.6:
412
* Probabilistic data structures added:
513
* Counting Bloom Filter
614
* Minor code clean-up
7-
* Re-factored Bloom Filters
15+
* Re-factored Bloom Filters
816

917
### Version 0.0.5:
1018
* Better on-line documentation

probables/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
__maintainer__ = 'Tyler Barrus'
1111
__email__ = '[email protected]'
1212
__license__ = 'MIT'
13-
__version__ = '0.0.6'
13+
__version__ = '0.0.7'
1414
__credits__ = []
1515
__url__ = 'https://github.com/barrust/pyprobables'
1616
__bugtrack_url__ = 'https://github.com/barrust/pyprobables/issues'

probables/blooms/basebloom.py

Lines changed: 32 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@ def __init__(self, blm_type, est_elements=None, false_positive_rate=None,
2929
self._els_added = 0
3030
self._on_disk = False # not on disk
3131
self.__blm_type = blm_type
32+
if self.__blm_type in ['regular', 'reg-ondisk']:
33+
self.__impt_type = 'B'
34+
else:
35+
self.__impt_type = 'I'
3236

3337
if blm_type in ['regular', 'reg-ondisk']:
3438
msg = ('Insufecient parameters to set up the Bloom Filter')
@@ -105,9 +109,15 @@ def elements_added(self):
105109
''' int: Number of elements added to the Bloom Filter
106110
107111
Note:
108-
Not settable '''
112+
Changing this can cause the current false positive rate to \
113+
be reported incorrectly '''
109114
return self._els_added
110115

116+
@elements_added.setter
117+
def elements_added(self, val):
118+
''' set the els added '''
119+
self._els_added = val
120+
111121
@property
112122
def is_on_disk(self):
113123
''' bool: Is the Bloom Filter on Disk or not
@@ -124,6 +134,11 @@ def bloom_length(self):
124134
Not settable '''
125135
return self.__bloom_length
126136

137+
@property
138+
def bloom(self):
139+
''' list(int): The bit/int array '''
140+
return self._bloom
141+
127142
@property
128143
def hash_function(self):
129144
''' function: The hash function used
@@ -179,15 +194,13 @@ def __load(self, blm_type, filename, hash_function=None):
179194
self.__number_hashes = vals[2]
180195
self.__num_bits = vals[3]
181196
if blm_type in ['regular', 'reg-ondisk']:
182-
impt_type = 'B'
183197
self.__bloom_length = int(math.ceil(self.__num_bits / 8.0))
184198
else:
185-
impt_type = 'I'
186199
self.__bloom_length = self.number_bits
187200
# now read in the bit array!
188201
filepointer.seek(0, os.SEEK_SET)
189-
offset = calcsize(impt_type) * self.bloom_length
190-
rep = impt_type * self.bloom_length
202+
offset = calcsize(self.__impt_type) * self.bloom_length
203+
rep = self.__impt_type * self.bloom_length
191204
self._bloom = list(unpack(rep, filepointer.read(offset)))
192205

193206
def _load_hex(self, hex_string, hash_function=None):
@@ -202,14 +215,12 @@ def _load_hex(self, hex_string, hash_function=None):
202215
self.__number_hashes = vals[2]
203216
self.__num_bits = vals[3]
204217
if self.__blm_type in ['regular', 'reg-ondisk']:
205-
impt_type = 'B'
206218
self.__bloom_length = int(math.ceil(self.__num_bits / 8.0))
207219
else:
208-
impt_type = 'B'
209220
self.__bloom_length = self.number_bits
210221

211222
tmp_bloom = unhexlify(hex_string[:-offset])
212-
rep = impt_type * self.bloom_length
223+
rep = self.__impt_type * self.bloom_length
213224
self._bloom = list(unpack(rep, tmp_bloom))
214225

215226
def export_hex(self):
@@ -220,7 +231,13 @@ def export_hex(self):
220231
'''
221232
mybytes = pack('>QQf', self.estimated_elements,
222233
self.elements_added, self.false_positive_rate)
223-
bytes_string = hexlify(bytearray(self._bloom)) + hexlify(mybytes)
234+
if self.__blm_type in ['regular', 'reg-ondisk']:
235+
bytes_string = hexlify(bytearray(self.bloom)) + hexlify(mybytes)
236+
else:
237+
bytes_string = b''
238+
for val in self.bloom:
239+
bytes_string += hexlify(pack(self.__impt_type, val))
240+
bytes_string += hexlify(mybytes)
224241
if sys.version_info > (3, 0): # python 3 gives us bytes
225242
return str(bytes_string, 'utf-8')
226243
return bytes_string
@@ -233,12 +250,8 @@ def export(self, filename):
233250
be written.
234251
'''
235252
with open(filename, 'wb') as filepointer:
236-
if self.__blm_type == 'regular' or self.__blm_type is 'regular':
237-
impt_type = 'B'
238-
else:
239-
impt_type = 'I'
240-
rep = impt_type * self.bloom_length
241-
filepointer.write(pack(rep, *self._bloom))
253+
rep = self.__impt_type * self.bloom_length
254+
filepointer.write(pack(rep, *self.bloom))
242255
filepointer.write(pack('QQf', self.estimated_elements,
243256
self.elements_added,
244257
self.false_positive_rate))
@@ -249,11 +262,7 @@ def export_size(self):
249262
Returns:
250263
int: Size of the Bloom Filter when exported to disk
251264
'''
252-
if self.__blm_type == 'regular' or self.__blm_type is 'regular':
253-
impt_type = 'B'
254-
else:
255-
impt_type = 'I'
256-
tmp_b = calcsize(impt_type)
265+
tmp_b = calcsize(self.__impt_type)
257266
return (self.bloom_length * tmp_b) + calcsize('QQf')
258267

259268
def current_false_positive_rate(self):
@@ -360,31 +369,10 @@ def intersection(self, second):
360369
two '''
361370
pass
362371

372+
@abstractmethod
363373
def jaccard_index(self, second):
364-
''' Calculate the jaccard similarity score between two Bloom Filters
365-
366-
Args:
367-
second (BloomFilter): The Bloom Filter to compare with
368-
Returns:
369-
float: A numeric value between 0 and 1 where 1 is identical \
370-
and 0 means completely different
371-
Note:
372-
`second` may be a BloomFilterOnDisk object
373-
'''
374-
self._verify_not_type_mismatch(second)
375-
376-
if self._verify_bloom_similarity(second) is False:
377-
return None
378-
count_union = 0
379-
count_int = 0
380-
for i in list(range(0, self.bloom_length)):
381-
t_union = self._get_element(i) | second._get_element(i)
382-
t_intersection = self._get_element(i) & second._get_element(i)
383-
count_union += self.__cnt_set_bits(t_union)
384-
count_int += self.__cnt_set_bits(t_intersection)
385-
if count_union == 0:
386-
return 1.0
387-
return count_int / count_union
374+
''' Return the Jaccard Similarity score between two bloom filters '''
375+
pass
388376

389377
def _verify_bloom_similarity(self, second):
390378
''' can the blooms be used in intersection, union, or jaccard index '''
@@ -394,9 +382,3 @@ def _verify_bloom_similarity(self, second):
394382
if hash_match or same_bits or next_hash:
395383
return False
396384
return True
397-
398-
@staticmethod
399-
def _verify_not_type_mismatch(second):
400-
''' verify that there is not a type mismatch '''
401-
if not isinstance(second, BaseBloom):
402-
raise TypeError('The parameter second must be of type BloomFilter')

0 commit comments

Comments
 (0)