@@ -29,6 +29,10 @@ def __init__(self, blm_type, est_elements=None, false_positive_rate=None,
29
29
self ._els_added = 0
30
30
self ._on_disk = False # not on disk
31
31
self .__blm_type = blm_type
32
+ if self .__blm_type in ['regular' , 'reg-ondisk' ]:
33
+ self .__impt_type = 'B'
34
+ else :
35
+ self .__impt_type = 'I'
32
36
33
37
if blm_type in ['regular' , 'reg-ondisk' ]:
34
38
msg = ('Insufecient parameters to set up the Bloom Filter' )
@@ -105,9 +109,15 @@ def elements_added(self):
105
109
''' int: Number of elements added to the Bloom Filter
106
110
107
111
Note:
108
- Not settable '''
112
+ Changing this can cause the current false positive rate to \
113
+ be reported incorrectly '''
109
114
return self ._els_added
110
115
116
+ @elements_added .setter
117
+ def elements_added (self , val ):
118
+ ''' Set the number of elements added '''
119
+ self ._els_added = val
120
+
111
121
@property
112
122
def is_on_disk (self ):
113
123
''' bool: Is the Bloom Filter on Disk or not
@@ -124,6 +134,11 @@ def bloom_length(self):
124
134
Not settable '''
125
135
return self .__bloom_length
126
136
137
+ @property
138
+ def bloom (self ):
139
+ ''' list(int): The bit/int array '''
140
+ return self ._bloom
141
+
127
142
@property
128
143
def hash_function (self ):
129
144
''' function: The hash function used
@@ -179,15 +194,13 @@ def __load(self, blm_type, filename, hash_function=None):
179
194
self .__number_hashes = vals [2 ]
180
195
self .__num_bits = vals [3 ]
181
196
if blm_type in ['regular' , 'reg-ondisk' ]:
182
- impt_type = 'B'
183
197
self .__bloom_length = int (math .ceil (self .__num_bits / 8.0 ))
184
198
else :
185
- impt_type = 'I'
186
199
self .__bloom_length = self .number_bits
187
200
# now read in the bit array!
188
201
filepointer .seek (0 , os .SEEK_SET )
189
- offset = calcsize (impt_type ) * self .bloom_length
190
- rep = impt_type * self .bloom_length
202
+ offset = calcsize (self . __impt_type ) * self .bloom_length
203
+ rep = self . __impt_type * self .bloom_length
191
204
self ._bloom = list (unpack (rep , filepointer .read (offset )))
192
205
193
206
def _load_hex (self , hex_string , hash_function = None ):
@@ -202,14 +215,12 @@ def _load_hex(self, hex_string, hash_function=None):
202
215
self .__number_hashes = vals [2 ]
203
216
self .__num_bits = vals [3 ]
204
217
if self .__blm_type in ['regular' , 'reg-ondisk' ]:
205
- impt_type = 'B'
206
218
self .__bloom_length = int (math .ceil (self .__num_bits / 8.0 ))
207
219
else :
208
- impt_type = 'B'
209
220
self .__bloom_length = self .number_bits
210
221
211
222
tmp_bloom = unhexlify (hex_string [:- offset ])
212
- rep = impt_type * self .bloom_length
223
+ rep = self . __impt_type * self .bloom_length
213
224
self ._bloom = list (unpack (rep , tmp_bloom ))
214
225
215
226
def export_hex (self ):
@@ -220,7 +231,13 @@ def export_hex(self):
220
231
'''
221
232
mybytes = pack ('>QQf' , self .estimated_elements ,
222
233
self .elements_added , self .false_positive_rate )
223
- bytes_string = hexlify (bytearray (self ._bloom )) + hexlify (mybytes )
234
+ if self .__blm_type in ['regular' , 'reg-ondisk' ]:
235
+ bytes_string = hexlify (bytearray (self .bloom )) + hexlify (mybytes )
236
+ else :
237
+ bytes_string = b''
238
+ for val in self .bloom :
239
+ bytes_string += hexlify (pack (self .__impt_type , val ))
240
+ bytes_string += hexlify (mybytes )
224
241
if sys .version_info > (3 , 0 ): # python 3 gives us bytes
225
242
return str (bytes_string , 'utf-8' )
226
243
return bytes_string
@@ -233,12 +250,8 @@ def export(self, filename):
233
250
be written.
234
251
'''
235
252
with open (filename , 'wb' ) as filepointer :
236
- if self .__blm_type == 'regular' or self .__blm_type is 'regular' :
237
- impt_type = 'B'
238
- else :
239
- impt_type = 'I'
240
- rep = impt_type * self .bloom_length
241
- filepointer .write (pack (rep , * self ._bloom ))
253
+ rep = self .__impt_type * self .bloom_length
254
+ filepointer .write (pack (rep , * self .bloom ))
242
255
filepointer .write (pack ('QQf' , self .estimated_elements ,
243
256
self .elements_added ,
244
257
self .false_positive_rate ))
@@ -249,11 +262,7 @@ def export_size(self):
249
262
Returns:
250
263
int: Size of the Bloom Filter when exported to disk
251
264
'''
252
- if self .__blm_type == 'regular' or self .__blm_type is 'regular' :
253
- impt_type = 'B'
254
- else :
255
- impt_type = 'I'
256
- tmp_b = calcsize (impt_type )
265
+ tmp_b = calcsize (self .__impt_type )
257
266
return (self .bloom_length * tmp_b ) + calcsize ('QQf' )
258
267
259
268
def current_false_positive_rate (self ):
@@ -360,31 +369,10 @@ def intersection(self, second):
360
369
two '''
361
370
pass
362
371
372
+ @abstractmethod
363
373
def jaccard_index (self , second ):
364
- ''' Calculate the jaccard similarity score between two Bloom Filters
365
-
366
- Args:
367
- second (BloomFilter): The Bloom Filter to compare with
368
- Returns:
369
- float: A numeric value between 0 and 1 where 1 is identical \
370
- and 0 means completely different
371
- Note:
372
- `second` may be a BloomFilterOnDisk object
373
- '''
374
- self ._verify_not_type_mismatch (second )
375
-
376
- if self ._verify_bloom_similarity (second ) is False :
377
- return None
378
- count_union = 0
379
- count_int = 0
380
- for i in list (range (0 , self .bloom_length )):
381
- t_union = self ._get_element (i ) | second ._get_element (i )
382
- t_intersection = self ._get_element (i ) & second ._get_element (i )
383
- count_union += self .__cnt_set_bits (t_union )
384
- count_int += self .__cnt_set_bits (t_intersection )
385
- if count_union == 0 :
386
- return 1.0
387
- return count_int / count_union
374
+ ''' Return the Jaccard Similarity score between two bloom filters '''
375
+ pass
388
376
389
377
def _verify_bloom_similarity (self , second ):
390
378
''' can the blooms be used in intersection, union, or jaccard index '''
@@ -394,9 +382,3 @@ def _verify_bloom_similarity(self, second):
394
382
if hash_match or same_bits or next_hash :
395
383
return False
396
384
return True
397
-
398
- @staticmethod
399
- def _verify_not_type_mismatch (second ):
400
- ''' verify that there is not a type mismatch '''
401
- if not isinstance (second , BaseBloom ):
402
- raise TypeError ('The parameter second must be of type BloomFilter' )
0 commit comments