@@ -29,6 +29,10 @@ def __init__(self, blm_type, est_elements=None, false_positive_rate=None,
2929 self ._els_added = 0
3030 self ._on_disk = False # not on disk
3131 self .__blm_type = blm_type
32+ if self .__blm_type in ['regular' , 'reg-ondisk' ]:
33+ self .__impt_type = 'B'
34+ else :
35+ self .__impt_type = 'I'
3236
3337 if blm_type in ['regular' , 'reg-ondisk' ]:
3438 msg = ('Insufecient parameters to set up the Bloom Filter' )
@@ -105,9 +109,15 @@ def elements_added(self):
105109 ''' int: Number of elements added to the Bloom Filter
106110
107111 Note:
108- Not settable '''
112+ Changing this can cause the current false positive rate to \
113+ be reported incorrectly '''
109114 return self ._els_added
110115
116+ @elements_added .setter
117+ def elements_added (self , val ):
118+ ''' Set the number of elements added to the Bloom Filter '''
119+ self ._els_added = val
120+
111121 @property
112122 def is_on_disk (self ):
113123 ''' bool: Is the Bloom Filter on Disk or not
@@ -124,6 +134,11 @@ def bloom_length(self):
124134 Not settable '''
125135 return self .__bloom_length
126136
137+ @property
138+ def bloom (self ):
139+ ''' list(int): The bit/int array '''
140+ return self ._bloom
141+
127142 @property
128143 def hash_function (self ):
129144 ''' function: The hash function used
@@ -179,15 +194,13 @@ def __load(self, blm_type, filename, hash_function=None):
179194 self .__number_hashes = vals [2 ]
180195 self .__num_bits = vals [3 ]
181196 if blm_type in ['regular' , 'reg-ondisk' ]:
182- impt_type = 'B'
183197 self .__bloom_length = int (math .ceil (self .__num_bits / 8.0 ))
184198 else :
185- impt_type = 'I'
186199 self .__bloom_length = self .number_bits
187200 # now read in the bit array!
188201 filepointer .seek (0 , os .SEEK_SET )
189- offset = calcsize (impt_type ) * self .bloom_length
190- rep = impt_type * self .bloom_length
202+ offset = calcsize (self . __impt_type ) * self .bloom_length
203+ rep = self . __impt_type * self .bloom_length
191204 self ._bloom = list (unpack (rep , filepointer .read (offset )))
192205
193206 def _load_hex (self , hex_string , hash_function = None ):
@@ -202,14 +215,12 @@ def _load_hex(self, hex_string, hash_function=None):
202215 self .__number_hashes = vals [2 ]
203216 self .__num_bits = vals [3 ]
204217 if self .__blm_type in ['regular' , 'reg-ondisk' ]:
205- impt_type = 'B'
206218 self .__bloom_length = int (math .ceil (self .__num_bits / 8.0 ))
207219 else :
208- impt_type = 'B'
209220 self .__bloom_length = self .number_bits
210221
211222 tmp_bloom = unhexlify (hex_string [:- offset ])
212- rep = impt_type * self .bloom_length
223+ rep = self . __impt_type * self .bloom_length
213224 self ._bloom = list (unpack (rep , tmp_bloom ))
214225
215226 def export_hex (self ):
@@ -220,7 +231,13 @@ def export_hex(self):
220231 '''
221232 mybytes = pack ('>QQf' , self .estimated_elements ,
222233 self .elements_added , self .false_positive_rate )
223- bytes_string = hexlify (bytearray (self ._bloom )) + hexlify (mybytes )
234+ if self .__blm_type in ['regular' , 'reg-ondisk' ]:
235+ bytes_string = hexlify (bytearray (self .bloom )) + hexlify (mybytes )
236+ else :
237+ bytes_string = b''
238+ for val in self .bloom :
239+ bytes_string += hexlify (pack (self .__impt_type , val ))
240+ bytes_string += hexlify (mybytes )
224241 if sys .version_info > (3 , 0 ): # python 3 gives us bytes
225242 return str (bytes_string , 'utf-8' )
226243 return bytes_string
@@ -233,12 +250,8 @@ def export(self, filename):
233250 be written.
234251 '''
235252 with open (filename , 'wb' ) as filepointer :
236- if self .__blm_type == 'regular' or self .__blm_type is 'regular' :
237- impt_type = 'B'
238- else :
239- impt_type = 'I'
240- rep = impt_type * self .bloom_length
241- filepointer .write (pack (rep , * self ._bloom ))
253+ rep = self .__impt_type * self .bloom_length
254+ filepointer .write (pack (rep , * self .bloom ))
242255 filepointer .write (pack ('QQf' , self .estimated_elements ,
243256 self .elements_added ,
244257 self .false_positive_rate ))
@@ -249,11 +262,7 @@ def export_size(self):
249262 Returns:
250263 int: Size of the Bloom Filter when exported to disk
251264 '''
252- if self .__blm_type == 'regular' or self .__blm_type is 'regular' :
253- impt_type = 'B'
254- else :
255- impt_type = 'I'
256- tmp_b = calcsize (impt_type )
265+ tmp_b = calcsize (self .__impt_type )
257266 return (self .bloom_length * tmp_b ) + calcsize ('QQf' )
258267
259268 def current_false_positive_rate (self ):
@@ -360,31 +369,10 @@ def intersection(self, second):
360369 two '''
361370 pass
362371
372+ @abstractmethod
363373 def jaccard_index (self , second ):
364- ''' Calculate the jaccard similarity score between two Bloom Filters
365-
366- Args:
367- second (BloomFilter): The Bloom Filter to compare with
368- Returns:
369- float: A numeric value between 0 and 1 where 1 is identical \
370- and 0 means completely different
371- Note:
372- `second` may be a BloomFilterOnDisk object
373- '''
374- self ._verify_not_type_mismatch (second )
375-
376- if self ._verify_bloom_similarity (second ) is False :
377- return None
378- count_union = 0
379- count_int = 0
380- for i in list (range (0 , self .bloom_length )):
381- t_union = self ._get_element (i ) | second ._get_element (i )
382- t_intersection = self ._get_element (i ) & second ._get_element (i )
383- count_union += self .__cnt_set_bits (t_union )
384- count_int += self .__cnt_set_bits (t_intersection )
385- if count_union == 0 :
386- return 1.0
387- return count_int / count_union
374+ ''' Return the Jaccard similarity score between two Bloom Filters '''
375+ pass
388376
389377 def _verify_bloom_similarity (self , second ):
390378 ''' can the blooms be used in intersection, union, or jaccard index '''
@@ -394,9 +382,3 @@ def _verify_bloom_similarity(self, second):
394382 if hash_match or same_bits or next_hash :
395383 return False
396384 return True
397-
398- @staticmethod
399- def _verify_not_type_mismatch (second ):
400- ''' verify that there is not a type mismatch '''
401- if not isinstance (second , BaseBloom ):
402- raise TypeError ('The parameter second must be of type BloomFilter' )
0 commit comments