Skip to content

Commit 27ae06d

Browse files
authored
v0.5.3 (#85)
* additional enhancements; update changelog; version bump
1 parent 9ccbaa2 commit 27ae06d

File tree

12 files changed

+150
-72
lines changed

12 files changed

+150
-72
lines changed

CHANGELOG.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,19 @@
11
# PyProbables Changelog
22

3+
### Version 0.5.3
4+
* Additional type hinting
5+
* Improved format parsing and serialization; [see PR#81](https://github.com/barrust/pyprobables/pull/81). Thanks [@KOLANICH](https://github.com/KOLANICH)
6+
* Bloom Filters
7+
* Added `export_hex` functionality for Bloom Filters on Disk
8+
* Export as C header (**\*.h**) for Bloom Filters on Disk and Counting Bloom Filters
9+
* Added support for more input types for exporting and loading of saved files
10+
11+
312
### Version 0.5.2
413
* Add ability to hash bytes along with strings
514
* Make all tests files individually executable from the CLI. Thanks [@KOLANICH](https://github.com/KOLANICH)
615
* Added type hints
716

8-
917
### Version 0.5.1
1018
* Bloom Filter:
1119
* Export as a C header (**\*.h**)

docs/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
sphinx>=3.0

docs/source/conf.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@
4242
"sphinx.ext.todo",
4343
]
4444

45+
# Show type hints in the parameter descriptions instead of the signature
46+
autodoc_typehints = "description"
47+
4548
# Add any paths that contain templates here, relative to this directory.
4649
templates_path = ["_templates"]
4750

@@ -55,8 +58,8 @@
5558
master_doc = "index"
5659

5760
# General information about the project.
58-
project = u"probables"
59-
copyright = u"2017, Tyler Barrus"
61+
project = "probables"
62+
copyright = "2017, Tyler Barrus"
6063
author = probables.__author__
6164

6265
# The version info for the project you're documenting, acts as replacement for
@@ -137,8 +140,8 @@
137140
(
138141
master_doc,
139142
"pyprobables.tex",
140-
u"pyprobables Documentation",
141-
u"Tyler Barrus",
143+
"pyprobables Documentation",
144+
"Tyler Barrus",
142145
"manual",
143146
),
144147
]
@@ -148,7 +151,7 @@
148151

149152
# One entry per manual page. List of tuples
150153
# (source start file, name, description, authors, manual section).
151-
man_pages = [(master_doc, "pyprobables", u"pyprobables Documentation", [author], 1)]
154+
man_pages = [(master_doc, "pyprobables", "pyprobables Documentation", [author], 1)]
152155

153156

154157
# -- Options for Texinfo output -------------------------------------------
@@ -160,7 +163,7 @@
160163
(
161164
master_doc,
162165
"pyprobables",
163-
u"pyprobables Documentation",
166+
"pyprobables Documentation",
164167
author,
165168
"pyprobables",
166169
"One line description of project.",

probables/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
__maintainer__ = "Tyler Barrus"
2828
__email__ = "[email protected]"
2929
__license__ = "MIT"
30-
__version__ = "0.5.2"
30+
__version__ = "0.5.3"
3131
__credits__ = [] # type: ignore
3232
__url__ = "https://github.com/barrust/pyprobables"
3333
__bugtrack_url__ = "https://github.com/barrust/pyprobables/issues"

probables/blooms/basebloom.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,10 @@ def _set_optimized_params(
226226
HEADER_STRUCT_BE = Struct(">" + HEADER_STRUCT_FORMAT)
227227

228228
def __load(
229-
self, blm_type: str, file: typing.Union[Path, str, IOBase], hash_function: typing.Optional[HashFuncT] = None
229+
self,
230+
blm_type: str,
231+
file: typing.Union[Path, str, IOBase, mmap],
232+
hash_function: typing.Optional[HashFuncT] = None,
230233
) -> None:
231234
"""load the Bloom Filter from file"""
232235
# read in the needed information, and then call _set_optimized_params
@@ -290,7 +293,7 @@ def export_hex(self) -> str:
290293
self.false_positive_rate,
291294
)
292295
if self.__blm_type in ["regular", "reg-ondisk"]:
293-
bytes_string = hexlify(bytearray(self.bloom)) + hexlify(mybytes)
296+
bytes_string = hexlify(bytearray(self.bloom[: self.bloom_length])) + hexlify(mybytes)
294297
else:
295298
bytes_string = b""
296299
for val in self.bloom:
@@ -331,13 +334,13 @@ def export_c_header(self, filename: str) -> None:
331334
Args:
332335
filename (str): The filename to which the Bloom Filter will \
333336
be written. """
334-
trailer = self.__class__.HEADER_STRUCT_BE.pack(
335-
self.estimated_elements,
336-
self.elements_added,
337-
self.false_positive_rate,
337+
data = (
338+
" " + line
339+
for line in wrap(", ".join(("0x{:02x}".format(e) for e in bytearray.fromhex(self.export_hex()))), 80)
338340
)
339-
data = (" " + line for line in wrap(", ".join(("0x{:02x}".format(e) for e in chain(self.bloom, trailer))), 80))
341+
bloom_type = "standard BloomFilter" if self.__blm_type in ("regular", "reg-ondisk") else "CountingBloomFilter"
340342
with open(filename, "w") as file:
343+
print("/* BloomFilter Export of a {} */".format(bloom_type), file=file)
341344
print("#include <inttypes.h>", file=file)
342345
print("const uint64_t estimated_elements = ", self.estimated_elements, ";", sep="", file=file)
343346
print("const uint64_t elements_added = ", self.elements_added, ";", sep="", file=file)

probables/blooms/bloom.py

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import mmap
88
import os
99
import typing
10+
from pathlib import Path
1011
from shutil import copyfile
1112
from struct import calcsize, pack, unpack
1213

@@ -251,7 +252,7 @@ class BloomFilterOnDisk(BaseBloom):
251252

252253
def __init__(
253254
self,
254-
filepath: str,
255+
filepath: typing.Union[str, Path],
255256
est_elements: typing.Optional[int] = None,
256257
false_positive_rate: typing.Optional[float] = None,
257258
hex_string: typing.Optional[str] = None,
@@ -271,7 +272,7 @@ def __init__(
271272
pass
272273

273274
self.__file_pointer = None
274-
self.__filename = None
275+
self.__filename = Path(filepath)
275276
self.__export_offset = calcsize("Qf")
276277
self._on_disk = True
277278

@@ -315,7 +316,7 @@ def close(self) -> None:
315316
self.__file_pointer.close()
316317
self.__file_pointer = None
317318

318-
def __load(self, filepath: str, hash_function: typing.Optional[HashFuncT] = None):
319+
def __load(self, filepath: typing.Union[str, Path], hash_function: typing.Optional[HashFuncT] = None):
319320
"""load the Bloom Filter on disk"""
320321
# read the file, set the optimal params
321322
# mmap everything
@@ -333,9 +334,9 @@ def __load(self, filepath: str, hash_function: typing.Optional[HashFuncT] = None
333334
self.__file_pointer = open(filepath, "r+b") # type: ignore
334335
self._bloom = mmap.mmap(self.__file_pointer.fileno(), 0) # type: ignore
335336
self._on_disk = True
336-
self.__filename = filepath # type: ignore
337+
self.__filename = Path(filepath)
337338

338-
def export(self, filename: str) -> None: # type: ignore
339+
def export(self, filename: typing.Union[str, Path]) -> None: # type: ignore
339340
""" Export to disk if a different location
340341
341342
Args:
@@ -344,9 +345,10 @@ def export(self, filename: str) -> None: # type: ignore
344345
Note:
345346
Only exported if the filename is not the original filename """
346347
self.__update()
347-
if filename != self.__filename:
348+
filename = Path(filename)
349+
if filename.name != self.__filename.name:
348350
# setup the new bloom filter
349-
copyfile(self.__filename, filename)
351+
copyfile(self.__filename.name, filename.name)
350352
# otherwise, nothing to do!
351353

352354
def add_alt(self, hashes: HashResultsT) -> None:
@@ -425,15 +427,6 @@ def jaccard_index(self, second: SimpleBloomT) -> typing.Optional[float]:
425427
return None
426428
return _tmp_jaccard_index(self, second)
427429

428-
def export_hex(self) -> str:
429-
""" Export to a hex string
430-
431-
Raises:
432-
NotSupportedError: This functionality is currently not \
433-
supported """
434-
msg = "`export_hex` is currently not supported by the on disk Bloom Filter"
435-
raise NotSupportedError(msg)
436-
437430
def _load_hex(self, hex_string: str, hash_function: typing.Optional[HashFuncT] = None):
438431
"""load from hex ..."""
439432
msg = "Loading from hex_string is currently not supported by the on disk Bloom Filter"

probables/countminsketch/countminsketch.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,7 @@ def join(self, second: "CountMinSketch") -> None:
381381
elif self.elements_added < INT64_T_MIN:
382382
self.__elements_added = INT64_T_MIN
383383

384-
def __load(self, file: typing.Union[Path, str, IOBase]):
384+
def __load(self, file: typing.Union[Path, str, IOBase, mmap]):
385385
"""load the count-min sketch from file"""
386386
if not isinstance(file, (IOBase, mmap)):
387387
file = Path(file)

probables/cuckoo/countingcuckoo.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from ..exceptions import CuckooFilterFullError
1515
from ..hashes import KeyT, SimpleHashT
16+
from ..utilities import MMap
1617
from .cuckoo import CuckooFilter
1718

1819

@@ -215,25 +216,29 @@ def _check_if_present(self, idx_1: int, idx_2: int, fingerprint: int) -> typing.
215216
return idx_2
216217
return None
217218

218-
def _load(self, filename: typing.Union[Path, str]) -> None:
219+
def _load(self, file: typing.Union[Path, str, IOBase, mmap]) -> None:
219220
"""load a cuckoo filter from file"""
220-
with open(filename, "rb") as filepointer:
221+
if not isinstance(file, (IOBase, mmap)):
222+
file = Path(file)
223+
with MMap(file) as filepointer:
224+
self._load(filepointer)
225+
else:
221226
offset = calcsize("II")
222227
int_size = calcsize("II")
223-
filepointer.seek(offset * -1, os.SEEK_END)
224-
list_size = filepointer.tell()
225-
mybytes = unpack("II", filepointer.read(offset))
228+
file.seek(offset * -1, os.SEEK_END)
229+
list_size = file.tell()
230+
mybytes = unpack("II", file.read(offset))
226231
self._bucket_size = mybytes[0]
227232
self.__max_cuckoo_swaps = mybytes[1]
228233
self._cuckoo_capacity = list_size // int_size // self.bucket_size
229234
self._inserted_elements = 0
230235
# now pull everything in!
231-
filepointer.seek(0, os.SEEK_SET)
236+
file.seek(0, os.SEEK_SET)
232237
self._buckets = list()
233238
for i in range(self.capacity):
234239
self.buckets.append(list())
235240
for _ in range(self.bucket_size):
236-
finger, count = unpack("II", filepointer.read(int_size))
241+
finger, count = unpack("II", file.read(int_size))
237242
if finger > 0:
238243
ccb = CountingCuckooBin(finger, count)
239244
self.buckets[i].append(ccb)

probables/cuckoo/cuckoo.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -319,14 +319,17 @@ def _insert_fingerprint(self, fingerprint, idx_1, idx_2):
319319
# if we got here we have an error... we might need to know what is left
320320
return fingerprint
321321

322-
def _load(self, filename: typing.Union[Path, str]) -> None:
322+
def _load(self, file: typing.Union[Path, str, IOBase, mmap]) -> None:
323323
"""load a cuckoo filter from file"""
324-
filename = Path(filename)
325-
with MMap(filename) as d:
326-
self._parse_footer(d)
324+
if not isinstance(file, (IOBase, mmap)):
325+
file = Path(file)
326+
with MMap(file) as filepointer:
327+
self._load(filepointer)
328+
else:
329+
self._parse_footer(file) # type: ignore
327330
self._inserted_elements = 0
328331
# now pull everything in!
329-
self._parse_buckets(d)
332+
self._parse_buckets(file) # type: ignore
330333

331334
SINGLE_INT_C = "I"
332335
SINGLE_INT_SIZE = calcsize(SINGLE_INT_C)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "pyprobables"
3-
version = "0.5.2"
3+
version = "0.5.3"
44
description = "Probabilistic data structures in Python"
55
authors = ["Tyler Barrus <[email protected]>"]
66
license = "MIT"

tests/bloom_test.py

Lines changed: 47 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -306,17 +306,18 @@ def test_bf_export_c_header(self):
306306
data = fobj.readlines()
307307
data = [x.strip() for x in data]
308308

309-
self.assertEqual("#include <inttypes.h>", data[0])
310-
self.assertEqual("const uint64_t estimated_elements = {};".format(blm.estimated_elements), data[1])
311-
self.assertEqual("const uint64_t elements_added = {};".format(blm.elements_added), data[2])
312-
self.assertEqual("const float false_positive_rate = {};".format(blm.false_positive_rate), data[3])
313-
self.assertEqual("const uint64_t number_bits = {};".format(blm.number_bits), data[4])
314-
self.assertEqual("const unsigned int number_hashes = {};".format(blm.number_hashes), data[5])
315-
self.assertEqual("const unsigned char bloom[] = {", data[6])
309+
self.assertEqual("/* BloomFilter Export of a standard BloomFilter */", data[0])
310+
self.assertEqual("#include <inttypes.h>", data[1])
311+
self.assertEqual("const uint64_t estimated_elements = {};".format(blm.estimated_elements), data[2])
312+
self.assertEqual("const uint64_t elements_added = {};".format(blm.elements_added), data[3])
313+
self.assertEqual("const float false_positive_rate = {};".format(blm.false_positive_rate), data[4])
314+
self.assertEqual("const uint64_t number_bits = {};".format(blm.number_bits), data[5])
315+
self.assertEqual("const unsigned int number_hashes = {};".format(blm.number_hashes), data[6])
316+
self.assertEqual("const unsigned char bloom[] = {", data[7])
316317
self.assertEqual("};", data[-1])
317318

318319
# rebuild the hex version!
319-
new_hex = "".join([x.strip().replace("0x", "") for x in " ".join(data[7:-1]).split(",")])
320+
new_hex = "".join([x.strip().replace("0x", "") for x in " ".join(data[8:-1]).split(",")])
320321
self.assertEqual(hex_val, new_hex)
321322

322323
def test_bf_load_invalid_hex(self):
@@ -722,26 +723,14 @@ def test_bfod_bytes(self):
722723

723724
def test_bfod_export_hex(self):
724725
"""test exporting an on disk Bloom Filter to a hex string"""
725-
726-
def runner():
727-
"""runner"""
728-
blm = BloomFilterOnDisk(fobj.name, 10, 0.05)
729-
blm.export_hex()
730-
731-
with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as fobj:
732-
self.assertRaises(NotSupportedError, runner)
733-
734-
def test_bfod_export_hex_msg(self):
735-
"""test that page error is thrown correctly"""
726+
hex_val = "6da491461a6bba4d000000000000000a000000000000000a3d4ccccd"
736727
with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as fobj:
737-
blm = BloomFilterOnDisk(fobj.name, 10, 0.05)
738-
try:
739-
blm.export_hex()
740-
except NotSupportedError as ex:
741-
msg = "`export_hex` is currently not supported by the on disk Bloom Filter"
742-
self.assertEqual(str(ex), msg)
743-
else:
744-
self.assertEqual(True, False)
728+
blm = BloomFilterOnDisk(fobj.name, est_elements=10, false_positive_rate=0.05)
729+
for i in range(0, 10):
730+
tmp = "this is a test {0}".format(i)
731+
blm.add(tmp)
732+
hex_out = blm.export_hex()
733+
self.assertEqual(hex_out, hex_val)
745734

746735
def test_bfod_load_hex(self):
747736
"""test that page error is thrown correctly"""
@@ -764,6 +753,37 @@ def test_bfod_load_hex_msg(self):
764753
else:
765754
self.assertEqual(True, False)
766755

756+
def test_bfod_export_c_header(self):
757+
"""test exporting a c header"""
758+
hex_val = "6da491461a6bba4d000000000000000a000000000000000a3d4ccccd"
759+
with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as fobj:
760+
blm = BloomFilterOnDisk(fobj.name, est_elements=10, false_positive_rate=0.05)
761+
for i in range(0, 10):
762+
tmp = "this is a test {0}".format(i)
763+
blm.add(tmp)
764+
with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as fobj:
765+
blm.export_c_header(fobj.name)
766+
767+
# now load the file, parse it and do some tests!
768+
with open(fobj.name, "r") as fobj:
769+
data = fobj.readlines()
770+
771+
data = [x.strip() for x in data]
772+
773+
self.assertEqual("/* BloomFilter Export of a standard BloomFilter */", data[0])
774+
self.assertEqual("#include <inttypes.h>", data[1])
775+
self.assertEqual("const uint64_t estimated_elements = {};".format(blm.estimated_elements), data[2])
776+
self.assertEqual("const uint64_t elements_added = {};".format(blm.elements_added), data[3])
777+
self.assertEqual("const float false_positive_rate = {};".format(blm.false_positive_rate), data[4])
778+
self.assertEqual("const uint64_t number_bits = {};".format(blm.number_bits), data[5])
779+
self.assertEqual("const unsigned int number_hashes = {};".format(blm.number_hashes), data[6])
780+
self.assertEqual("const unsigned char bloom[] = {", data[7])
781+
self.assertEqual("};", data[-1])
782+
783+
# rebuild the hex version!
784+
new_hex = "".join([x.strip().replace("0x", "") for x in " ".join(data[8:-1]).split(",")])
785+
self.assertEqual(hex_val, new_hex)
786+
767787
def test_bfod_clear(self):
768788
"""test clearing out the bloom filter on disk"""
769789
with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as fobj:

0 commit comments

Comments
 (0)