v0.5.3 (#85)

barrust · web-flow · commit 27ae06d0fc89 · 2021-12-29T14:41:23.000-05:00
* additional enhancements; update changelog; version bump
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,19 @@
 # PyProbables Changelog
 
+### Version 0.5.3
+* Additional type hinting
+* Improved format parsing and serialization; [see PR#81](https://github.com/barrust/pyprobables/pull/81). Thanks [@KOLANICH](https://github.com/KOLANICH)
+* Bloom Filters
+    * Added `export_hex` functionality for Bloom Filters on Disk
+    * Export as C header (**\*.h**) for Bloom Filters on Disk and Counting Bloom Filters
+* Added support for more input types for exporting and loading of saved files
+
+
 ### Version 0.5.2
 * Add ability to hash bytes along with strings
 * Make all tests files individually executable from the CLI. Thanks [@KOLANICH](https://github.com/KOLANICH)
 * Added type hints
 
-
 ### Version 0.5.1
 * Bloom Filter:
     * Export as a C header (**\*.h**)
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -0,0 +1 @@
+sphinx>=3.0
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -42,6 +42,9 @@
     "sphinx.ext.todo",
 ]
 
+# Show type hints in the parameter descriptions instead of the signature
+autodoc_typehints = "description"
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ["_templates"]
 
@@ -55,8 +58,8 @@
 master_doc = "index"
 
 # General information about the project.
-project = u"probables"
-copyright = u"2017, Tyler Barrus"
+project = "probables"
+copyright = "2017, Tyler Barrus"
 author = probables.__author__
 
 # The version info for the project you're documenting, acts as replacement for
@@ -137,8 +140,8 @@
     (
         master_doc,
         "pyprobables.tex",
-        u"pyprobables Documentation",
-        u"Tyler Barrus",
+        "pyprobables Documentation",
+        "Tyler Barrus",
         "manual",
     ),
 ]
@@ -148,7 +151,7 @@
 
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
-man_pages = [(master_doc, "pyprobables", u"pyprobables Documentation", [author], 1)]
+man_pages = [(master_doc, "pyprobables", "pyprobables Documentation", [author], 1)]
 
 
 # -- Options for Texinfo output -------------------------------------------
@@ -160,7 +163,7 @@
     (
         master_doc,
         "pyprobables",
-        u"pyprobables Documentation",
+        "pyprobables Documentation",
         author,
         "pyprobables",
         "One line description of project.",
diff --git a/probables/__init__.py b/probables/__init__.py
@@ -27,7 +27,7 @@
 __maintainer__ = "Tyler Barrus"
 __email__ = "barrust@gmail.com"
 __license__ = "MIT"
-__version__ = "0.5.2"
+__version__ = "0.5.3"
 __credits__ = []  # type: ignore
 __url__ = "https://github.com/barrust/pyprobables"
 __bugtrack_url__ = "https://github.com/barrust/pyprobables/issues"
diff --git a/probables/blooms/basebloom.py b/probables/blooms/basebloom.py
@@ -226,7 +226,10 @@ def _set_optimized_params(
     HEADER_STRUCT_BE = Struct(">" + HEADER_STRUCT_FORMAT)
 
     def __load(
-        self, blm_type: str, file: typing.Union[Path, str, IOBase], hash_function: typing.Optional[HashFuncT] = None
+        self,
+        blm_type: str,
+        file: typing.Union[Path, str, IOBase, mmap],
+        hash_function: typing.Optional[HashFuncT] = None,
     ) -> None:
         """load the Bloom Filter from file"""
         # read in the needed information, and then call _set_optimized_params
@@ -290,7 +293,7 @@ def export_hex(self) -> str:
             self.false_positive_rate,
         )
         if self.__blm_type in ["regular", "reg-ondisk"]:
-            bytes_string = hexlify(bytearray(self.bloom)) + hexlify(mybytes)
+            bytes_string = hexlify(bytearray(self.bloom[: self.bloom_length])) + hexlify(mybytes)
         else:
             bytes_string = b""
             for val in self.bloom:
@@ -331,13 +334,13 @@ def export_c_header(self, filename: str) -> None:
             Args:
                 filename (str): The filename to which the Bloom Filter will \
                 be written. """
-        trailer = self.__class__.HEADER_STRUCT_BE.pack(
-            self.estimated_elements,
-            self.elements_added,
-            self.false_positive_rate,
+        data = (
+            "  " + line
+            for line in wrap(", ".join(("0x{:02x}".format(e) for e in bytearray.fromhex(self.export_hex()))), 80)
         )
-        data = ("  " + line for line in wrap(", ".join(("0x{:02x}".format(e) for e in chain(self.bloom, trailer))), 80))
+        bloom_type = "standard BloomFilter" if self.__blm_type in ("regular", "reg-ondisk") else "CountingBloomFilter"
         with open(filename, "w") as file:
+            print("/* BloomFilter Export of a {} */".format(bloom_type), file=file)
             print("#include <inttypes.h>", file=file)
             print("const uint64_t estimated_elements = ", self.estimated_elements, ";", sep="", file=file)
             print("const uint64_t elements_added = ", self.elements_added, ";", sep="", file=file)
diff --git a/probables/blooms/bloom.py b/probables/blooms/bloom.py
@@ -7,6 +7,7 @@
 import mmap
 import os
 import typing
+from pathlib import Path
 from shutil import copyfile
 from struct import calcsize, pack, unpack
 
@@ -251,7 +252,7 @@ class BloomFilterOnDisk(BaseBloom):
 
     def __init__(
         self,
-        filepath: str,
+        filepath: typing.Union[str, Path],
         est_elements: typing.Optional[int] = None,
         false_positive_rate: typing.Optional[float] = None,
         hex_string: typing.Optional[str] = None,
@@ -271,7 +272,7 @@ def __init__(
             pass
 
         self.__file_pointer = None
-        self.__filename = None
+        self.__filename = Path(filepath)
         self.__export_offset = calcsize("Qf")
         self._on_disk = True
 
@@ -315,7 +316,7 @@ def close(self) -> None:
             self.__file_pointer.close()
             self.__file_pointer = None
 
-    def __load(self, filepath: str, hash_function: typing.Optional[HashFuncT] = None):
+    def __load(self, filepath: typing.Union[str, Path], hash_function: typing.Optional[HashFuncT] = None):
         """load the Bloom Filter on disk"""
         # read the file, set the optimal params
         # mmap everything
@@ -333,9 +334,9 @@ def __load(self, filepath: str, hash_function: typing.Optional[HashFuncT] = None
         self.__file_pointer = open(filepath, "r+b")  # type: ignore
         self._bloom = mmap.mmap(self.__file_pointer.fileno(), 0)  # type: ignore
         self._on_disk = True
-        self.__filename = filepath  # type: ignore
+        self.__filename = Path(filepath)
 
-    def export(self, filename: str) -> None:  # type: ignore
+    def export(self, filename: typing.Union[str, Path]) -> None:  # type: ignore
         """ Export to disk if a different location
 
             Args:
@@ -344,9 +345,10 @@ def export(self, filename: str) -> None:  # type: ignore
             Note:
                 Only exported if the filename is not the original filename """
         self.__update()
-        if filename != self.__filename:
+        filename = Path(filename)
+        if filename.name != self.__filename.name:
             # setup the new bloom filter
-            copyfile(self.__filename, filename)
+            copyfile(self.__filename.name, filename.name)
         # otherwise, nothing to do!
 
     def add_alt(self, hashes: HashResultsT) -> None:
@@ -425,15 +427,6 @@ def jaccard_index(self, second: SimpleBloomT) -> typing.Optional[float]:
             return None
         return _tmp_jaccard_index(self, second)
 
-    def export_hex(self) -> str:
-        """ Export to a hex string
-
-            Raises:
-                NotSupportedError: This functionality is currently not \
-                supported """
-        msg = "`export_hex` is currently not supported by the on disk Bloom Filter"
-        raise NotSupportedError(msg)
-
     def _load_hex(self, hex_string: str, hash_function: typing.Optional[HashFuncT] = None):
         """load from hex ..."""
         msg = "Loading from hex_string is currently not supported by the on disk Bloom Filter"
diff --git a/probables/countminsketch/countminsketch.py b/probables/countminsketch/countminsketch.py
@@ -381,7 +381,7 @@ def join(self, second: "CountMinSketch") -> None:
         elif self.elements_added < INT64_T_MIN:
             self.__elements_added = INT64_T_MIN
 
-    def __load(self, file: typing.Union[Path, str, IOBase]):
+    def __load(self, file: typing.Union[Path, str, IOBase, mmap]):
         """load the count-min sketch from file"""
         if not isinstance(file, (IOBase, mmap)):
             file = Path(file)
diff --git a/probables/cuckoo/countingcuckoo.py b/probables/cuckoo/countingcuckoo.py
@@ -13,6 +13,7 @@
 
 from ..exceptions import CuckooFilterFullError
 from ..hashes import KeyT, SimpleHashT
+from ..utilities import MMap
 from .cuckoo import CuckooFilter
 
 
@@ -215,25 +216,29 @@ def _check_if_present(self, idx_1: int, idx_2: int, fingerprint: int) -> typing.
             return idx_2
         return None
 
-    def _load(self, filename: typing.Union[Path, str]) -> None:
+    def _load(self, file: typing.Union[Path, str, IOBase, mmap]) -> None:
         """load a cuckoo filter from file"""
-        with open(filename, "rb") as filepointer:
+        if not isinstance(file, (IOBase, mmap)):
+            file = Path(file)
+            with MMap(file) as filepointer:
+                self._load(filepointer)
+        else:
             offset = calcsize("II")
             int_size = calcsize("II")
-            filepointer.seek(offset * -1, os.SEEK_END)
-            list_size = filepointer.tell()
-            mybytes = unpack("II", filepointer.read(offset))
+            file.seek(offset * -1, os.SEEK_END)
+            list_size = file.tell()
+            mybytes = unpack("II", file.read(offset))
             self._bucket_size = mybytes[0]
             self.__max_cuckoo_swaps = mybytes[1]
             self._cuckoo_capacity = list_size // int_size // self.bucket_size
             self._inserted_elements = 0
             # now pull everything in!
-            filepointer.seek(0, os.SEEK_SET)
+            file.seek(0, os.SEEK_SET)
             self._buckets = list()
             for i in range(self.capacity):
                 self.buckets.append(list())
                 for _ in range(self.bucket_size):
-                    finger, count = unpack("II", filepointer.read(int_size))
+                    finger, count = unpack("II", file.read(int_size))
                     if finger > 0:
                         ccb = CountingCuckooBin(finger, count)
                         self.buckets[i].append(ccb)
diff --git a/probables/cuckoo/cuckoo.py b/probables/cuckoo/cuckoo.py
@@ -319,14 +319,17 @@ def _insert_fingerprint(self, fingerprint, idx_1, idx_2):
         # if we got here we have an error... we might need to know what is left
         return fingerprint
 
-    def _load(self, filename: typing.Union[Path, str]) -> None:
+    def _load(self, file: typing.Union[Path, str, IOBase, mmap]) -> None:
         """load a cuckoo filter from file"""
-        filename = Path(filename)
-        with MMap(filename) as d:
-            self._parse_footer(d)
+        if not isinstance(file, (IOBase, mmap)):
+            file = Path(file)
+            with MMap(file) as filepointer:
+                self._load(filepointer)
+        else:
+            self._parse_footer(file)  # type: ignore
             self._inserted_elements = 0
             # now pull everything in!
-            self._parse_buckets(d)
+            self._parse_buckets(file)  # type: ignore
 
     SINGLE_INT_C = "I"
     SINGLE_INT_SIZE = calcsize(SINGLE_INT_C)
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pyprobables"
-version = "0.5.2"
+version = "0.5.3"
 description = "Probabilistic data structures in Python"
 authors = ["Tyler Barrus <barrust@gmail.com>"]
 license = "MIT"
diff --git a/tests/bloom_test.py b/tests/bloom_test.py
@@ -306,17 +306,18 @@ def test_bf_export_c_header(self):
                 data = fobj.readlines()
         data = [x.strip() for x in data]
 
-        self.assertEqual("#include <inttypes.h>", data[0])
-        self.assertEqual("const uint64_t estimated_elements = {};".format(blm.estimated_elements), data[1])
-        self.assertEqual("const uint64_t elements_added = {};".format(blm.elements_added), data[2])
-        self.assertEqual("const float false_positive_rate = {};".format(blm.false_positive_rate), data[3])
-        self.assertEqual("const uint64_t number_bits = {};".format(blm.number_bits), data[4])
-        self.assertEqual("const unsigned int number_hashes = {};".format(blm.number_hashes), data[5])
-        self.assertEqual("const unsigned char bloom[] = {", data[6])
+        self.assertEqual("/* BloomFilter Export of a standard BloomFilter */", data[0])
+        self.assertEqual("#include <inttypes.h>", data[1])
+        self.assertEqual("const uint64_t estimated_elements = {};".format(blm.estimated_elements), data[2])
+        self.assertEqual("const uint64_t elements_added = {};".format(blm.elements_added), data[3])
+        self.assertEqual("const float false_positive_rate = {};".format(blm.false_positive_rate), data[4])
+        self.assertEqual("const uint64_t number_bits = {};".format(blm.number_bits), data[5])
+        self.assertEqual("const unsigned int number_hashes = {};".format(blm.number_hashes), data[6])
+        self.assertEqual("const unsigned char bloom[] = {", data[7])
         self.assertEqual("};", data[-1])
 
         # rebuild the hex version!
-        new_hex = "".join([x.strip().replace("0x", "") for x in " ".join(data[7:-1]).split(",")])
+        new_hex = "".join([x.strip().replace("0x", "") for x in " ".join(data[8:-1]).split(",")])
         self.assertEqual(hex_val, new_hex)
 
     def test_bf_load_invalid_hex(self):
@@ -722,26 +723,14 @@ def test_bfod_bytes(self):
 
     def test_bfod_export_hex(self):
         """test that page error is thrown correctly"""
-
-        def runner():
-            """runner"""
-            blm = BloomFilterOnDisk(fobj.name, 10, 0.05)
-            blm.export_hex()
-
-        with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as fobj:
-            self.assertRaises(NotSupportedError, runner)
-
-    def test_bfod_export_hex_msg(self):
-        """test that page error is thrown correctly"""
+        hex_val = "6da491461a6bba4d000000000000000a000000000000000a3d4ccccd"
         with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as fobj:
-            blm = BloomFilterOnDisk(fobj.name, 10, 0.05)
-            try:
-                blm.export_hex()
-            except NotSupportedError as ex:
-                msg = "`export_hex` is currently not supported by the on disk Bloom Filter"
-                self.assertEqual(str(ex), msg)
-            else:
-                self.assertEqual(True, False)
+            blm = BloomFilterOnDisk(fobj.name, est_elements=10, false_positive_rate=0.05)
+            for i in range(0, 10):
+                tmp = "this is a test {0}".format(i)
+                blm.add(tmp)
+            hex_out = blm.export_hex()
+            self.assertEqual(hex_out, hex_val)
 
     def test_bfod_load_hex(self):
         """test that page error is thrown correctly"""
@@ -764,6 +753,37 @@ def test_bfod_load_hex_msg(self):
             else:
                 self.assertEqual(True, False)
 
+    def test_bfod_export_c_header(self):
+        """test exporting a c header"""
+        hex_val = "6da491461a6bba4d000000000000000a000000000000000a3d4ccccd"
+        with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as fobj:
+            blm = BloomFilterOnDisk(fobj.name, est_elements=10, false_positive_rate=0.05)
+            for i in range(0, 10):
+                tmp = "this is a test {0}".format(i)
+                blm.add(tmp)
+            with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as fobj:
+                blm.export_c_header(fobj.name)
+
+                # now load the file, parse it and do some tests!
+                with open(fobj.name, "r") as fobj:
+                    data = fobj.readlines()
+
+        data = [x.strip() for x in data]
+
+        self.assertEqual("/* BloomFilter Export of a standard BloomFilter */", data[0])
+        self.assertEqual("#include <inttypes.h>", data[1])
+        self.assertEqual("const uint64_t estimated_elements = {};".format(blm.estimated_elements), data[2])
+        self.assertEqual("const uint64_t elements_added = {};".format(blm.elements_added), data[3])
+        self.assertEqual("const float false_positive_rate = {};".format(blm.false_positive_rate), data[4])
+        self.assertEqual("const uint64_t number_bits = {};".format(blm.number_bits), data[5])
+        self.assertEqual("const unsigned int number_hashes = {};".format(blm.number_hashes), data[6])
+        self.assertEqual("const unsigned char bloom[] = {", data[7])
+        self.assertEqual("};", data[-1])
+
+        # rebuild the hex version!
+        new_hex = "".join([x.strip().replace("0x", "") for x in " ".join(data[8:-1]).split(",")])
+        self.assertEqual(hex_val, new_hex)
+
     def test_bfod_clear(self):
         """test clearing out the bloom filter on disk"""
         with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as fobj:
diff --git a/tests/countingbloom_test.py b/tests/countingbloom_test.py