0.0.4 (#12)

barrust · web-flow · commit e54656b2698b · 2017-07-15T12:51:59.000-04:00
* more documentation
* easier query type setting to count-mean and count-mean-min sketches
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,8 @@
     * Bloom Filter
     * Bloom Filter (on disk)
     * Count-Min Sketch
+    * Count-Mean Sketch
+    * Count-Mean-Min Sketch
     * Heavy Hitters
     * Stream Threshold
 * Import and export of each
diff --git a/README.rst b/README.rst
@@ -5,6 +5,7 @@ PyProbables
 is to provide the developer with a pure-python implementation of common
 probabilistic data-structures to use in their work.
 
+
 Installation
 ------------------
 
@@ -25,14 +26,14 @@ To install `pyprobables`, simply clone the `repository on GitHub
 
 `pyprobables` supports python versions 2.7 and 3.3 - 3.6
 
+
 API Documentation
 ---------------------
 
-Documentation is currently under development. The documentation of
-the latest release will be hosted on
-`readthedocs.io <http://pyprobables.readthedocs.io/en/stable/?>`__
+The documentation is hosted on
+`readthedocs.io <http://pyprobables.readthedocs.io/en/latest/code.html#api>`__
 
-Once completed, you can build the documentation yourself by running:
+You can build the documentation yourself by running:
 
 ::
 
@@ -53,6 +54,7 @@ downloaded folder:
   $ python setup.py test
 
 
+
 Quickstart
 ------------------
 
@@ -76,8 +78,11 @@ Import pyprobables and setup a Count-Min Sketch:
     >>> cms.add('google.com')  # should return 1
     >>> cms.add('facebook.com', 25)  # insert 25 at once; should return 25
 
-See the documentation for other data structures available and for more
-examples!
+See the `API documentation <http://pyprobables.readthedocs.io/en/latest/code.html#api>`__
+for other data structures available and the
+`quickstart page <http://pyprobables.readthedocs.io/en/latest/quickstart.html#quickstart>`__
+for more examples!
+
 
 Changelog
 ------------------
diff --git a/docs/source/code.rst b/docs/source/code.rst
@@ -4,12 +4,12 @@ pyprobables API
 ***************
 
 Here you can find the full developer API for the pyprobables project.
+pyprobables provides a suite of probabilistic data-structures to be used
+in data analytics and data science projects.
 
-Contents:
-=========
 
 .. toctree::
-   :maxdepth: 3
+   :maxdepth: 4
 
    code
 
@@ -20,8 +20,10 @@ Data Structures and Classes
 Bloom Filters
 -------------
 
-Bloom Filters are a class of probabilistic data structures that guarantee a
-zero percent false negative rate and a predetermined false positive rate.
+Bloom Filters are a class of probabilistic data structures used for set
+operations. Bloom Filters guarantee a zero percent false negative rate
+and a predetermined false positive rate.
+
 
 BloomFilter
 +++++++++++++++++++++++++++++++
@@ -41,33 +43,61 @@ BloomFilterOnDisk
 Count-Min Sketches
 ------------------
 
+Count-Min Sketches are a class of probabilistic data structures designed to
+count the number of occurrences of data elements in data streams.
+
+
 CountMinSketch
 +++++++++++++++++++++++++++++++
 
 .. autoclass:: probables.CountMinSketch
     :members:
 
 
+CountMeanSketch
++++++++++++++++++++++++++++++++
+
+.. autoclass:: probables.CountMeanSketch
+    :members:
+
+
+CountMeanMinSketch
++++++++++++++++++++++++++++++++
+
+.. autoclass:: probables.CountMeanMinSketch
+    :members:
+
+
 HeavyHitters
 +++++++++++++++++++++++++++++++
 
 .. autoclass:: probables.HeavyHitters
     :members:
     :inherited-members:
 
+
 StreamThreshold
 +++++++++++++++++++++++++++++++
 
 .. autoclass:: probables.StreamThreshold
     :members:
     :inherited-members:
 
+
 Exceptions
 ===============================
 
 .. automodule:: probables.exceptions
     :members:
 
+
+Hashing Functions
+===============================
+
+.. automodule:: probables.hashes
+    :members:
+
+
 Indices and tables
 ==================
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -5,6 +5,7 @@ Read More
 ==================
 
 * :ref:`api`
+* :ref:`quickstart`
 * :ref:`genindex`
 * :ref:`modindex`
 * :ref:`search`
diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
@@ -1,14 +1,187 @@
 .. _quickstart:
 
 pyprobables Quickstart
-======================
+######################
 
 .. toctree::
-   :maxdepth: 2
-   :caption: Quickstart:
+   :maxdepth: 5
 
    quickstart
 
 
 Install
-^^^^^^^
+**************************
+
+The easiest method of installing pyprobables is by using the pip package
+manager:
+
+Pip Installation:
+
+::
+
+    $ pip install pyprobables
+
+
+API Documentation
+**************************
+
+The full API documentation for the pyprobables package:  :ref:`api`
+
+Example Usage
+**************************
+
+Bloom Filters
+==========================
+
+Bloom Filters provide set operations on large datasets while being small in
+memory footprint. They provide a zero percent false negative rate and a
+predetermined, or desired, false positive rate.
+`more information <https://en.wikipedia.org/wiki/Bloom_filter>`__
+
+Import, Initialize, and Train
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code:: python
+
+    >>> from probables import (BloomFilter)
+    >>> blm = BloomFilter(est_elements=1000000, false_positive_rate=0.05)
+    >>> with open('war_and_peace.txt', 'r') as fp:
+    >>>     for line in fp:
+    >>>         for word in line.split():
+    >>>             blm.add(word.lower())  # add each word to the bloom filter!
+    >>> # end reading in the file
+
+Query the Bloom Filter
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: python
+
+    >>> words_to_check = ['step', 'borzoi', 'diametrically', 'fleches', 'rain']
+    >>> for word in words_to_check:
+    >>>     blm.check(word)
+
+
+Export the Bloom Filter
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: python
+
+    >>> blm.export('war_and_peace_bloom.blm')
+
+
+Import a Bloom Filter
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: python
+
+    >>> blm2 = BloomFilter(filepath='war_and_peace_bloom.blm')
+    >>> print(blm2.check('sutler'))
+
+
+Other Bloom Filters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Bloom Filter on Disk
+"""""""""""""""""""""""""""""""""""""""""""""""
+
+The **Bloom Filter on Disk** is a specialized version of the standard
+Bloom Filter that is run directly off of disk instead of in memory. This
+can be useful for very large Bloom Filters or when needing to access many
+Blooms that are exported to file.
+
+
+Counting Bloom Filter
+"""""""""""""""""""""""""""""""""""""""""""""""
+
+**Counting Bloom Filters** are another specialized version of the standard
+Bloom Filter. Instead of using a bit array to track added elements, a
+Counting Bloom uses integers to track the number of times the element has
+been added. **currently not supported; planned**
+
+
+Count-Min Sketch
+==========================
+
+Count-Min Sketches, and their derivatives, are good for counting the number of
+occurrences of an element in streaming data while not needing to retain all the
+data elements. The result is a probabilistic count of elements inserted into
+the data structure. It will always provide a **maximum** number of times
+encountered. Notice that the result may be **more** than the true number
+of times it was inserted, but never fewer.
+`more information <https://en.wikipedia.org/wiki/Count%E2%80%93min_sketch>`__
+
+
+Import, Initialize, and Train
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code:: python
+
+    >>> from probables import (CountMinSketch)
+    >>> cms = CountMinSketch(width=100000, depth=5)
+    >>> with open('war_and_peace.txt', 'r') as fp:
+    >>>     for line in fp:
+    >>>         for word in line.split():
+    >>>             cms.add(word.lower())  # add each to the count-min sketch!
+
+
+Query the Count-Min Sketch
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code:: python
+
+    >>> words_to_check = ['step', 'borzoi', 'diametrically', 'fleches', 'rain']
+    >>> for word in words_to_check:
+    >>>     print(cms.check(word))  # prints: 80, 17, 1, 20, 25
+
+
+Export Count-Min Sketch
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code:: python
+
+    >>> cms.export('war_and_peace.cms')
+
+
+Import a Count-Min Sketch
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: python
+
+    >>> cms2 = CountMinSketch(filepath='war_and_peace.cms')
+    >>> print(cms2.check('fleches'))  # prints 20
+
+
+Other Count-Min Sketches
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Count-Mean Sketch and Count-Mean-Min Sketch
+"""""""""""""""""""""""""""""""""""""""""""""""
+
+**Count-Mean Sketch** and **Count-Mean-Min Sketch** are identical to the
+Count-Min Sketch for the data structure but both differ in the method of
+calculating the number of times an element has been inserted. These are
+currently supported by specifying at query time which method is desired
+or by initializing to the desired class: CountMeanSketch or CountMeanMinSketch.
+
+
+Heavy Hitters
+"""""""""""""""""""""""""""""""""""""""""""""""
+
+**Heavy Hitters** is a version of the Count-Min Sketch that tracks those
+elements that are seen most often. Beyond the normal initialization parameters
+one only needs to specify the number of heavy hitters to track.
+
+
+Stream Threshold
+"""""""""""""""""""""""""""""""""""""""""""""""
+
+**Stream Threshold** is another version of the Count-Min Sketch similar to the
+Heavy Hitters. The main difference is that there is a threshold for
+including an element to be tracked instead of tracking a certain number of
+elements.
+
+
+Indices and tables
+==================
+
+* :ref:`home`
+* :ref:`api`
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/probables/__init__.py b/probables/__init__.py
@@ -1,17 +1,19 @@
 ''' pyprobables module '''
 from __future__ import (unicode_literals, absolute_import, print_function)
 from .blooms import (BloomFilter, BloomFilterOnDisk)
-from .countminsketch import (CountMinSketch, HeavyHitters, StreamThreshold)
+from .countminsketch import (CountMinSketch, HeavyHitters, StreamThreshold,
+                             CountMeanSketch, CountMeanMinSketch)
 from .exceptions import (InitializationError, NotSupportedError,
                          ProbablesBaseException)
 
 __author__ = 'Tyler Barrus'
 __maintainer__ = 'Tyler Barrus'
 __email__ = 'barrust@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.0.3'
+__version__ = '0.0.4'
 __credits__ = []
 __url__ = 'https://github.com/barrust/pyprobables'
 
 __all__ = ['BloomFilter', 'BloomFilterOnDisk', 'CountMinSketch',
-           'HeavyHitters', 'StreamThreshold']
+           'HeavyHitters', 'StreamThreshold', 'CountMeanSketch',
+           'CountMeanMinSketch']
diff --git a/probables/countminsketch/__init__.py b/probables/countminsketch/__init__.py
@@ -1,6 +1,8 @@
 ''' count-min sketch submodule '''
 from __future__ import (unicode_literals, absolute_import, print_function)
-from .countminsketch import (CountMinSketch, HeavyHitters, StreamThreshold)
+from .countminsketch import (CountMinSketch, HeavyHitters, StreamThreshold,
+                             CountMeanSketch, CountMeanMinSketch)
 
 
-__all__ = ['CountMinSketch', 'HeavyHitters', 'StreamThreshold']
+__all__ = ['CountMinSketch', 'HeavyHitters', 'StreamThreshold',
+           'CountMeanSketch', 'CountMeanMinSketch']
diff --git a/probables/countminsketch/countminsketch.py b/probables/countminsketch/countminsketch.py
diff --git a/tests/countminsketch_test.py b/tests/countminsketch_test.py