From 9126e4d01d7a64bdd86f466a094bd72fb3343a25 Mon Sep 17 00:00:00 2001 From: pavankm Date: Sat, 30 Dec 2017 14:04:22 -0600 Subject: [PATCH 01/12] check for pyQt5 version 5.9.3 or above --- notebooks/Sampling and Labeling.ipynb | 10 ---------- py_labeler/labeler/labeler.py | 5 ++++- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/notebooks/Sampling and Labeling.ipynb b/notebooks/Sampling and Labeling.ipynb index 4d04403..e0a99d9 100644 --- a/notebooks/Sampling and Labeling.ipynb +++ b/notebooks/Sampling and Labeling.ipynb @@ -15,16 +15,6 @@ "First, we need to import py_entitymatching package and other libraries as follows:" ] }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "sys.path.append('/Users/pradap/Documents/Research/Python-Package/anhaid/py_labeler')" - ] - }, { "cell_type": "code", "execution_count": 2, diff --git a/py_labeler/labeler/labeler.py b/py_labeler/labeler/labeler.py index 438a0fd..6ecd7c5 100644 --- a/py_labeler/labeler/labeler.py +++ b/py_labeler/labeler/labeler.py @@ -10,9 +10,10 @@ from PyQt5.QtWebChannel import QWebChannel from PyQt5.QtWebEngineWidgets import QWebEngineView, QWebEnginePage, QWebEngineScript from PyQt5.QtWidgets import QApplication + from PyQt5.QtCore import QT_VERSION_STR except ImportError: raise ImportError('PyQt5 is not installed. Please install PyQt5 to use ' - 'GUI related functions in py_entitymatching.') + 'GUI related functions in py_labeler.') from py_labeler.labeler.controller.FilterController import FilterController from py_labeler.labeler.controller.LabelUpdateController import LabelUpdateController @@ -175,6 +176,8 @@ def label_table(df, label_column_name): """ if sys.version_info < (3, 5): raise ImportError("Python 3.3 or greater is required") + if QT_VERSION_STR < '5.9.3': + raise ImportError("PyQt 5.9.3 or greater is required") _validate_inputs(df, label_column_name) df = df.copy(deep=True) From 75a7d5940686fdf2ea6be10cb60dd89814d89896 Mon Sep 17 00:00:00 2001 From: kvpradap Date: Sat, 30 Dec 2017 14:49:46 -0600 Subject: [PATCH 02/12] [CLN] Removed unnecessary files. --- conda.recipe/meta.yaml | 4 +- docs/contributing.rst | 8 +- docs/make_copy.sh | 9 - notebooks/Sampling and Labeling.ipynb | 64 +- py_labeler/catalog/__init__.py | 0 py_labeler/catalog/catalog.py | 127 --- py_labeler/catalog/catalog_manager.py | 1437 ------------------------- py_labeler/io/__init__.py | 0 py_labeler/io/parsers.py | 465 -------- requirements.yml | 2 +- 10 files changed, 60 insertions(+), 2056 deletions(-) delete mode 100644 docs/make_copy.sh delete mode 100644 py_labeler/catalog/__init__.py delete mode 100644 py_labeler/catalog/catalog.py delete mode 100644 py_labeler/catalog/catalog_manager.py delete mode 100644 py_labeler/io/__init__.py delete mode 100644 py_labeler/io/parsers.py diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index c826073..95cadec 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -31,7 +31,7 @@ requirements: test: # Python imports imports: - - py_entitymatching + - py_labeler # commands: # You can put test commands to be run here. Use this to test that the @@ -46,7 +46,7 @@ test: # - nose about: - home: https://sites.google.com/site/anhaidgroup/projects/magellan/py_entitymatching + home: https://sites.google.com/site/anhaidgroup/projects/magellan/py_labeler license: BSD License summary: 'Python library for entity matching.' diff --git a/docs/contributing.rst b/docs/contributing.rst index 572815e..4660aaa 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -26,7 +26,7 @@ in the documentation and thinking 'this can be improved'...you can do something about it! Feel free to ask questions on the `mailing list -`_ +`_ Bug reports and enhancement requests ==================================== @@ -51,10 +51,10 @@ Bug reports must: ... ``` -#. Include the full version string of *magellan_labeler*. You can find the version as follows:: +#. Include the full version string of *py_labeler*. You can find the version as follows:: - >>> import magellan_labeler as em - >>> em.__version__ + >>> import py_labeler as pl + >>> pl.__version__ #. Explain why the current behavior is wrong/not desired and what you expect instead. diff --git a/docs/make_copy.sh b/docs/make_copy.sh deleted file mode 100644 index 4ed1002..0000000 --- a/docs/make_copy.sh +++ /dev/null @@ -1,9 +0,0 @@ -cd /Users/pradap/Documents/Research/Python-Package/anhaid/py_entitymatching/docs -make clean html -cd _build/html -scp -r * pradap@trinity.cs.wisc.edu:~/public/html-www/magellan/user_manual/multi_page -cd /Users/pradap/Documents/Research/Python-Package/anhaid/py_entitymatching/docs -make clean singlehtml -cd _build/singlehtml -scp -r * pradap@trinity.cs.wisc.edu:~/public/html-www/magellan/user_manual/single_page -cd /Users/pradap/Documents/Research/Python-Package/anhaid/py_entitymatching/docs diff --git a/notebooks/Sampling and Labeling.ipynb b/notebooks/Sampling and Labeling.ipynb index e0a99d9..18d7631 100644 --- a/notebooks/Sampling and Labeling.ipynb +++ b/notebooks/Sampling and Labeling.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2017-12-30T18:12:19.100155Z", @@ -29,24 +29,24 @@ "# Import py_labeler package\n", "import py_labeler as labeler\n", "import os\n", - "import pandas as pd\n" + "import pandas as pd" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Get the datasets directory\n", "datasets_dir = labeler.get_install_path() + os.sep + 'tests' + os.sep + 'test_datasets'\n", "\n", - "path_C = datasets_dir + os.sep + 'C1.csv'\n" + "path_C = datasets_dir + os.sep + 'C1.csv'" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -171,7 +171,7 @@ "4 1988 Joseph Kuan 94122 1982 " ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -182,7 +182,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -191,7 +191,7 @@ "14" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -209,9 +209,51 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], + "source": [ + "import PyQt5" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'5.6.2'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "PyQt5.QtCore.QT_VERSION_STR" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "ename": "ImportError", + "evalue": "PyQt 5.9.3 or greater is required", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Label the data set\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;31m# Specify the name for the label column\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mG\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlabeler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlabel_table\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'gold_label'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/miniconda3/lib/python3.5/site-packages/py_labeler-0.1.0-py3.5.egg/py_labeler/labeler/labeler.py\u001b[0m in \u001b[0;36mlabel_table\u001b[0;34m(df, label_column_name)\u001b[0m\n\u001b[1;32m 178\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mImportError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Python 3.3 or greater is required\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mQT_VERSION_STR\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;34m'5.9.3'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 180\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mImportError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"PyQt 5.9.3 or greater is required\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 181\u001b[0m \u001b[0m_validate_inputs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabel_column_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 182\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdeep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mImportError\u001b[0m: PyQt 5.9.3 or greater is required" + ] + } + ], "source": [ "# Label the data set\n", "# Specify the name for the label column\n", @@ -503,7 +545,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.4" + "version": "3.5.2" } }, "nbformat": 4, diff --git a/py_labeler/catalog/__init__.py b/py_labeler/catalog/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/py_labeler/catalog/catalog.py b/py_labeler/catalog/catalog.py deleted file mode 100644 index 0a220a9..0000000 --- a/py_labeler/catalog/catalog.py +++ /dev/null @@ -1,127 +0,0 @@ -# coding=utf-8 -import logging - -logger = logging.getLogger(__name__) - - -class Singleton(object): - """ - A non-thread-safe helper class to ease implementing singletons. - This should be used as a decorator -- not a metaclass -- to the - class that should be a singleton. - The decorated class can define one `__init__` function that - takes only the `self` argument. Other than that, there are - no restrictions that apply to the decorated class. - To get the singleton instance, use the `Instance` method. Trying - to use `__call__` will result in a `TypeError` being raised. - Limitations: The decorated class cannot be inherited from. - """ - - def __init__(self, decorated): - self._decorated = decorated - - # noinspection PyPep8Naming - def Instance(self): - """ - Returns the singleton instance. Upon its first call, it creates a - new instance of the decorated class and calls its `__init__` method. - On all subsequent calls, the already created instance is returned. - """ - try: - return self._instance - except AttributeError: - # noinspection PyAttributeOutsideInit - self._instance = self._decorated() - return self._instance - - def __call__(self): - raise TypeError('Singletons must be accessed through `Instance()`.') - - def __instancecheck__(self, inst): - return isinstance(inst, self._decorated) - - -@Singleton -class Catalog(object): - """ - Class to store and retrieve catalog information - """ - - def __init__(self): - self.properties_catalog = {} - - def init_properties_for_id(self, obj_id): - self.properties_catalog[obj_id] = {} - return True - - def init_properties(self, df): - df_id = id(df) - self.init_properties_for_id(df_id) - - def get_property_for_id(self, obj_id, name): - d = self.properties_catalog[obj_id] - return d[name] - - def get_property(self, df, name): - df_id = id(df) - return self.get_property_for_id(df_id, name) - - def set_property_for_id(self, obj_id, name, value): - d = self.properties_catalog[obj_id] - d[name] = value - self.properties_catalog[obj_id] = d - return True - - def set_property(self, df, name, value): - df_id = id(df) - return self.set_property_for_id(df_id, name, value) - - def get_all_properties_for_id(self, obj_id): - d = self.properties_catalog[obj_id] - return d - - def get_all_properties(self, df): - df_id = id(df) - return self.get_all_properties_for_id(df_id) - - def del_property_for_id(self, obj_id, name): - d = self.properties_catalog[obj_id] - del d[name] - self.properties_catalog[obj_id] = d - return True - - def del_property(self, df, name): - df_id = id(df) - return self.del_property_for_id(df_id, name) - - def del_all_properties_for_id(self, obj_id): - del self.properties_catalog[obj_id] - return True - - def del_all_properties(self, df): - df_id = id(df) - return self.del_all_properties_for_id(df_id) - - def get_catalog(self): - return self.properties_catalog - - def del_catalog(self): - self.properties_catalog = {} - return True - - def get_catalog_len(self): - return len(self.properties_catalog) - - def is_catalog_empty(self): - return len(self.properties_catalog) == 0 - - def is_df_info_present_in_catalog(self, df): - return id(df) in self.properties_catalog - - def is_property_present_for_id(self, obj_id, name): - d = self.properties_catalog[obj_id] - return name in d - - def is_property_present_for_df(self, df, name): - df_id = id(df) - return self.is_property_present_for_id(df_id, name) diff --git a/py_labeler/catalog/catalog_manager.py b/py_labeler/catalog/catalog_manager.py deleted file mode 100644 index d344a5d..0000000 --- a/py_labeler/catalog/catalog_manager.py +++ /dev/null @@ -1,1437 +0,0 @@ -# coding=utf-8 -""" -This module contains wrapper functions for the catalog. -""" -import logging - -import pandas as pd -import six - -import py_labeler.utils.catalog_helper as ch -from py_labeler.catalog.catalog import Catalog -from py_labeler.utils.validation_helper import validate_object_type - -logger = logging.getLogger(__name__) - - -def get_property(data_frame, property_name): - """ - Gets the value of a property (with the given property name) for a pandas - DataFrame from the catalog. - - Args: - data_frame (DataFrame): The DataFrame for which the property should be - retrieved. - property_name (string): The name of the property that should be - retrieved. - - Returns: - A Python object (typically a string or a pandas DataFrame depending - on the property name) is returned. - - Raises: - AssertionError: If `data_frame` is not of type pandas - DataFrame. - AssertionError: If `property_name` is not of type string. - KeyError: If `data_frame` information is not present in the catalog. - KeyError: If requested property for the `data_frame` is not present - in the catalog. - - Examples: - >>> import magellan_labeler as pl - >>> import pandas as pd - >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> pl.set_key(A, 'id') - >>> pl.get_property(A, 'key') - # id - """ - # Validate input parameters - - # # The input object should be of type pandas DataFrame - validate_object_type(data_frame, pd.DataFrame) - - # # The property name should be of type string - validate_object_type(property_name, six.string_types, error_prefix='Property name') - - # Get the catalog instance, this is imported here because this object - # used to validate the presence of a DataFrame in the catalog, and the - # presence of requested metadata in the catalog. - catalog = Catalog.Instance() - - # Check for the present of input DataFrame in the catalog. - if not catalog.is_df_info_present_in_catalog(data_frame): - logger.error('DataFrame information is not present in the catalog') - raise KeyError('DataFrame information is not present in the catalog') - - # Check if the requested property is present in the catalog. - if not catalog.is_property_present_for_df(data_frame, property_name): - logger.error( - 'Requested metadata ( %s ) for the given DataFrame is not ' - 'present in the catalog' % property_name) - raise KeyError( - 'Requested metadata ( %s ) for the given DataFrame is not ' - 'present in the catalog' % property_name) - - # Return the requested property for the input DataFrame - return catalog.get_property(data_frame, property_name) - - -def set_property(data_frame, property_name, property_value): - """ - Sets the value of a property (with the given property name) for a pandas - DataFrame in the catalog. - - Args: - data_frame (DataFrame): The DataFrame for which the property must be - set. - property_name (string): The name of the property to be set. - property_value (object): The value of the property to be set. This is - typically a string (such as key) or pandas DataFrame (such as - ltable, rtable). - - Returns: - A Boolean value of True is returned if the update was successful. - - Raises: - AssertionError: If `data_frame` is not of type pandas - DataFrame. - AssertionError: If `property_name` is not of type string. - - Examples: - >>> import magellan_labeler as pl - >>> import pandas as pd - >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> pl.set_property(A, 'key', 'id') - >>> pl.get_property(A, 'key') - # id - >>> pl.get_key(A) - # id - - - Note: - If the input DataFrame is not present in the catalog, this function - will create an entry in the catalog and set the given property. - - """ - # Validate input parameters - - # # The input object should be of type pandas DataFrame - validate_object_type(data_frame, pd.DataFrame) - - # # The property name should be of type string - validate_object_type(property_name, six.string_types, error_prefix='Property name') - - # Get the catalog instance - catalog = Catalog.Instance() - - # Check if the DataFrame information is present in the catalog. If the - # information is not present, then initialize an entry for that DataFrame - # in the catalog. - if not catalog.is_df_info_present_in_catalog(data_frame): - catalog.init_properties(data_frame) - - # Set the property in the catalog, and relay the return value from the - # underlying catalog object's function. The return value is typically - # True if the update was successful. - return catalog.set_property(data_frame, property_name, property_value) - - -def init_properties(data_frame): - """ - Initializes properties for a pandas DataFrame in the catalog. - - Specifically, this function creates an entry in the catalog and sets its - properties to empty. - - Args: - data_frame (DataFrame): DataFrame for which the properties must be - initialized. - - Returns: - A Boolean value of True is returned if the initialization was - successful. - - """ - # Validate input parameters - - # # The input object should be of type pandas DataFrame - validate_object_type(data_frame, pd.DataFrame) - - # Get the catalog instance - catalog = Catalog.Instance() - - # Initialize the property in the catalog. - # Relay the return value from the underlying catalog object's function. - # The return value is typically True if the initialization was successful - return catalog.init_properties(data_frame) - - -def get_all_properties(data_frame): - """ - Gets all the properties for a pandas DataFrame object from the catalog. - - Args: - data_frame (DataFrame): DataFrame for which the properties must be - retrieved. - - Returns: - A dictionary containing properties for the input pandas DataFrame. - - Raises: - AttributeError: If the input object is not of type pandas DataFrame. - KeyError: If the information about DataFrame is not present in the - catalog. - - - """ - # Validate input parameters - # # The input object is expected to be of type DataFrame - # # The input object should be of type pandas DataFrame - validate_object_type(data_frame, pd.DataFrame) - - # Get the catalog instance - catalog = Catalog.Instance() - - # Check if the DataFrame information is present in the catalog. If not - # raise an error. - if not catalog.is_df_info_present_in_catalog(data_frame): - logger.error('DataFrame information is not present in the catalog') - raise KeyError('DataFrame information is not present in the catalog') - - # Retrieve the properties for the DataFrame from the catalog and return - # it back to the user. - return catalog.get_all_properties(data_frame) - - -def del_property(data_frame, property_name): - """ - Deletes a property for a pandas DataFrame from the catalog. - - Args: - data_frame (DataFrame): The input DataFrame for which a property must be - deleted from the catalog. - - property_name (string): The name of the property that should be deleted. - - Returns: - A Boolean value of True is returned if the deletion was successful. - - Raises: - AssertionError: If `data_frame` is not of type pandas DataFrame. - AssertionError: If `property_name` is not of type string. - KeyError: If `data_frame` information is not present in the catalog. - KeyError: If requested property for the DataFrame is not present - in the catalog. - - Examples: - >>> import magellan_labeler as pl - >>> import pandas as pd - >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> pl.set_property(A, 'key', 'id') - >>> pl.get_property(A, 'key') - # id - >>> pl.del_property(A, 'key') - >>> pl.is_property_present_for_df(A, 'key') - # False - - """ - # Validate input parameters - - # # The input object should be of type pandas DataFrame - validate_object_type(data_frame, pd.DataFrame) - - # # The property name should be of type string - validate_object_type(property_name, six.string_types, error_prefix='Property name') - - # Get the catalog instance - catalog = Catalog.Instance() - - # Check if the DataFrame information is present in the catalog, if not - # raise an error. - if not catalog.is_df_info_present_in_catalog(data_frame): - logger.error('DataFrame information is not present in the catalog') - raise KeyError('DataFrame information is not present in the catalog') - - # Check if the requested property name to be deleted is present for the - # DataFrame in the catalog, if not raise an error. - if not catalog.is_property_present_for_df(data_frame, property_name): - logger.error('Requested metadata ( %s ) for the given DataFrame is ' - 'not present in the catalog' % property_name) - raise KeyError('Requested metadata ( %s ) for the given DataFrame is ' - 'not present in the catalog' % property_name) - - # Delete the property using the underlying catalog object and relay the - # return value. Typically the return value is True if the deletion was - # successful - return catalog.del_property(data_frame, property_name) - - -def del_all_properties(data_frame): - """ - Deletes all properties for a DataFrame from the catalog. - - Args: - data_frame (DataFrame): Input DataFrame for which all the properties - must be deleted from the catalog. - - Returns: - A boolean of True is returned if the deletion was successful - from the catalog. - - Raises: - AssertionError: If the `data_frame` is not of type pandas DataFrame. - KeyError: If the DataFrame information is not present in the catalog. - - Note: - This method's functionality is not as same as init_properties. Here - the DataFrame's entry will be removed from the catalog, - but init_properties will add (if the DataFrame is not present in the - catalog) and initialize its properties to an empty object ( - specifically, an empty Python dictionary). - """ - # Validations of input parameters - # # The input object is expected to be of type pandas DataFrame - if not isinstance(data_frame, pd.DataFrame): - logger.error('Input object is not of type pandas data frame') - raise AssertionError('Input object is not of type pandas data frame') - - # Get the catalog instance - catalog = Catalog.Instance() - - # Check if the DataFrame is present in the catalog. If not, raise an error - if not catalog.is_df_info_present_in_catalog(data_frame): - logger.error('DataFrame information is not present in the catalog') - raise KeyError('DataFrame information is not present in the catalog') - - # Call the underlying catalog object's function to delete the properties - # and relay its return value - return catalog.del_all_properties(data_frame) - - -def get_catalog(): - """ - Gets the catalog information for the current session. - - Returns: - A Python dictionary containing the catalog information. - - Specifically, the dictionary contains the Python identifier of a - DataFrame (obtained by id(DataFrame object)) as the key - and their properties as value. - - Examples: - >>> import magellan_labeler as pl - >>> catalog = pl.get_catalog() - - """ - # Get the catalog instance - catalog = Catalog.Instance() - # Call the underlying catalog object's function to get the catalog. Relay - # the return value from the delegated function. - return catalog.get_catalog() - - -def del_catalog(): - """ - Deletes the catalog for the current session. - - Returns: - A Boolean value of True is returned if the deletion was successful. - - Examples: - >>> import magellan_labeler as pl - >>> pl.del_catalog() - """ - # Get the catalog instance - catalog = Catalog.Instance() - # Call the underlying catalog object's function to delete the catalog (a - # dict). Relay the return value from the delegated function. - return catalog.del_catalog() - - -def is_catalog_empty(): - """ - Checks if the catalog is empty. - - Returns: - A Boolean value of True is returned if the catalog is empty, - else returns False. - - Examples: - >>> import magellan_labeler as pl - >>> import pandas as pd - >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> pl.set_key(A, 'id') - >>> pl.is_catalog_empty() - # False - - """ - # Get the catalog instance - catalog = Catalog.Instance() - - # Call the underlying catalog object's function to check if the catalog - # is empty. Relay the return value from the delegated function. - return catalog.is_catalog_empty() - - -def is_dfinfo_present(data_frame): - """ - Checks whether the DataFrame information is present in the catalog. - - Args: - data_frame (DataFrame): The DataFrame that should be checked for its - presence in the catalog. - - Returns: - A Boolean value of True is returned if the DataFrame is present in - the catalog, else False is returned. - - Raises: - AssertionError: If `data_frame` is not of type pandas - DataFrame. - - Examples: - >>> import magellan_labeler as pl - >>> import pandas as pd - >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> pl.set_key(A, 'id') - >>> pl.is_dfinfo_present(A) - # True - - """ - # Validate inputs - # We expect the input object to be of type pandas DataFrame - validate_object_type(data_frame, pd.DataFrame) - - # Get the catalog instance - catalog = Catalog.Instance() - - # Call the underlying catalog object's function to check if the - # DataFrame information is present in the catalog. - # Relay the return value from the delegated function. - return catalog.is_df_info_present_in_catalog(data_frame) - - -def is_property_present_for_df(data_frame, property_name): - """ - Checks if the given property is present for the given DataFrame in the - catalog. - - Args: - data_frame (DataFrame): The DataFrame for which the property must be - checked for. - property_name (string): The name of the property that should be - checked for its presence for the DataFrame, in the catalog. - - Returns: - A Boolean value of True is returned if the property is present for - the given DataFrame. - - Raises: - AssertionError: If `data_frame` is not of type pandas - DataFrame. - AssertionError: If `property_name` is not of type string. - KeyError: If `data_frame` is not present in the catalog. - - Examples: - - >>> import magellan_labeler as pl - >>> import pandas as pd - >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> pl.set_key(A, 'id') - >>> pl.is_property_present_for_df(A, 'id') - # True - >>> pl.is_property_present_for_df(A, 'fk_ltable') - # False - - """ - # Input validations - - # # The input object should be of type pandas DataFrame - validate_object_type(data_frame, pd.DataFrame) - - # # The property name should be of type string - validate_object_type(property_name, six.string_types, error_prefix='Property name') - - # Get the catalog instance - catalog = Catalog.Instance() - - # Check if the given DataFrame information is present in the catalog. If - # not, raise an error. - if catalog.is_df_info_present_in_catalog(data_frame) is False: - logger.error('DataFrame information is not present in the catalog') - raise KeyError('DataFrame information is not present in the catalog') - - # Call the underlying catalog object's function to check if the property - # is present for the given DataFrame. Relay the return value from that - # function. - return catalog.is_property_present_for_df(data_frame, property_name) - - -def get_catalog_len(): - """ - Get the length (i.e the number of entries) in the catalog. - - Returns: - The number of entries in the catalog as an integer. - - Examples: - >>> import magellan_labeler as pl - >>> len = pl.get_catalog_len() - - """ - # Get the catalog instance - catalog = Catalog.Instance() - # Call the underlying catalog object's function to get the catalog length. - # Relay the return value from that function. - return catalog.get_catalog_len() - - -def set_properties(data_frame, properties, replace=True): - """ - Sets the properties for a DataFrame in the catalog. - - Args: - data_frame (DataFrame): DataFrame for which the properties must be set. - properties (dict): A Python dictionary with keys as property names and - values as Python objects (typically strings or DataFrames) - replace (Optional[bool]): Flag to indicate whether the input - properties can replace the properties in the catalog. The default - value for the flag is True. - Specifically, if the DataFrame information is already present in - the catalog then the function will check if the replace flag is - True. If the flag is set to True, then the function will first - delete the existing properties, set it with the given properties. - If the flag is False, the function will just return without - modifying the existing properties. - - - Returns: - A Boolean value of True is returned if the properties were set for - the given DataFrame, else returns False. - - Raises: - AssertionError: If the input data_frame object is not of type pandas - DataFrame. - AssertionError: If the input properties object is not of type Python - dictionary. - - """ - # Validate input parameters - # # Input object is expected to be a pandas DataFrame - validate_object_type(data_frame, pd.DataFrame) - - # # Input properties is expected to be of type Python dictionary - validate_object_type(properties, dict, error_prefix='The properties') - - # Get the catalog instance - catalog = Catalog.Instance() - # Check if the the DataFrame information is present in the catalog. If - # present, we expect the replace flag to be True. If the flag was set to - # False, then warn the user and return False. - if catalog.is_df_info_present_in_catalog(data_frame): - if not replace: - logger.warning( - 'Properties already exists for df ( %s ). Not replacing it' - % str(id(data_frame))) - return False - else: - # DataFrame information is present and replace flag is True. We - # now reset the properties dictionary for this DataFrame. - catalog.init_properties(data_frame) - else: - # The DataFrame information is not present in the catalog. so - # initialize the properties - catalog.init_properties(data_frame) - - # Now iterate through the given properties and set for the DataFrame. - # Note: Here we don't check the correctness of the input properties (i.e - # we do not check if a property 'key' is indeed a key) - for property_name, property_value in six.iteritems(properties): - catalog.set_property(data_frame, property_name, property_value) - - # Finally return True, if everything was successful - return True - - -def copy_properties(source_data_frame, target_data_frame, replace=True): - """ - Copies properties from a source DataFrame to target DataFrame in the - catalog. - - Args: - source_data_frame (DataFrame): The DataFrame from which the properties - to be copied from, in the catalog. - target_data_frame (DataFrame): The DataFrame to which the properties - to be copied to, in the catalog. - replace (boolean): A flag to indicate whether the source - DataFrame's properties can replace the target - DataFrame's properties in the catalog. The default value for the - flag is True. - Specifically, if the target DataFrame's information is already - present in the catalog then the function will check if the - replace flag is True. If the flag is set to True, then the - function will first delete the existing properties and then set - it with the source DataFrame properties. - If the flag is False, the function will just return without - modifying the existing properties. - - Returns: - A Boolean value of True is returned if the copying was successful. - - Raises: - AssertionError: If `source_data_frame` is not of - type pandas DataFrame. - AssertionError: If `target_data_frame` is not of - type pandas DataFrame. - KeyError: If source DataFrame is not present in the - catalog. - - Examples: - - >>> import magellan_labeler as pl - >>> import pandas as pd - >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> pl.set_key(A, 'id') - >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]}) - >>> pl.copy_properties(A, B) - >>> pl.get_key(B) - # 'id' - """ - # Validate input parameters - - # # The source_data_frame is expected to be of type pandas DataFrame - validate_object_type(source_data_frame, pd.DataFrame, error_prefix='Input object (source_data_frame)') - - # # The target_data_frame is expected to be of type pandas DataFrame - validate_object_type(target_data_frame, pd.DataFrame, error_prefix='Input object (target_data_frame)') - - # Get the catalog instance - catalog = Catalog.Instance() - - # Check if the source DataFrame information is present in the catalog. If - # not raise an error. - if catalog.is_df_info_present_in_catalog(source_data_frame) is False: - logger.error( - 'DataFrame information (source_data_frame) is not present in the ' - 'catalog') - raise KeyError( - 'DataFrame information (source_data_frame) is not present in the ' - 'catalog') - - # Get all properties for the source DataFrame - metadata = catalog.get_all_properties(source_data_frame) - - # Set the properties to the target DataFrame. Specifically, call the set - # properties function and relay its return value. - - # Note: There is a redundancy in validating the input parameters. This - # might have a slight performance impact, but we don't expect that this - # function gets called so often. - return set_properties(target_data_frame, metadata, - replace) # this initializes tar in the catalog. - - -# key related methods -def get_key(data_frame): - """ - Gets the value of 'key' property for a DataFrame from the catalog. - - Args: - data_frame (DataFrame): The DataFrame for which the key must be - retrieved from the catalog. - - Returns: - A string value containing the key column name is returned (if present). - - Examples: - >>> import magellan_labeler as pl - >>> import pandas as pd - >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> pl.set_key(A, 'id') - >>> pl.get_key(A) - # 'id' - - - See Also: - :meth:`~magellan_labeler.get_property` - - """ - # This function is just a sugar to get the 'key' property for a DataFrame - return get_property(data_frame, 'key') - - -def set_key(data_frame, key_attribute): - """ - Sets the value of 'key' property for a DataFrame in the catalog with the - given attribute (i.e column name). - - Specifically, this function set the the key attribute for the DataFrame - if the given attribute satisfies the following two properties: - - The key attribute should have unique values. - - The key attribute should not have missing values. A missing value - is represented as np.NaN. - - Args: - data_frame (DataFrame): The DataFrame for which the key must be set in - the catalog. - key_attribute (string): The key attribute (column name) in the - DataFrame. - - Returns: - A Boolean value of True is returned, if the given attribute - satisfies the conditions for a key and the update was successful. - - Raises: - AssertionError: If `data_frame` is not of type - pandas DataFrame. - AssertionError: If `key_attribute` is not of type string. - KeyError: If given `key_attribute` is not in the DataFrame columns. - - Examples: - >>> import magellan_labeler as pl - >>> import pandas as pd - >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> pl.set_key(A, 'id') - >>> pl.get_key(A) - # 'id' - - - See Also: - :meth:`~magellan_labeler.set_property` - - - """ - # Validate input parameters - - # # We expect the input object (data_frame) to be of type pandas DataFrame - validate_object_type(data_frame, pd.DataFrame) - - # # We expect input key attribute to be of type string - validate_object_type(key_attribute, six.string_types, error_prefix='Input key attribute') - - # Check if the key attribute is present as one of the columns in the - # DataFrame - if not ch.check_attrs_present(data_frame, key_attribute): - logger.error('Input key ( %s ) not in the DataFrame' % key_attribute) - raise KeyError('Input key ( %s ) not in the DataFrame' % key_attribute) - - # Check if the key attribute satisfies the conditions to be a key. If - # not, just return False. - # Note: Currently it is not clear, whether we should return False from - # here or raise an exception. As of now resorting to just returning - # False, because this function is used by other computation - # intensive commands in magellan_labeler and raising an exception might make all - # the work done in those commands go in vain (or those commands should - # catch the exception correctly, which may be complicated and require - # changes to the current code). We need to revisit this - # later. - if ch.is_key_attribute(data_frame, key_attribute) is False: - logger.warning('Attribute (%s ) does not qualify to be a key; Not ' - 'setting/replacing the key' % key_attribute) - return False - else: - # Set the key property for the input DataFrame - return set_property(data_frame, 'key', key_attribute) - - -def get_fk_ltable(data_frame): - """ - Gets the foreign key to left table for a DataFrame from the - catalog. - - Specifically this function is a sugar function that will get the foreign - key to left table using underlying :meth:`~magellan_labeler.get_property` function. - This function is typically called on a DataFrame which contains metadata - such as fk_ltable, fk_rtable, ltable, rtable. - - - Args: - data_frame (DataFrame): The input DataFrame for which the foreign key - ltable property must be retrieved. - - Returns: - A Python object, typically a string is returned. - - - Examples: - >>> import magellan_labeler as pl - >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]}) - >>> pl.set_key(A, 'id') - >>> pl.set_key(B, 'id') - >>> C = pd.DataFrame({'id':[1, 2], 'ltable_id':[1, 2], 'rtable_id':[2, 1]}) - >>> pl.set_key(C, 'id') - >>> pl.set_fk_ltable(C, 'ltable_id') - >>> pl.get_fk_ltable(C) - # 'ltable_id' - - See Also: - :meth:`~magellan_labeler.get_property` - - """ - # Call the get_property function and relay the result. - return get_property(data_frame, 'fk_ltable') - - -def get_fk_rtable(data_frame): - """ - Gets the foreign key to right table for a DataFrame from the catalog. - - Specifically this function is a sugar function that will get the foreign - key to right table using :meth:`magellan_labeler.get_property` function. This - function is typically called on a DataFrame which contains metadata such as - fk_ltable, fk_rtable, ltable, rtable. - - Args: - data_frame (DataFrame): The input DataFrame for which the foreign key - rtable property must be retrieved. - - Returns: - A Python object, (typically a string) is returned. - - Examples: - >>> import magellan_labeler as pl - >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]}) - >>> pl.set_key(A, 'id') - >>> pl.set_key(B, 'id') - >>> C = pd.DataFrame({'id':[1, 2], 'ltable_id':[1, 2], 'rtable_id':[2, 1]}) - >>> pl.set_key(C, 'id') - >>> pl.set_fk_rtable(C, 'rtable_id') - >>> pl.get_fk_rtable(C) - # 'rtable_id' - - - See Also: - :meth:`~magellan_labeler.get_property` - """ - # Call the get_property function and relay the result. - return get_property(data_frame, 'fk_rtable') - - -def set_fk_ltable(data_frame, fk_ltable): - """ - Sets the foreign key to ltable for a DataFrame in the catalog. - - Specifically this function is a sugar function that will set the foreign - key to the left table using :meth:`magellan_labeler.set_property` function. This - function is typically called on a DataFrame which contains metadata such as - fk_ltable, fk_rtable, ltable, rtable. - - Args: - data_frame (DataFrame): The input DataFrame for which the foreign key - ltable property must be set. - fk_ltable (string): The attribute that must ne set as the foreign key - to the ltable in the catalog. - - Returns: - A Boolean value of True is returned if the foreign key to ltable was - set successfully. - - Raises: - AssertionError: If `data_frame` is not of type - pandas DataFrame. - AssertionError: If `fk_ltable` is not of type - string. - AssertionError: If `fk_ltable` is not in the input - DataFrame. - - Examples: - >>> import magellan_labeler as pl - >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]}) - >>> pl.set_key(A, 'id') - >>> pl.set_key(B, 'id') - >>> C = pd.DataFrame({'id':[1, 2], 'ltable_id':[1, 2], 'rtable_id':[2, 1]}) - >>> pl.set_key(C, 'id') - >>> pl.set_fk_ltable(C, 'ltable_id') - >>> pl.get_fk_ltable(C) - # 'ltable_id' - - - - See Also: - :meth:`~magellan_labeler.set_property` - - """ - # Validate the input parameters - # # We expect the input object to be of type pandas DataFrame - validate_object_type(data_frame, pd.DataFrame) - - # # We expect the input fk_ltable to be of type string - validate_object_type(fk_ltable, six.string_types, error_prefix='The input (fk_ltable)') - - # # The fk_ltable attribute should be one of the columns in the input - # DataFrame - if not ch.check_attrs_present(data_frame, fk_ltable): - logger.error('Input attr. ( %s ) not in the DataFrame' % fk_ltable) - raise KeyError('Input attr. ( %s ) not in the DataFrame' % fk_ltable) - - # Call the set_property function and relay the result. - return set_property(data_frame, 'fk_ltable', fk_ltable) - - -def validate_and_set_fk_ltable(foreign_data_frame, foreign_key_ltable, ltable, - ltable_key): - """ - Validates and set the foreign key ltable for a DataFrame in the the catalog. - - Specifically, given a DataFrame and a foreign key attribute it checks - for the following conditions to be satisfied for the attribute. First it - checks that foreign key ltable attribute does not have any missing - values. Second it checks that the subset of foreign key values, - have unique values in the primary (base) table. - - Args: - foreign_data_frame (DataFrame): DataFrame containing the foreign key - (typically a candidate set, for example output from blocking two - tables). - foreign_key_ltable (string): An attribute in the foreign DataFrame - ltable (DataFrame): Base DataFrame, in which the foreign key - attribute would form the primary key. - ltable_key (string): An attribute in the base table - (typically a primary key attribute). - - Returns: - A Boolean value of True will be returned if the validation was - successful and the update was successful in the catalog. - Raises: - AssertionError: If the input foreign DataFrame (foreign_data_frame) - is not of type pandas DataFrame. - AssertionError: If the foreign key ltable (foreign_key_ltable) is not - of type string. - AssertionError: If the input ltable (ltable) is not of type pandas - DataFrame. - AssertionError: If the ltable key (ltable_key) is not of type string. - - - """ - - # check the foreign key constraint - # # Note all the validations are done inside the function - # check_fk_constraint - status = ch.check_fk_constraint(foreign_data_frame, foreign_key_ltable, - ltable, ltable_key) - - # If the validation is successful then set the property - if status: - return set_property(foreign_data_frame, 'fk_ltable', foreign_key_ltable) - else: - # else report the error and just return False. - logger.warning( - 'FK constraint for fk_ltable is not satisfied; ' - 'Not setting the fk_ltable') - return False - - -def validate_and_set_fk_rtable(foreign_data_frame, foreign_key_rtable, - rtable, rtable_key): - """ - Validates and set the foreign key ltable for a DataFrame in the the catalog. - - Specifically, given a DataFrame and a foreign key attribute it checks - for the following conditions to be satisfied for the attribute. First it - checks that foreign key rtable attribute does not have any missing - values. Second it checks that the subset of foreign key values, - have unique values in the primary (base) table. - - Args: - foreign_data_frame (DataFrame): DataFrame containing the foreign key - (typically a candidate set, for example output from blocking two - tables). - foreign_key_rtable (string): An attribute in the foreign DataFrame - rtable (DataFrame): Base DataFrame, in which the foreign key - attribute would form the primary key. - rtable_key (string): An attribute in the base table - (typically a primary key attribute). - - Returns: - A Boolean value of True will be returned if the validation was - successful and the update was successful in the catalog. - Raises: - AssertionError: If the input foreign DataFrame (foreign_data_frame) - is not of type pandas DataFrame. - AssertionError: If the foreign key ltable (foreign_key_ltable) is not - of type string. - AssertionError: If the input ltable (ltable) is not of type pandas - DataFrame. - AssertionError: If the ltable key (ltable_key) is not of type string. - - - """ - - # Validate the foreign key constraint - # Note: All the basic input validations are done inside the - # check_fk_constraint function. - status = ch.check_fk_constraint(foreign_data_frame, foreign_key_rtable, - rtable, rtable_key) - - # If the validation was successful, then set the property - if status: - return set_property(foreign_data_frame, 'fk_rtable', foreign_key_rtable) - # else just warn and return False - else: - logger.warning( - 'FK constraint for fk_rtable is not satisfied; Not ' - 'setting the fk_rtable and rtable') - return False - - -def set_fk_rtable(data_frame, foreign_key_rtable): - """ - Sets the foreign key to rtable for a DataFrame in the catalog. - - Specifically this function is a sugar function that will set the foreign - key to right table using set_property function. This function - is typically called on a DataFrame which contains metadata such as - fk_ltable, fk_rtable, ltable, rtable. - - - - Args: - data_frame (DataFrame): The input DataFrame for which the foreign key - rtable property must be set. - foreign_key_rtable (string): The attribute that must be set as - foreign key to rtable in the catalog. - - Returns: - A Boolean value of True is returned if the foreign key to rtable was - set successfully. - - Raises: - AssertionError: If `data_frame` is not of type - pandas DataFrame. - AssertionError: If `foreign_key_rtable` is not of - type string. - AssertionError: If `fk_rtable` is not in the input - DataFrame. - - Examples: - >>> import magellan_labeler as pl - >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]}) - >>> pl.set_key(A, 'id') - >>> pl.set_key(B, 'id') - >>> C = pd.DataFrame({'id':[1, 2], 'ltable_id':[1, 2], 'rtable_id':[2, 1]}) - >>> pl.set_key(C, 'id') - >>> pl.set_fk_rtable(C, 'rtable_id') - >>> pl.get_fk_rtable(C) - # 'rtable_id' - - - See Also: - :meth:`~magellan_labeler.set_property` - - """ - # Validate the input parameters - # # The input object is expected to be of type pandas DataFrame - validate_object_type(data_frame, pd.DataFrame) - - validate_object_type(foreign_key_rtable, six.string_types, error_prefix='Input (foreign key ltable)') - - # Check if the given attribute is present in the DataFrame - if not ch.check_attrs_present(data_frame, foreign_key_rtable): - logger.error('Input attr. ( %s ) not in the DataFrame' - % foreign_key_rtable) - raise KeyError('Input attr. ( %s ) not in the DataFrame' - % foreign_key_rtable) - - # Finally set the property and relay the result - return set_property(data_frame, 'fk_rtable', foreign_key_rtable) - - -def show_properties(data_frame): - """ - Prints the properties for a DataFrame that is present in the catalog. - - Args: - data_frame (DataFrame): The input pandas DataFrame for which the - properties must be displayed. - - Examples: - >>> A = pd.DataFrame({'key_attr' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> pl.set_key(A, 'key_attr') - >>> pl.show_properties(A) - # id: 4572922488 # This will change dynamically - # key: key_attr - - - - """ - # Check if the DataFrame information is present in the catalog. If not - # return - if not is_dfinfo_present(data_frame): - logger.error('DataFrame information is not present in the catalog') - return - - # Delegate it to show properties for the id if an object in the catalog - show_properties_for_id(id(data_frame)) - # # Get the properties for the DataFrame from the catalog - # metadata = get_all_properties(data_frame) - # - # # First print the id for the DataFrame - # print('id: ' + str(id(data_frame))) - # # For each property name anf value, print the contents to the user - # for property_name, property_value in six.iteritems(metadata): - # # If the property value is string print it out - # if isinstance(property_value, six.string_types): - # print(property_name + ": " + property_value) - # # else, print just the id. - # else: - # print(property_name + "(obj.id): " + str(id(property_value))) - - -def show_properties_for_id(object_id): - """ - Shows the properties for an object id present in the catalog. - - Specifically, given an object id got from typically executing id( - ), where the object could be a DataFrame, this function will - display the properties present for that object id in the catalog. - - Args: - object_id (int): The Python identifier of an object (typically a - pandas DataFrame). - - Examples: - >>> A = pd.DataFrame({'key_attr' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> pl.set_key(A, 'key_attr') - >>> pl.show_properties_for_id(id(A)) - # id: 4572922488 # This will change dynamically - # key: key_attr - - - """ - catalog = Catalog.Instance() - metadata = catalog.get_all_properties_for_id(object_id) - # First print the id for the DataFrame - print('id: ' + str(object_id)) - # For each property name anf value, print the contents to the user - for property_name, property_value in six.iteritems(metadata): - # If the property value is string print it out - if isinstance(property_value, six.string_types): - print(property_name + ": " + property_value) - # else, print just the id. - else: - print(property_name + "(obj.id): " + str(id(property_value))) - - -def set_candset_properties(candset, key, foreign_key_ltable, - foreign_key_rtable, ltable, rtable): - """ - Sets candidate set properties. - - Specifically, this is a sugar function that sets all the properties for a - candidate set such as key, foreign key ltable, foreign key rtable, - ltable and rtable. Further, this function does not check the integrity of - input properties. - - - - Args: - candset (DataFrame): Input DataFrame for which the properties must be - set. - key (string): Key attribute that must be set for the DataFrame in the - catalog. - foreign_key_ltable (string): Foreign key ltable attribute that must be - set for the DataFrame in the catalog. - foreign_key_rtable (string): Foreign key rtable attribute that must be - set for the DataFrame in the catalog. - ltable (DataFrame): DataFrame that must be set as ltable. - rtable (DataFrame): DataFrame that must be set as rtable. - - Returns: - A Boolean value of True is returned if the updates were successful. - - """ - # set the key - set_property(candset, 'key', key) - # set the foreign key attributes - set_fk_ltable(candset, foreign_key_ltable) - set_fk_rtable(candset, foreign_key_rtable) - # set the ltable and rtables - set_property(candset, 'ltable', ltable) - set_property(candset, 'rtable', rtable) - return True - - -def _validate_metadata_for_table(table, key, output_string, lgr, verbose): - """ - Validates metadata for table (DataFrame) - - """ - # Validate input parameters - # # We expect the input table to be of type pandas DataFrame - validate_object_type(table, pd.DataFrame) - - # Check the key column is present in the table - if not ch.check_attrs_present(table, key): - raise KeyError('Input key ( %s ) not in the DataFrame' % key) - - # Validate the key - ch.log_info(lgr, 'Validating ' + output_string + ' key: ' + str(key), - verbose) - # We expect the key to be of type string - validate_object_type(key, six.string_types, error_prefix='Key attribute') - - if not ch.is_key_attribute(table, key, verbose): - raise AssertionError('Attribute %s in the %s table does not ' - 'qualify to be the key' % ( - str(key), output_string)) - ch.log_info(lgr, '..... Done', verbose) - return True - - -def _validate_metadata_for_candset(candset, key, foreign_key_ltable, - foreign_key_rtable, - ltable, rtable, - ltable_key, rtable_key, - lgr, verbose): - """ - Validates metadata for a candidate set. - - """ - # Validate input parameters - # # We expect candset to be of type pandas DataFrame - validate_object_type(candset, pd.DataFrame, error_prefix='Input candset') - - # Check if the key column is present in the candset - if not ch.check_attrs_present(candset, key): - raise KeyError('Input key ( %s ) not in the DataFrame' % key) - - # Check if the foreign key ltable column is present in the candset - if not ch.check_attrs_present(candset, foreign_key_ltable): - raise KeyError( - 'Input foreign_key_ltable ( %s ) not in the DataFrame' - % foreign_key_ltable) - - # Check if the foreign key rtable column is present in the candset - if not ch.check_attrs_present(candset, foreign_key_rtable): - raise KeyError( - 'Input fk_rtable ( %s ) not in the DataFrame' % foreign_key_rtable) - - # We expect the ltable to be of type pandas DataFrame - validate_object_type(ltable, pd.DataFrame, error_prefix='Input ltable') - - # We expect the rtable to be of type pandas DataFrame - validate_object_type(rtable, pd.DataFrame, error_prefix='Input rtable') - - # We expect the ltable key to be present in the ltable - if not ch.check_attrs_present(ltable, ltable_key): - raise KeyError('ltable key ( %s ) not in ltable' % ltable_key) - - # We expect the rtable key to be present in the rtable - if not ch.check_attrs_present(rtable, rtable_key): - raise KeyError('rtable key ( %s ) not in rtable' % rtable_key) - - # First validate metadata for the candidate set (as a table) - _validate_metadata_for_table(candset, key, 'candset', lgr, verbose) - - ch.log_info(lgr, 'Validating foreign key constraint for left table', - verbose) - # Second check foreign key constraints - if not ch.check_fk_constraint(candset, foreign_key_ltable, - ltable, ltable_key): - raise AssertionError( - 'Candset does not satisfy foreign key constraint with ' - 'the left table') - - if not ch.check_fk_constraint(candset, foreign_key_rtable, - rtable, rtable_key): - raise AssertionError( - 'Candset does not satisfy foreign key constraint with ' - 'the right table') - - ch.log_info(lgr, '..... Done', verbose) - ch.log_info(lgr, 'Validating foreign key constraint for right table', - verbose) - ch.log_info(lgr, '..... Done', verbose) - - return True - - -# noinspection PyIncorrectDocstring -def get_keys_for_ltable_rtable(ltable, rtable, lgr, verbose): - """ - Gets keys for the ltable and rtable. - """ - # We expect the ltable to be of type pandas DataFrame - if not isinstance(ltable, pd.DataFrame): - logger.error('Input ltable is not of type pandas data frame') - raise AssertionError('Input ltable is not of type pandas data frame') - - # We expect the rtable to be of type pandas DataFrame - if not isinstance(rtable, pd.DataFrame): - logger.error('Input rtable is not of type pandas data frame') - raise AssertionError('Input rtable is not of type pandas data frame') - - ch.log_info(lgr, 'Required metadata: ltable key, rtable key', verbose) - ch.log_info(lgr, 'Getting metadata from the catalog', verbose) - # Get the ltable key and rtable key from the catalog - ltable_key = get_key(ltable) - rtable_key = get_key(rtable) - ch.log_info(lgr, '..... Done', verbose) - # return the ltable and rtable keys - return ltable_key, rtable_key - - -# noinspection PyIncorrectDocstring -def get_metadata_for_candset(candset, lgr, verbose): - """ - Gets metadata for the candset - - """ - # Validate input parameters - validate_object_type(candset, pd.DataFrame, error_prefix='Input candset') - - ch.log_info(lgr, 'Getting metadata from the catalog', verbose) - # Get the key, foreign keys, ltable, rtable and their keys - # # Get key - key = get_key(candset) - # # Get the foreign keys - fk_ltable = get_fk_ltable(candset) - fk_rtable = get_fk_rtable(candset) - # # Get the base tables - ltable = get_ltable(candset) - rtable = get_rtable(candset) - # Get the base table keys - l_key = get_key(ltable) - r_key = get_key(rtable) - ch.log_info(lgr, '..... Done', verbose) - # Return the metadata - return key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key - - -# noinspection PyIncorrectDocstring -def get_ltable(candset): - """ - Gets the ltable for a DataFrame from the catalog. - - Args: - candset (DataFrame): The input table for which the ltable must be - returned. - - Returns: - A pandas DataFrame that is pointed by 'ltable' property of the input - table. - - Examples: - >>> import magellan_labeler as pl - >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]}) - >>> pl.set_key(A, 'id') - >>> pl.set_key(B, 'id') - >>> C = pd.DataFrame({'id':[1, 2], 'ltable_id':[1, 2], 'rtable_id':[2, 1]}) - >>> pl.set_key(C, 'id') - >>> pl.set_ltable(C, A) - >>> id(pl.get_ltable(A) == id(A) - # True - - - See Also: - :meth:`~magellan_labeler.get_property` - """ - # Return the ltable for a candidate set. This function is just a sugar - return get_property(candset, 'ltable') - - -# noinspection PyIncorrectDocstring -def get_rtable(candset): - """ - Gets the rtable for a DataFrame from the catalog. - - Args: - candset (DataFrame): Input table for which the rtable must be returned. - - Returns: - A pandas DataFrame that is pointed by 'rtable' property of the input - table. - - Examples: - >>> import magellan_labeler as pl - >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]}) - >>> pl.set_key(A, 'id') - >>> pl.set_key(B, 'id') - >>> C = pd.DataFrame({'id':[1, 2], 'ltable_id':[1, 2], 'rtable_id':[2, 1]}) - >>> pl.set_key(C, 'id') - >>> pl.set_rtable(C, B) - >>> id(pl.get_rtable(B) == id(B) - # True - - - See Also: - :meth:`~magellan_labeler.get_property` - """ - # Return the rtable for a candidate set. This function is just a sugar - - return get_property(candset, 'rtable') - - -def set_ltable(candset, table): - """ - Sets the ltable for a DataFrame in the catalog. - - Args: - candset (DataFrame): The input table for which the ltable must be set. - table (DataFrame): The table (typically a pandas DataFrame) that must - be set as ltable for the input DataFrame. - - Returns: - A Boolean value of True is returned, if the update was successful. - - Examples: - >>> import magellan_labeler as pl - >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]}) - >>> pl.set_key(A, 'id') - >>> pl.set_key(B, 'id') - >>> C = pd.DataFrame({'id':[1, 2], 'ltable_id':[1, 2], 'rtable_id':[2, 1]}) - >>> pl.set_key(C, 'id') - >>> pl.set_ltable(C, A) - >>> id(pl.get_ltable(A) == id(A) - # True - - - See Also: - :meth:`~magellan_labeler.set_property` - """ - # Return the ltable for a candidate set. This function is just a sugar - return set_property(candset, 'ltable', table) - - -# noinspection PyIncorrectDocstring -def set_rtable(candset, table): - """ - Sets the rtable for a DataFrame in the catalog. - - Args: - candset (DataFrame): The input table for which the rtable must be set. - table (DataFrame): The table that must be set as rtable for the input - DataFrame. - - Returns: - A Boolean value of True is returned, if the update was successful. - - Examples: - >>> import magellan_labeler as pl - >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]}) - >>> pl.set_key(A, 'id') - >>> pl.set_key(B, 'id') - >>> C = pd.DataFrame({'id':[1, 2], 'ltable_id':[1, 2], 'rtable_id':[2, 1]}) - >>> pl.set_key(C, 'id') - >>> pl.set_rtable(C, B) - >>> id(pl.get_rtable(B) == id(B) - # True - - - See Also: - :meth:`~magellan_labeler.set_property` - """ - # Return the rtable for a candidate set. This function is just a sugar - - return set_property(candset, 'rtable', table) diff --git a/py_labeler/io/__init__.py b/py_labeler/io/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/py_labeler/io/parsers.py b/py_labeler/io/parsers.py deleted file mode 100644 index 5d849b1..0000000 --- a/py_labeler/io/parsers.py +++ /dev/null @@ -1,465 +0,0 @@ -# coding=utf-8 -"""This module defines functions to read and write CSV files""" -import collections -import logging -import os -import pandas as pd -import six - -import py_labeler.catalog.catalog_manager as cm -from py_labeler.utils.validation_helper import validate_object_type - -logger = logging.getLogger(__name__) - - -def read_csv_metadata(file_path, **kwargs): - """ - Reads a CSV (comma-separated values) file into a pandas DataFrame - and update the catalog with the metadata. The CSV files typically contain - data for the input tables or a candidate set. - - Specifically, this function first reads the CSV file from the given file - path into a pandas DataFrame, by using pandas' in-built 'read_csv' - method. Then, it updates the catalog with the metadata. There are three - ways to update the metadata: (1) using a metadata file, (2) using the - key-value parameters supplied in the function, and (3) using both - metadata file and key-value parameters. - - To update the metadata in the catalog using the metadata file, - the function will look for a file in the same directory with same file name - but with a specific extension. This extension can be optionally given by - the user (defaults to '.metadata'). If the metadata file is present, - the function will read and update the catalog appropriately. If the - metadata file is not present, the function will issue a warning that the - metadata file is not present. - - The metadata information can also be given as parameters to the function - (see description of arguments for more details). If given, the function - will update the catalog with the given information. - - Further, the metadata can partly reside in the metdata file and partly as - supplied parameters. The function will take a union of the two and - update the catalog appropriately. - If the same metadata is given in both the metadata file - and the function, then the metadata in the function takes precedence over - the metadata given in the file. - - Args: - file_path(string): The CSV file path - - kwargs(dictionary): A Python dictionary containing key-value arguments. - There are a few key-value pairs that are specific to - read_csv_metadata and all the other key-value pairs are passed - to pandas read_csv method - - Returns: - A pandas DataFrame read from the input CSV file. - Raises: - AssertionError: If `file_path` is not of type string. - AssertionError: If a file does not exist in the - given `file_path`. - - Examples: - *Example 1:* Read from CSV file and set metadata - - >>> A = pl.read_csv_metadata('path_to_csv_file', key='id') - >>> pl.get_key(A) - # 'id' - - *Example 2:* Read from CSV file (with metadata file in the same directory - - Let the metadata file contain the following contents: - - #key = id - - >>> A = pl.read_csv_metadata('path_to_csv_file') - >>> pl.get_key(A) - # 'id' - - See Also: - :meth:`~py_entitymatching.to_csv_metadata` - """ - # Validate the input parameters. - - validate_object_type(file_path, six.string_types, error_prefix='Input file path') - - # # Check if the given path is valid. - if not os.path.exists(file_path): - logger.error('File does not exist at path %s' % file_path) - raise AssertionError('File does not exist at path %s' % file_path) - - # Check if the user has specified the metadata file's extension. - extension = kwargs.pop('metadata_extn', None) - - # If the extension is not specified then set the extension to .metadata'. - if extension is None: - extension = '.metadata' - - # Format the extension to include a '.' in front if the user has not - # given one. - if not extension.startswith('.'): - extension = '.' + extension - - # If the file is present, then update metadata from file. - if _is_metadata_file_present(file_path, extension=extension): - file_name, _ = os.path.splitext(file_path) - file_name = ''.join([file_name, extension]) - metadata, _ = _get_metadata_from_file(file_name) - - # Else issue a warning that the metadata file is not present - else: - logger.warning('Metadata file is not present in the given path; ' - 'proceeding to read the csv file.') - metadata = {} - - # Update the metadata with the key-value pairs given in the command. The - # function _update_metadata_for_read_cmd takes care of updating the - # metadata with only the key-value pairs specific to read_csv_metadata - # method - metadata, kwargs = _update_metadata_for_read_cmd(metadata, **kwargs) - - # Validate the metadata. - _check_metadata_for_read_cmd(metadata) - - # Read the csv file using pandas read_csv method. - data_frame = pd.read_csv(file_path, **kwargs) - - # Get the value for 'key' property and update the catalog. - key = metadata.pop('key', None) - if key is not None: - cm.set_key(data_frame, key) - - fk_ltable = metadata.pop('fk_ltable', None) - if fk_ltable is not None: - cm.set_fk_ltable(data_frame, fk_ltable) - - fk_rtable = metadata.pop('fk_rtable', None) - if fk_ltable is not None: - cm.set_fk_rtable(data_frame, fk_rtable) - - # Update the catalog with other properties. - for property_name, property_value in six.iteritems(metadata): - cm.set_property(data_frame, property_name, property_value) - if not cm.is_dfinfo_present(data_frame): - cm.init_properties(data_frame) - - # Return the DataFrame - return data_frame - - -def to_csv_metadata(data_frame, file_path, **kwargs): - """ - Writes the DataFrame contents to a CSV file and the DataFrame's metadata - (to a separate text file). - - This function writes the DataFrame contents to a CSV file in - the given file path. It uses 'to_csv' method from pandas to write - the CSV file. The metadata contents are written to the same directory - derived from the file path but with the different extension. This - extension can be optionally given by the user (with the default value - set to .metadata). - - Args: - data_frame (DataFrame): The DataFrame that should be written to disk. - file_path (string): The file path to which the DataFrame contents - should be written. Metadata is written with the same file name - with the extension given by the user (defaults to '.metadata'). - kwargs (dictionary): A Python dictionary containing key-value pairs. - There is one key-value pair that is specific to - to_csv_metadata: metadata_extn. All the other key-value pairs - are passed to pandas to_csv function. - Here the metadata_extn is the metadata - extension (defaults to '.metadata'), with which - the metadata file must be written. - Returns: - A Boolean value of True is returned if the files were written - successfully. - - Raises: - AssertionError: If `data_frame` is not of type pandas - DataFrame. - AssertionError: If `file_path` is not of type string. - AssertionError: If DataFrame cannot be written to the given - `file_path`. - - Examples: - - >>> import pandas as pd - >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]}) - >>> pl.set_key(A, 'id') - >>> pl.to_csv_metadata(A, 'path_to_csv_file') - - - See Also: - :meth:`~py_entitymatching.read_csv_metadata` - - """ - # Validate input parameters - - validate_object_type(data_frame, pd.DataFrame) - - validate_object_type(file_path, six.string_types, error_prefix='Input file path') - - # Check if the user has specified the metadata file's extension. - extension = kwargs.pop('metadata_extn', None) - if extension is None: - extension = '.metadata' - if not extension.startswith('.'): - extension = '.' + extension - - # If the user has not specified whether the index should be written, - # we explicitly set it to be false. The reason is writing the index - # along makes the CSV file cumbersome to view and later read back into a - # DataFrame. - index = kwargs.pop('index', None) - if index is None: - kwargs['index'] = False - - # retrieve the file name and the extension from the given file path. - file_name, _ = os.path.splitext(file_path) - metadata_filename = file_name + extension - - # check if we access privileges to write a file in the given file path, - # and also check if a file already exists in the file path. - can_write, file_exists = _check_file_path(file_path) - - if can_write: - # check if the file already exists. If so issue a warning and - # overwrite the file. - if file_exists: - logger.warning('File already exists at %s; Overwriting it', - file_path) - data_frame.to_csv(file_path, **kwargs) - else: - data_frame.to_csv(file_path, **kwargs) - else: - # If we cannot write in the given file path, raise an exception. - logger.error('Cannot write in the file path %s; Exiting' % file_path) - raise AssertionError('Cannot write in the file path %s' % file_path) - - # repeat the process (as writing the DataFrame) to write the metadata. - - # check for access privileges and file existence. - can_write, file_exists = _check_file_path(metadata_filename) - if can_write: - if file_exists: - logger.warning('Metadata file already exists at %s. Overwriting ' - 'it', metadata_filename) - _write_metadata(data_frame, metadata_filename) - else: - _write_metadata(data_frame, metadata_filename) - else: - # If we cannot write in the given file path, raise an exception. - logger.error('Cannot write in the file path %s; Exiting' % file_path) - raise AssertionError('Cannot write in the file path %s' % file_path) - - return True - - -def _write_metadata(data_frame, file_path): - """ - Write metadata contents to disk. - """ - # Initialize a metadata dictionary to store the metadata. - metadata_dict = collections.OrderedDict() - - # Get all the properties for the input data frame - if cm.is_dfinfo_present(data_frame) is True: - properties = cm.get_all_properties(data_frame) - else: - # If the data_frame is not in the catalog, then return immedidately. - return False - - # If the properties are present in the catalog, then write properties to - # disk - if len(properties) > 0: - for property_name, property_value in six.iteritems(properties): - # If the property value is not of type string, then just write it - # as 'POINTER'. This will be useful while writing the candidate - # sets to disk. The candidate set will have properties such as - # ltable and rtable which are DataFrames. We do not have a simple - # way to write them to disk and link them back the candidate set - # while reading back from disk. So to get around this problem we - # will use 'POINTER' as the special value to indicate objects - # other than strings. - if isinstance(property_value, six.string_types) is False: - metadata_dict[property_name] = 'POINTER' - else: - metadata_dict[property_name] = property_value - - # Write the properties to a file in disk. The file will one property - # per line. We follow a special syntax to write the properties. The - # syntax is: - # #property_name=property_value - with open(file_path, 'w') as file_handler: - for property_name, property_value in six.iteritems(metadata_dict): - file_handler.write('#%s=%s\n' % (property_name, property_value)) - - return True - - -def _is_metadata_file_present(file_path, extension='.metadata'): - """ - Check if the metadata file is present. - """ - # Get the file name and the extension from the file path. - file_name, _ = os.path.splitext(file_path) - # Create a file name with the given extension. - file_name = ''.join([file_name, extension]) - # Check if the file already exists. - return os.path.exists(file_name) - - -def _get_metadata_from_file(file_path): - """ - Get the metadata information from the file. - """ - # Initialize a dictionary to store the metadata read from the file. - metadata = dict() - - # Get the number of lines from the file - num_lines = sum(1 for _ in open(file_path)) - - # If there are some contents in the file (i.e num_lines > 0), - # read its contents. - if num_lines > 0: - with open(file_path) as file_handler: - for _ in range(num_lines): - line = next(file_handler) - # Consider only the lines that are starting with '#' - if line.startswith('#'): - # Remove the leading '#' - line = line.lstrip('#') - # Split the line with '=' as the delimiter - tokens = line.split('=') - # Based on the special syntax we use, there should be - # exactly two tokens after we split using '=' - assert len(tokens) is 2, 'Error in file, he num tokens ' \ - 'is not 2' - # Retrieve the property_names and values. - property_name = tokens[0].strip() - property_value = tokens[1].strip() - # If the property value is not 'POINTER' then store it in - # the metadata dictionary. - if property_value is not 'POINTER': - metadata[property_name] = property_value - - # Return the metadata dictionary and the number of lines in the file. - return metadata, num_lines - - -def _update_metadata_for_read_cmd(metadata, **kwargs): - """ - Update metadata for read_csv_metadata method. - """ - # Create a copy of incoming metadata. We will update the incoming - # metadata dict with kwargs. - copy_metadata = metadata.copy() - - # The updation is going to happen in two steps: (1) overriding the - # properties in metadata dict using kwargs, and (2) adding the properties - # to metadata dict from kwargs. - - # Step 1 - # We will override the properties in the metadata dict with the - # properties from kwargs. - - # Get the property from metadata dict. - for property_name in copy_metadata.keys(): - # If the same property is present in kwargs, then override it in the - # metadata dict. - if property_name in kwargs: - property_value = kwargs.pop(property_name) - if property_value is not None: - metadata[property_name] = property_value - else: - # Warn the users if the metadata dict had a valid value, - # but the kwargs sets it to None. - logger.warning( - '%s key had a value (%s)in file but input arg is set to ' - 'None' % (property_name, metadata[property_name])) - # Remove the property from the dictionary. - metadata.pop(property_name) # remove the key-value pair - - # Step 2 - # Add the properties from kwargs. - # We should be careful here. The kwargs contains the key-value pairs that - # are used by read_csv method (of pandas). We will just pick the - # properties that we expect from the read_csv_metadata method. - properties = ['key', 'ltable', 'rtable', 'fk_ltable', 'fk_rtable'] - - # For the properties that we expect, read from kwargs and update the - # metadata dict. - for property_name in properties: - if property_name in kwargs: - property_value = kwargs.pop(property_name) - if property_value is not None: - metadata[property_name] = property_value - else: - # Warn the users if the properties in the kwargs is set to None. - logger.warning('Metadata %s is set to None', property_name) - # Remove the property from the metadata dict. - metadata.pop(property_name, None) - - return metadata, kwargs - - -def _check_metadata_for_read_cmd(metadata): - """ - Check the metadata for read_csv_metadata command - """ - - # Do basic validation checks for the metadata. - - # We require consistency of properties given for the canidate set. We - # expect the user to provide all the required properties for the - # candidate set. - required_properties = ['ltable', 'rtable', 'fk_ltable', 'fk_rtable'] - - # Check what the user has given - given_properties = set(required_properties).intersection(metadata.keys()) - - # Check if all the required properties are given - if len(given_properties) > 0: - # Check the lengths are same. If not, this means that the user is - # missing something. So, raise an error. - if len(given_properties) is not len(required_properties): - logger.error( - 'Dataframe requires all valid ltable, rtable, fk_ltable, ' - 'fk_rtable parameters set') - raise AssertionError( - 'Dataframe requires all valid ltable, rtable, fk_ltable, ' - 'fk_rtable parameters set') - - # ltable is expected to be of type pandas DataFrame. If not raise an - # error. - if not isinstance(metadata['ltable'], pd.DataFrame): - logger.error('The parameter ltable must be set to valid Dataframe') - raise AssertionError( - 'The parameter ltable must be set to valid Dataframe') - - # rtable is expected to be of type pandas DataFrame. If not raise an - # error. - if not isinstance(metadata['rtable'], pd.DataFrame): - logger.error('The parameter rtable must be set to valid Dataframe') - raise AssertionError( - 'The parameter rtable must be set to valid Dataframe') - # If the length of comman properties is 0, it will fall out to return - # True, which is ok. - return True - - -def _check_file_path(file_path): - """ - Check validity (access privileges and existence of a file already)of the - given file path. - """ - # returns a tuple can_write, file_exists - if os.path.exists(file_path): - # the file is there - return True, True - elif os.access(os.path.dirname(file_path), os.W_OK): - return True, False - # the file does not exists but write privileges are given - else: - return False, False diff --git a/requirements.yml b/requirements.yml index 8baa41a..509f010 100644 --- a/requirements.yml +++ b/requirements.yml @@ -1,4 +1,4 @@ -name: py_entitymatching_dev +name: py_labeler_dev channels: - conda-forge - uwmagellan From 36b1a1d48ec5ac872fbf8204acaa9b7871f39bb8 Mon Sep 17 00:00:00 2001 From: pavankm Date: Sat, 30 Dec 2017 15:27:28 -0600 Subject: [PATCH 03/12] remove check for version + remove ununsed cmd from ipynb --- notebooks/Sampling and Labeling.ipynb | 85 +++++++++++++-------------- py_labeler/labeler/labeler.py | 6 +- 2 files changed, 45 insertions(+), 46 deletions(-) diff --git a/notebooks/Sampling and Labeling.ipynb b/notebooks/Sampling and Labeling.ipynb index 18d7631..c14f57b 100644 --- a/notebooks/Sampling and Labeling.ipynb +++ b/notebooks/Sampling and Labeling.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2017-12-30T18:12:19.100155Z", @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -46,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -171,7 +171,7 @@ "4 1988 Joseph Kuan 94122 1982 " ] }, - "execution_count": 4, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -182,7 +182,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -191,7 +191,7 @@ "14" ] }, - "execution_count": 5, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -209,7 +209,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -218,16 +218,16 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'5.6.2'" + "'5.9.3'" ] }, - "execution_count": 9, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -238,22 +238,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 18, "metadata": {}, - "outputs": [ - { - "ename": "ImportError", - "evalue": "PyQt 5.9.3 or greater is required", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Label the data set\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;31m# Specify the name for the label column\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mG\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlabeler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlabel_table\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'gold_label'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/miniconda3/lib/python3.5/site-packages/py_labeler-0.1.0-py3.5.egg/py_labeler/labeler/labeler.py\u001b[0m in \u001b[0;36mlabel_table\u001b[0;34m(df, label_column_name)\u001b[0m\n\u001b[1;32m 178\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mImportError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Python 3.3 or greater is required\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mQT_VERSION_STR\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;34m'5.9.3'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 180\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mImportError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"PyQt 5.9.3 or greater is required\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 181\u001b[0m \u001b[0m_validate_inputs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabel_column_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 182\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdeep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mImportError\u001b[0m: PyQt 5.9.3 or greater is required" - ] - } - ], + "outputs": [], "source": [ "# Label the data set\n", "# Specify the name for the label column\n", @@ -262,7 +249,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -378,7 +365,7 @@ "4 1988 Joseph Kuan 94122 1982 " ] }, - "execution_count": 8, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -389,7 +376,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -422,7 +409,9 @@ " rtable_name\n", " rtable_zipcode\n", " rtable_birth_year\n", - " label\n", + " gold_label\n", + " comments\n", + " tags\n", " \n", " \n", " \n", @@ -437,7 +426,9 @@ " Mark Levene\n", " 94107\n", " 1987\n", - " Yes\n", + " Not-Matched\n", + " \n", + " \n", " \n", " \n", " 1\n", @@ -450,7 +441,9 @@ " Bill Bridge\n", " 94107\n", " 1986\n", - " Not-Matched\n", + " Yes\n", + " \n", + " \n", " \n", " \n", " 2\n", @@ -463,7 +456,9 @@ " Michael Brodie\n", " 94107\n", " 1987\n", - " Yes\n", + " Not-Labeled\n", + " \n", + " \n", " \n", " \n", " 3\n", @@ -476,7 +471,9 @@ " Mike Franklin\n", " 94122\n", " 1988\n", - " Not-Matched\n", + " Not-Sure\n", + " \n", + " \n", " \n", " \n", " 4\n", @@ -489,7 +486,9 @@ " Joseph Kuan\n", " 94122\n", " 1982\n", - " Yes\n", + " Not-Labeled\n", + " \n", + " \n", " \n", " \n", "\n", @@ -510,15 +509,15 @@ "3 1988 Mike Franklin 94122 1988 \n", "4 1988 Joseph Kuan 94122 1982 \n", "\n", - " label \n", - "0 Yes \n", - "1 Not-Matched \n", - "2 Yes \n", - "3 Not-Matched \n", - "4 Yes " + " gold_label comments tags \n", + "0 Not-Matched \n", + "1 Yes \n", + "2 Not-Labeled \n", + "3 Not-Sure \n", + "4 Not-Labeled " ] }, - "execution_count": 13, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -545,7 +544,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.3" } }, "nbformat": 4, diff --git a/py_labeler/labeler/labeler.py b/py_labeler/labeler/labeler.py index 6ecd7c5..a7e9dfa 100644 --- a/py_labeler/labeler/labeler.py +++ b/py_labeler/labeler/labeler.py @@ -10,7 +10,7 @@ from PyQt5.QtWebChannel import QWebChannel from PyQt5.QtWebEngineWidgets import QWebEngineView, QWebEnginePage, QWebEngineScript from PyQt5.QtWidgets import QApplication - from PyQt5.QtCore import QT_VERSION_STR +# from PyQt5.QtCore import QT_VERSION_STR except ImportError: raise ImportError('PyQt5 is not installed. Please install PyQt5 to use ' 'GUI related functions in py_labeler.') @@ -176,8 +176,8 @@ def label_table(df, label_column_name): """ if sys.version_info < (3, 5): raise ImportError("Python 3.3 or greater is required") - if QT_VERSION_STR < '5.9.3': - raise ImportError("PyQt 5.9.3 or greater is required") + # if QT_VERSION_STR < '5.9.2': + # raise ImportError("PyQt 5.9.2 or greater is required") _validate_inputs(df, label_column_name) df = df.copy(deep=True) From 65f1432e713a8dca6871d42a01b7e2de840750c0 Mon Sep 17 00:00:00 2001 From: pavankm Date: Tue, 2 Jan 2018 21:04:28 -0600 Subject: [PATCH 04/12] [FIX] Canceling save to file gives error --- py_labeler/labeler/view/templates/common_js.html | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/py_labeler/labeler/view/templates/common_js.html b/py_labeler/labeler/view/templates/common_js.html index cae5af2..0495a61 100644 --- a/py_labeler/labeler/view/templates/common_js.html +++ b/py_labeler/labeler/view/templates/common_js.html @@ -5,6 +5,7 @@