Commit 17fd1aa

Optimization of regroup_data_series (#21)
* regroup_data_series: avoid double unique, add percentage
* add test with not enough duplicates
* remove wrappers when calling a wrapped function
* clean errors in merge
1 parent 2443c79 commit 17fd1aa
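
In practice, the new max_percent_unique keyword controls when the grouping optimization kicks in: duplicates are only regrouped if the share of unique values in the Series does not exceed that threshold. Below is a minimal usage sketch based on the new test further down; keep_test_word is an illustrative stand-in for the test helper and is not part of the commit:

import pandas as pd
from words_n_fun import utils

# Illustrative processing function: collapse any document containing "test" to the word "test"
def keep_test_word(docs: pd.Series) -> pd.Series:
    return docs.apply(lambda doc: "test" if "test" in str(doc) else doc)

docs = pd.Series(['avant'] + [f"ceci est un test {i}" for i in range(10)] + ['milieu'] * 2 + ['après'], name='test')

# 13 of the 14 rows are unique (~93% > 50%), so the wrapper skips the regrouping
# and simply calls keep_test_word on the raw Series
processed = utils.regroup_data_series(keep_test_word, min_nb_data=1, max_percent_unique=0.5)(docs)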

3 files changed: +34 -22 lines

tests/test_1_utils.py

+7
@@ -511,6 +511,8 @@ def test_function(docs):
 docs_results = pd.Series(['avant'] + ["test"] * 5000 + ['milieu'] + ["test"] * 5000 + ['après'], name='test')
 data_no_duplicates = pd.Series(['avant'] + ["ceci est un test"] + ['milieu'] + ['après'], name='test')
 data_no_duplicates_results = pd.Series(['avant'] + ["test"] + ['milieu'] + ['après'], name='test')
+data_not_enough_duplicates = pd.Series(['avant'] + [f"ceci est un test {i}" for i in range(10)] + ['milieu']*2 + ['après'], name='test')
+data_not_enough_duplicates_results = pd.Series(['avant'] + ["test" for i in range(10)] + ['milieu'] * 2 + ['après'], name='test')
 
 
 # Check the standard behaviour
@@ -521,6 +523,11 @@ def test_function(docs):
 pd.testing.assert_series_equal(docs_test, docs_test_copy)
 # Check the behaviour when there are no duplicates
 pd.testing.assert_series_equal(utils.regroup_data_series(test_function, min_nb_data=1)(data_no_duplicates), data_no_duplicates_results)
+# Check the behaviour when there are not enough duplicates
+pd.testing.assert_series_equal(utils.regroup_data_series(test_function, min_nb_data=1, max_percent_unique=0.5)(data_not_enough_duplicates), data_not_enough_duplicates_results)
+
+
+
 
 
 def test_regroup_data_df(self):

words_n_fun/preprocessing/basic.py

+17 -17
@@ -38,18 +38,19 @@
 # - fix_text -> Fixes numerous inconsistencies within a text (via ftfy)
 
 
-import ftfy
 import logging
 import unicodedata
+from typing import List, Union
+
+import ftfy
+import numpy as np
 import pandas as pd
-from typing import Union, List
 from nltk.stem.snowball import FrenchStemmer
 
-from words_n_fun import utils
 from words_n_fun import CustomTqdm as tqdm
-from words_n_fun.preprocessing import stopwords
-from words_n_fun.preprocessing import lemmatizer
-from words_n_fun.preprocessing import synonym_malefemale_replacement
+from words_n_fun import utils
+from words_n_fun.preprocessing import (lemmatizer, stopwords,
+synonym_malefemale_replacement)
 
 tqdm.pandas()
 
@@ -313,7 +314,6 @@ def remove_numeric(docs: pd.Series, replacement_char: str = ' ') -> pd.Series:
 logger.debug('Calling basic.remove_numeric')
 return impl_remove_numeric(docs, replacement_char)
 
-@utils.regroup_data_series
 def impl_remove_stopwords(docs: pd.Series, opt: str = 'all', set_to_add: Union[list, None] = None,
 set_to_remove: Union[list, None] = None) -> pd.Series:
 '''Removes stopwords
@@ -327,12 +327,13 @@ def impl_remove_stopwords(docs: pd.Series, opt: str = 'all', set_to_add: Union[l
 Returns:
 pd.Series: Modified documents
 '''
+# stopwords.remove_stopwords already uses the data_agnostic and regroup_data_series wrappers
 return stopwords.remove_stopwords(docs, opt=opt, set_to_add=set_to_add, set_to_remove=set_to_remove)
 
 
-@utils.data_agnostic
-def remove_stopwords(docs: pd.Series, opt: str = 'all', set_to_add: Union[list, None] = None,
-set_to_remove: Union[list, None] = None) -> pd.Series:
+# the called function already applies the wrappers
+def remove_stopwords(docs: Union[str, list, np.ndarray, pd.Series, pd.DataFrame], opt: str = 'all', set_to_add: Union[list, None] = None,
+set_to_remove: Union[list, None] = None) -> Union[str, list, np.ndarray, pd.Series, pd.DataFrame]:
 '''Removes stopwords
 
 Args:
@@ -379,7 +380,6 @@ def remove_accents(docs: pd.Series, use_tqdm: bool = False) -> pd.Series:
 '''
 return impl_remove_accents(docs, use_tqdm)
 
-@utils.regroup_data_series
 def impl_remove_gender_synonyms(docs: pd.Series) -> pd.Series:
 '''[French] Removes gendered synonyms
 # Find occurrences such as "male version / female version" (eg: Coiffeur / Coiffeuse)
@@ -391,11 +391,12 @@ def impl_remove_gender_synonyms(docs: pd.Series) -> pd.Series:
 Returns:
 pd.Series: Modified documents
 '''
+# synonym_malefemale_replacement.remove_gender_synonyms already uses the data_agnostic and regroup_data_series wrappers
 return synonym_malefemale_replacement.remove_gender_synonyms(docs)
 
 
-@utils.data_agnostic
-def remove_gender_synonyms(docs: pd.Series) -> pd.Series:
+# the wrappers are applied by the called function
+def remove_gender_synonyms(docs: Union[str, list, np.ndarray, pd.Series, pd.DataFrame]) -> Union[str, list, np.ndarray, pd.Series, pd.DataFrame]:
 '''[French] Removes gendered synonyms
 # Find occurrences such as "male version / female version" (eg: Coiffeur / Coiffeuse)
 # By convention, the male version is kept (in accordance with the lemmatizer)
@@ -409,7 +410,7 @@ def remove_gender_synonyms(docs: pd.Series) -> pd.Series:
 logger.debug('Calling basic.remove_gender_synonyms')
 return impl_remove_gender_synonyms(docs)
 
-@utils.regroup_data_series
+# lemmatizer.lemmatize already applies the wrappers
 def impl_lemmatize(docs: pd.Series) -> pd.Series:
 '''Lemmatizes the documents
 Calls an external API
@@ -424,9 +425,8 @@ def impl_lemmatize(docs: pd.Series) -> pd.Series:
 # Process
 return lemmatizer.lemmatize(docs)
 
-
-@utils.data_agnostic
-def lemmatize(docs: pd.Series) -> pd.Series:
+# lemmatizer.lemmatize already applies the wrappers
+def lemmatize(docs: Union[str, list, np.ndarray, pd.Series, pd.DataFrame]) -> Union[str, list, np.ndarray, pd.Series, pd.DataFrame]:
 '''Lemmatizes the documents
 Calls an external API
 
words_n_fun/utils.py

+10 -5
@@ -656,7 +656,7 @@ def get_column_to_be_processed(docs: Union[str, list, np.ndarray, pd.Series, pd.
 return prefered_column
 
 
-def regroup_data_series(function: Callable, min_nb_data: int = 1000, prefix_text: Union[str, None] = None) -> Callable:
+def regroup_data_series(function: Callable, min_nb_data: int = 1000, prefix_text: Union[str, None] = None, max_percent_unique: float = 0.9) -> Callable:
 '''Wrapper to regroup identical data of a pd.Series before being processed
 Can be used as a decorator
 
@@ -665,6 +665,8 @@ def regroup_data_series(function: Callable, min_nb_data: int = 1000, prefix_text
 Kwargs:
 min_nb_data (int): Minimum number of rows within the document required to apply this wrapper (default : 1000)
 prefix_text (str): Prefix to add
+max_percent_unique (float): Maximum fraction [0-1] of unique values for the reduction to be applied (default: 0.9)
+For very quick functions, max_percent_unique should be low to get a real speed-up
 Returns:
 function: Decorated function
 '''
@@ -678,7 +680,7 @@ def regroup_data_series(function: Callable, min_nb_data: int = 1000, prefix_text
 
 # Set wrapper
 @wraps(function)
-def wrapper(docs: Union[str, list, np.ndarray, pd.Series, pd.DataFrame], *args, **kwargs) -> pd.Series:
+def wrapper(docs: pd.Series, *args, **kwargs) -> pd.Series:
 '''Wrapper
 
 Args:
@@ -692,16 +694,19 @@ def wrapper(docs: Union[str, list, np.ndarray, pd.Series, pd.DataFrame], *args,
 # If there is not enough data, the wrapper is discarded and the function returned as is
 if init_len < min_nb_data:
 return function(docs, *args, **kwargs)
-# If there is no duplicates in the data, the wrapper is discarded as well
-elif len(docs.unique()) == init_len:
+
+# If there are not enough duplicates in the data, the wrapper is discarded as well
+unique_docs = docs.unique()
+if (len(unique_docs) / init_len) > max_percent_unique:
 return function(docs, *args, **kwargs)
+
 init_name = docs.name
 init_index = docs.index
 # Put docs into a dataframe
 df = pd.DataFrame(docs)
 df.columns = ["input_data"]
 # Regroup same values together
-input_data = df["input_data"].dropna().drop_duplicates()
+input_data = pd.Series(unique_docs).dropna()
 logger.debug(f"{prefix_text} Reduced data to be processed by {100 * (df.shape[0] - len(input_data)) / df.shape[0]} % (grouped duplicated rows)")
 # Get output
 output_data = function(input_data, *args, **kwargs)

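For reference, the wrapper's new control flow boils down to the standalone sketch below. It is a simplification of the hunks above: the helper name apply_with_grouping is illustrative, and the results are mapped back by value here, whereas the actual wrapper restores the original name and index and merges the results back through the intermediate DataFrame.

import pandas as pd

def apply_with_grouping(function, docs: pd.Series, min_nb_data: int = 1000, max_percent_unique: float = 0.9) -> pd.Series:
    # Too few rows: grouping is not worth it, call the function directly
    if len(docs) < min_nb_data:
        return function(docs)
    # unique() is now computed once and reused, instead of once for the check and once for the regrouping
    unique_docs = docs.unique()
    # Too few duplicates: grouping would not save much work either
    if (len(unique_docs) / len(docs)) > max_percent_unique:
        return function(docs)
    # Process each distinct value once, then broadcast the results back onto the full Series
    unique_input = pd.Series(unique_docs).dropna()
    results = pd.Series(function(unique_input).values, index=unique_input.values)
    return docs.map(results).rename(docs.name)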