38
38
# - fix_text -> Fixes numerous inconsistencies within a text (via ftfy)
39
39
40
40
41
- import ftfy
42
41
import logging
43
42
import unicodedata
43
+ from typing import List , Union
44
+
45
+ import ftfy
46
+ import numpy as np
44
47
import pandas as pd
45
- from typing import Union , List
46
48
from nltk .stem .snowball import FrenchStemmer
47
49
48
- from words_n_fun import utils
49
50
from words_n_fun import CustomTqdm as tqdm
50
- from words_n_fun . preprocessing import stopwords
51
- from words_n_fun .preprocessing import lemmatizer
52
- from words_n_fun . preprocessing import synonym_malefemale_replacement
51
+ from words_n_fun import utils
52
+ from words_n_fun .preprocessing import ( lemmatizer , stopwords ,
53
+ synonym_malefemale_replacement )
53
54
54
55
tqdm .pandas ()
55
56
@@ -313,7 +314,6 @@ def remove_numeric(docs: pd.Series, replacement_char: str = ' ') -> pd.Series:
313
314
logger .debug ('Calling basic.remove_numeric' )
314
315
return impl_remove_numeric (docs , replacement_char )
315
316
316
- @utils .regroup_data_series
317
317
def impl_remove_stopwords (docs : pd .Series , opt : str = 'all' , set_to_add : Union [list , None ] = None ,
318
318
set_to_remove : Union [list , None ] = None ) -> pd .Series :
319
319
'''Removes stopwords
@@ -327,12 +327,13 @@ def impl_remove_stopwords(docs: pd.Series, opt: str = 'all', set_to_add: Union[l
327
327
Returns:
328
328
pd.Series: Modified documents
329
329
'''
330
+ # stopwords.remove_stopwords already applies the data_agnostic and regroup_data_series wrappers
330
331
return stopwords .remove_stopwords (docs , opt = opt , set_to_add = set_to_add , set_to_remove = set_to_remove )
331
332
332
333
333
- @ utils . data_agnostic
334
- def remove_stopwords (docs : pd .Series , opt : str = 'all' , set_to_add : Union [list , None ] = None ,
335
- set_to_remove : Union [list , None ] = None ) -> pd .Series :
334
+ # No decorator needed here: the underlying stopwords.remove_stopwords is already wrapped
335
+ def remove_stopwords (docs : Union [ str , list , np . ndarray , pd .Series , pd . DataFrame ] , opt : str = 'all' , set_to_add : Union [list , None ] = None ,
336
+ set_to_remove : Union [list , None ] = None ) -> Union [ str , list , np . ndarray , pd .Series , pd . DataFrame ] :
336
337
'''Removes stopwords
337
338
338
339
Args:
@@ -379,7 +380,6 @@ def remove_accents(docs: pd.Series, use_tqdm: bool = False) -> pd.Series:
379
380
'''
380
381
return impl_remove_accents (docs , use_tqdm )
381
382
382
- @utils .regroup_data_series
383
383
def impl_remove_gender_synonyms (docs : pd .Series ) -> pd .Series :
384
384
'''[French] Removes gendered synonyms
385
385
# Find occurrences such as "male version / female version" (eg: Coiffeur / Coiffeuse)
@@ -391,11 +391,12 @@ def impl_remove_gender_synonyms(docs: pd.Series) -> pd.Series:
391
391
Returns:
392
392
pd.Series: Modified documents
393
393
'''
394
+ # synonym_malefemale_replacement.remove_gender_synonyms uses data_agnostic and regroup_data_series wrappers already
394
395
return synonym_malefemale_replacement .remove_gender_synonyms (docs )
395
396
396
397
397
- @ utils . data_agnostic
398
- def remove_gender_synonyms (docs : pd .Series ) -> pd .Series :
398
+ # No decorator needed here: synonym_malefemale_replacement.remove_gender_synonyms is already wrapped
399
+ def remove_gender_synonyms (docs : Union [ str , list , np . ndarray , pd .Series , pd . DataFrame ] ) -> Union [ str , list , np . ndarray , pd .Series , pd . DataFrame ] :
399
400
'''[French] Removes gendered synonyms
400
401
# Find occurrences such as "male version / female version" (eg: Coiffeur / Coiffeuse)
401
402
# By convention, the male version is kept (in accordance with the lemmatizer)
@@ -409,7 +410,7 @@ def remove_gender_synonyms(docs: pd.Series) -> pd.Series:
409
410
logger .debug ('Calling basic.remove_gender_synonyms' )
410
411
return impl_remove_gender_synonyms (docs )
411
412
412
- @ utils . regroup_data_series
413
+ # lemmatizer.lemmatize is already wrapped, so no decorator is needed here
413
414
def impl_lemmatize (docs : pd .Series ) -> pd .Series :
414
415
'''Lemmatizes the documents
415
416
Appel à une API externe
@@ -424,9 +425,8 @@ def impl_lemmatize(docs: pd.Series) -> pd.Series:
424
425
# Process
425
426
return lemmatizer .lemmatize (docs )
426
427
427
-
428
- @utils .data_agnostic
429
- def lemmatize (docs : pd .Series ) -> pd .Series :
428
+ # lemmatizer.lemmatize is already wrapped, so no decorator is needed here
429
+ def lemmatize (docs : Union [str , list , np .ndarray , pd .Series , pd .DataFrame ]) -> Union [str , list , np .ndarray , pd .Series , pd .DataFrame ]:
430
430
'''Lemmatizes the documents
431
431
Appel à une API externe
432
432
0 commit comments