diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..8dcafac --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,51 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: 'bug' +assignees: '' + +--- + +### Describe the bug + +A clear and concise description of what the bug is. + +### To Reproduce + +Code snippet: + +``` + +``` + +(Please provide a code snippet! This will help us find and solve the problem faster.) + +Steps to reproduce the behavior: +1. ... +2. ... + +### Expected behavior + +A clear and concise description of what you expected to happen. + +### Actual behavior + +Explain the buggy behavior you experience when you go through the steps above. If applicable, add screenshots to help explain your problem. + +### Is this a regression? + +That is, did this use to work the way you expected in the past? + +- [ ] Yes +- [ ] No + +### Debug info + +- Words'n Fun version: +- Python version: (get it with `$ python --version`) +- OS version: + +### Additional context + +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..f8d4583 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,37 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: 'enhancement' +assignees: '' + + +--- + + + +### Problem + +Is your feature request related to a problem? +Please describe the problem here. Ex. I'm always frustrated when [...] + +### Solution + +Describe the solution you'd like, i.e. a clear and concise description of what you want to happen. +Ideally you want to fulfill the following bullet points: + +- **MVP:** What's the smallest possible solution that would get 80% of the problem out of the way? 
+ +- **Possible additions:** What are other things that could be added to the MVP over time to make it better? + +- **Preferred solution:** If you don't like the MVP above, tell us why, and what you'd like done instead. + +- **Alternative solutions:** Alternative solutions or features you've considered. + + +### Additional context + +Add any other context or screenshots about the feature request here. diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..0b686ea --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,47 @@ + + +## ✒️ Context + +_Please describe the project or issue background here_ + +- What kind of change does this PR introduce? + + - [ ] Bugfix + - [ ] Feature + - [ ] Refactoring + - [ ] Other, please describe: + +## 🧱 Description of Changes + +- _Add bullet points summarizing your changes here_ + + +## 🩺 Testing + +- _Add bullet points summarizing how you tested your changes here_ + + - [ ] This change does not need new tests + - [ ] Added/Updated unit tests + +## 🔗 References + +_Does this depend on other work, documents, or tickets?_ + +- **Issue**: Closes #XXXX + +--- + +**Contribution License Agreement** + +By submitting this pull request you agree that all contributions to this project are made under the GNU AFFERO GENERAL PUBLIC LICENSE. 
diff --git a/.github/workflows/wnf_build_tests.yaml b/.github/workflows/wnf_build_tests.yaml index bed0aff..16e4480 100644 --- a/.github/workflows/wnf_build_tests.yaml +++ b/.github/workflows/wnf_build_tests.yaml @@ -11,7 +11,7 @@ on: - 'LICENSE' - 'Makefile' pull_request: - types: [opened, reopened, edit] + types: [opened, reopened, edit, synchronize] branches: - 'main' - 'release/v*' @@ -21,6 +21,8 @@ on: - '*.md' - 'LICENSE' - 'Makefile' + schedule: + - cron: '0 0 * * 0' workflow_dispatch: jobs: diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..fdf794d --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +oss@pole-emploi.fr. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. 
Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. diff --git a/README.md b/README.md index 1db045d..a68359b 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,11 @@ [![License: AGPL v3](https://img.shields.io/badge/License-AGPL%20v3-blue.svg)](https://www.gnu.org/licenses/agpl-3.0) [![Generic badge](https://img.shields.io/badge/python-3.7|3.8-blue.svg)](https://shields.io/) -# WORDS N FUN : Semantic analysis module built by agence Data Services +# WORDS N FUN : Semantic analysis module + +The purpose of this project is twofold: +1. To normalize the tools and how-tos of semantic analysis projects +2. To offer end-to-end pipelines to shorten the time to market of NLP services --- @@ -12,15 +16,6 @@ This project is distributed under the GNU AFFERO GENERAL PUBLIC LICENSE V3.0. Please check the LICENSE file. ---- - -## Why ? - -The purpose of this project is two folds: -1. To normalize tools and how tos of semantic analysis projects -2. 
To offer end to end pipelines to speed up the time to market of NLP services - - --- ## Philosophy of this package diff --git a/requirements.txt b/requirements.txt index d785c77..a35a0e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,27 +1,29 @@ # Data manipulation -numpy==1.19.5 -pandas==1.3.5 +numpy==1.21.6; python_version < "3.8" +numpy==1.23.2; python_version >= "3.8" +pandas==1.3.5; python_version < "3.8" +pandas==1.4.4; python_version >= "3.8" # NLP -nltk==3.4.5 -ftfy==5.8 +nltk==3.7 +ftfy==6.1.1 # Others -simplejson==3.17.2 -tqdm==4.62.2 -requests==2.23.0 +tqdm==4.64.1 +simplejson==3.17.6 +requests==2.28.1 # Optionnals - code quality & cie -flake8==3.7.9 -black==19.10b0 -isort==4.3.21 +flake8==5.0.4 +black==22.8.0 +isort==5.10.1 nose==1.3.7 nose-exclude==0.5.0 -coverage==5.3 +coverage==6.4.4 # Has to be installed last / optionnal to use spacy lemmatizer markupsafe==2.0.1 # BUG FIX -> https://github.com/aws/aws-sam-cli/issues/3661 Cython==0.29.24 spacy==3.3.1 -# The following line downloads a spacy model. It can be commented if you don't have an internet access to download it, but lemmatizer features won't work. +# The following line downloads a French spacy model. It can be commented out if you don't have internet access to download it, but lemmatizer features won't work. 
https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.3.0/fr_core_news_sm-3.3.0-py3-none-any.whl diff --git a/setup.py b/setup.py index 74eaaec..1472b9f 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ # Get package directory package_directory = os.path.dirname(os.path.abspath(__file__)) -# Get package version (env variable or verion file + -local) +# Get package version (env variable or version file + -local) version_path = os.path.join(package_directory, 'version.txt') with open(version_path, 'r') as version_file: version = version_file.read().strip() @@ -38,22 +38,26 @@ license='AGPL-3.0', long_description=long_description, long_description_content_type='text/markdown', - author='Agence Data Services PE Nantes', + author="Agence Data Services PE Nantes", + author_email="contactadsaiframeworks.00619@pole-emploi.fr", description="Semantic library of the Data Services agency", url="https://github.com/OSS-Pole-Emploi/words_n_fun", platforms=['windows', 'linux'], + python_requires='>=3.7', package_data={ 'words_n_fun': ['configs/*.json', 'nltk_data/corpora/stopwords/french'] }, include_package_data=True, install_requires=[ - 'pandas==1.3.5', - 'numpy==1.19.5', - 'nltk>=3.4.5,<3.6', - 'tqdm==4.62.2', # https://github.com/tqdm/tqdm/issues/780 - 'simplejson>=3.17.0,<3.17.3', - 'requests>=2.23.0,<2.25.1', - 'ftfy>=5.8,<6.0', + 'pandas>=1.3,<1.4; python_version < "3.8"', + 'pandas>=1.3,<1.5; python_version >= "3.8"', + 'numpy>=1.19,<1.22; python_version < "3.8"', + 'numpy>=1.19,<1.24; python_version >= "3.8"', + 'nltk>=3.4,<3.8', + 'ftfy>=5.8,<6.2', + 'tqdm>=4.40,<4.65', + 'simplejson>=3.17,<3.18', + 'requests>=2.23,<2.29', ], extras_require={ "lemmatizer": ["spacy==3.3.1", "markupsafe==2.0.1", "Cython==0.29.24", "fr-core-news-sm==3.3.0"] diff --git a/tests/test_1_utils.py b/tests/test_1_utils.py index 2ef5ae4..e545dec 100644 --- a/tests/test_1_utils.py +++ b/tests/test_1_utils.py @@ -17,25 +17,24 @@ # along with this program. If not, see . 
# -# Libs unittest -import unittest -from unittest.mock import Mock -from unittest.mock import patch - +# Pour ces tests, on garde le logger pour certains tests +# Du coup, default to critical +import logging +import ntpath # Utils libs import os import re -import ntpath + +# Libs unittest +import unittest +from unittest.mock import Mock, patch + import numpy as np import pandas as pd - import words_n_fun as wnf from words_n_fun import utils -# Pour ces tests, on garde le logger pour certains tests -# Du coup, default to critical -import logging logging.disable(logging.CRITICAL) logger = wnf.logger @@ -528,11 +527,11 @@ def test_regroup_data_df(self): # Definition d'une fonction à wrapper def test_function_1(df): if type(df) != pd.DataFrame: raise TypeError('') - df['test1'] = df['test1'].str.replace('toto', 'titi') + df['test1'] = df['test1'].str.replace('toto', 'titi', regex=False) return df def test_function_2(df): if type(df) != pd.DataFrame: raise TypeError('') - df['test3'] = df['test2'].str.replace('toto', 'tata') + df['test3'] = df['test2'].str.replace('toto', 'tata', regex=False) return df # Vals à tester df_test = pd.DataFrame([['toto', 'titi', 'tata'], ['tata', 'toto', 'titi'], ['titi', 'tata', 'toto']]*50000, diff --git a/words_n_fun/preprocessing/basic.py b/words_n_fun/preprocessing/basic.py index 0c1cb4a..22ff090 100644 --- a/words_n_fun/preprocessing/basic.py +++ b/words_n_fun/preprocessing/basic.py @@ -97,7 +97,7 @@ def get_true_spaces(docs: pd.Series) -> pd.Series: pd.Series: Modified documents ''' logger.debug('Calling basic.get_true_spaces') - return docs.str.replace(r'\s', ' ') + return docs.str.replace(r'\s', ' ', regex=True) @utils.data_agnostic @@ -136,7 +136,7 @@ def pe_matching(docs: pd.Series) -> pd.Series: logger.debug('Calling basic.pe_matching') # One can add more rules here regex = utils.get_regex_match_words(['(permis)\s+(b)'], case_insensitive=True, words_as_regex=True) - docs = docs.str.replace(regex, r'\2\3') + docs = 
docs.str.replace(regex, r'\2\3', regex=True) return docs @@ -158,7 +158,7 @@ def remove_punct(docs: pd.Series, del_parenthesis: bool = True, replacement_char regex = r"[^\w\s\(\)\/]|_" else: regex = r"[^\w\s]|_" - return docs.str.replace(regex, replacement_char) + return docs.str.replace(regex, replacement_char, regex=True) @utils.data_agnostic @@ -174,7 +174,7 @@ def trim_string(docs: pd.Series) -> pd.Series: ''' logger.debug('Calling basic.trim_string') # TODO: better way ? - docs = docs.str.replace(r'[\t\f\v ]{2,}', ' ') + docs = docs.str.replace(r'[\t\f\v ]{2,}', ' ', regex=True) docs = remove_leading_and_ending_spaces(docs) return docs @@ -191,8 +191,8 @@ def remove_leading_and_ending_spaces(docs: pd.Series) -> pd.Series: pd.Series: Modified documents ''' logger.debug('Calling basic.remove_leading_and_ending_spaces') - docs = docs.str.replace(r'^(\s)+', '') - return docs.str.replace(r'(\s)+$', '') + docs = docs.str.replace(r'^(\s)+', '', regex=True) + return docs.str.replace(r'(\s)+$', '', regex=True) @utils.data_agnostic @@ -208,7 +208,7 @@ def remove_numeric(docs: pd.Series, replacement_char: str = ' ') -> pd.Series: pd.Series: Modified documents ''' logger.debug('Calling basic.remove_numeric') - return docs.str.replace(r'([0-9]+)', replacement_char) + return docs.str.replace(r'([0-9]+)', replacement_char, regex=True) @utils.data_agnostic @@ -331,7 +331,7 @@ def deal_with_specific_characters(docs: pd.Series) -> pd.Series: pd.Series: Modified documents ''' logger.debug('Calling basic.deal_with_specific_characters') - return docs.str.replace(r"(\s)?([',.;:])(\s)?", r' \2 ') + return docs.str.replace(r"(\s)?([',.;:])(\s)?", r' \2 ', regex=True) @utils.data_agnostic @@ -351,9 +351,9 @@ def replace_urls(docs: pd.Series, replacement_char: str = ' ', replace_with_doma # based on : https://stackoverflow.com/questions/6038061/regular-expression-to-find-urls-within-a-string regex = r'(?i)(? 
Lemmatizes text +# Get logger +import logging import sys + import pandas as pd from words_n_fun import utils -# Get logger -import logging logger = logging.getLogger(__name__) @@ -76,8 +77,8 @@ def lemmatize(docs: pd.Series) -> pd.Series: docs = ( pd.Series(docs) .str.lower() - .str.replace('\W', ' ') - .str.replace(r"([0-9]+(\.[0-9]+)?)", r" \1 ") + .str.replace('\W', ' ', regex=True) + .str.replace(r"([0-9]+(\.[0-9]+)?)", r" \1 ", regex=True) .str.replace('\s+', ' ', regex=True) .str.strip() ) diff --git a/words_n_fun/preprocessing/stopwords.py b/words_n_fun/preprocessing/stopwords.py index eaf6295..1be4bc9 100644 --- a/words_n_fun/preprocessing/stopwords.py +++ b/words_n_fun/preprocessing/stopwords.py @@ -136,7 +136,15 @@ def remove_stopwords(docs: pd.Series, opt: str = 'all', set_to_add: Union[list, if set_to_remove is None: set_to_remove = [] # Check if everything is in lowercase (NaNs are replaced, letters are kept) - if docs.fillna('').str.replace(r"[^A-Za-z]", '').replace('', 'placeholder').str.islower().sum() != docs.shape[0]: + if ( + docs + .fillna('') + .str.replace(r"[^A-Za-z]", '', regex=True) + .replace('', 'placeholder', regex=True) + .str.islower() + .sum() + != docs.shape[0] + ): logger.warning(docs) logger.warning('Some characters appear to be in uppercase, stopwords are in lowercase only.') # Common soptwords lists @@ -168,7 +176,7 @@ def remove_stopwords(docs: pd.Series, opt: str = 'all', set_to_add: Union[list, return docs.apply(lambda x: x if isinstance(x, str) else None) regex = utils.get_regex_match_words(stopwords_list) - return docs.str.replace(regex, '') + return docs.str.replace(regex, '', regex=True) def stopwords_ascii() -> list: diff --git a/words_n_fun/preprocessing/synonym_malefemale_replacement.py b/words_n_fun/preprocessing/synonym_malefemale_replacement.py index 0f1da5d..b1a174a 100644 --- a/words_n_fun/preprocessing/synonym_malefemale_replacement.py +++ b/words_n_fun/preprocessing/synonym_malefemale_replacement.py @@ -70,9 
+70,9 @@ def remove_gender_synonyms(docs: pd.Series) -> pd.Series: logger.debug('Calling synonym_malefemale_replacement.getSynonyms') # Preprocessing - docs = docs.str.replace('(\s*)/(\s*)', '/') # Removes whitespaces around "/" - docs = docs.str.replace('(\s*)\((\s*)', '(') # Removes potential whitespaces before "(" - docs = docs.str.replace('\)', ') ') # Add a space after ")" + docs = docs.str.replace('(\s*)/(\s*)', '/', regex=True) # Removes whitespaces around "/" + docs = docs.str.replace('(\s*)\((\s*)', '(', regex=True) # Removes potential whitespaces before "(" + docs = docs.str.replace('\)', ') ', regex=True) # Add a space after ")" # Set match paterns parenthesis_pattern = r"([\w\-]+)\(([\w\-]+)\)()" # Case : serveur(se)