From 5902d7bb679cccac23055e83d02e7f46b20e3a90 Mon Sep 17 00:00:00 2001 From: abbeyyyy Date: Sat, 9 Apr 2022 16:50:25 -0400 Subject: [PATCH 01/11] add abbreviation replacement data augmentation op and test --- .../algorithms/abbreviation_replacement_op.py | 101 ++++++++++++++++++ .../abbreviation_replacement_op_test.py | 52 +++++++++ 2 files changed, 153 insertions(+) create mode 100644 forte/processors/data_augment/algorithms/abbreviation_replacement_op.py create mode 100644 tests/forte/processors/data_augment/algorithms/abbreviation_replacement_op_test.py diff --git a/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py new file mode 100644 index 000000000..73d2fa98b --- /dev/null +++ b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py @@ -0,0 +1,101 @@ +# Copyright 2020 The Forte Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import random +import json +from typing import Tuple, Union, Dict, Any + +import requests +from forte.data.ontology import Annotation +from forte.processors.data_augment.algorithms.single_annotation_op import ( + SingleAnnotationAugmentOp, +) +from forte.common.configuration import Config + +__all__ = [ + "AbbreviationReplacementOp", +] + + +class AbbreviationReplacementOp(SingleAnnotationAugmentOp): + r""" + This class is a replacement op utilizing a pre-defined + abbreviation to replace words. + + Args: + configs: + - prob (float): The probability of replacement, + should fall in [0, 1]. + - dict_path (str): the `url` or the path to the pre-defined + abbreviation json file. The key is a word / phrase we want to replace. + The value is an abbreviated word of the corresponding key. + """ + + def __init__(self, configs: Union[Config, Dict[str, Any]]): + super().__init__(configs) + if "dict_path" in configs.keys(): + self.dict_path = configs["dict_path"] + else: + self.dict_path = ( + "https://raw.githubusercontent.com/GEM-benchmark/NL-Augmenter/main/transformations" + "/abbreviation_transformation/phrase_abbrev_dict.json" + ) + + try: + r = requests.get(self.dict_path) + self.data = json.loads(r.text) + except requests.exceptions.RequestException: + with open(self.dict_path, encoding="utf8") as json_file: + self.data = json.load(json_file) + + def single_annotation_augment( + self, input_anno: Annotation + ) -> Tuple[bool, str]: + r""" + This function replaces a word from an abbreviation dictionary. + + Args: + input_anno (Annotation): The input annotation. + Returns: + A tuple, where the first element is a boolean value indicating + whether the replacement happens, and the second element is the + replaced string. + """ + # If the replacement does not happen, return False. + if random.random() > self.configs.prob: + return False, input_anno.text + if input_anno.text in self.data.keys(): + result: str = self.data[input_anno.text] + return True, result + else: + return False, input_anno.text + + @classmethod + def default_configs(cls) -> Dict[str, Any]: + r""" + Returns: + A dictionary with the default config for this processor. + Following are the keys for this dictionary: + - prob (float): The probability of replacement, + should fall in [0, 1]. Default value is 0.1 + - dict_path (str): the `url` or the path to the pre-defined + abbreviation json file. The key is a word / phrase we want to replace. + The value is an abbreviated word of the corresponding key. + """ + return { + "dict_path": "https://raw.githubusercontent.com/GEM-benchmark/NL-Augmenter/main/transformations" + + "/abbreviation_transformation/phrase_abbrev_dict.json", + "prob": 0.5, + } diff --git a/tests/forte/processors/data_augment/algorithms/abbreviation_replacement_op_test.py b/tests/forte/processors/data_augment/algorithms/abbreviation_replacement_op_test.py new file mode 100644 index 000000000..751056520 --- /dev/null +++ b/tests/forte/processors/data_augment/algorithms/abbreviation_replacement_op_test.py @@ -0,0 +1,52 @@ +# Copyright 2020 The Forte Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for dictionary word replacement op. +""" + +import unittest +from forte.data.data_pack import DataPack +from ft.onto.base_ontology import Token +from forte.processors.data_augment.algorithms.abbreviation_replacement_op import ( + AbbreviationReplacementOp, +) + + +class TestAbbreviationReplacementOp(unittest.TestCase): + def setUp(self): + self.abre = AbbreviationReplacementOp( + configs={ + "prob": 1.0, + } + ) + + def test_replace(self): + data_pack = DataPack() + text = "see you later" + data_pack.set_text(text) + token = Token(data_pack, 0, len(text)) + data_pack.add_entry(token) + + augmented_data_pack = self.abre.perform_augmentation(data_pack) + + augmented_token = list(augmented_data_pack.get('ft.onto.base_ontology.Token'))[0] + + self.assertIn( + augmented_token.text, + ["syl8r", "cul83r", "cul8r"], + ) + + +if __name__ == "__main__": + unittest.main() From aa98d53feb0687e8796d96dc484015563a694de8 Mon Sep 17 00:00:00 2001 From: abbeyyyy Date: Mon, 11 Apr 2022 10:18:06 -0400 Subject: [PATCH 02/11] black reformatting --- .../algorithms/abbreviation_replacement_op.py | 18 +++++++++--------- .../abbreviation_replacement_op_test.py | 4 +++- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py index 73d2fa98b..49cdc99e1 100644 --- a/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py +++ b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py @@ -85,15 +85,15 @@ def single_annotation_augment( @classmethod def default_configs(cls) -> Dict[str, Any]: r""" - Returns: - A dictionary with the default config for this processor. - Following are the keys for this dictionary: - - prob (float): The probability of replacement, - should fall in [0, 1]. Default value is 0.1 - - dict_path (str): the `url` or the path to the pre-defined - abbreviation json file. The key is a word / phrase we want to replace. - The value is an abbreviated word of the corresponding key. - """ + Returns: + A dictionary with the default config for this processor. + Following are the keys for this dictionary: + - prob (float): The probability of replacement, + should fall in [0, 1]. Default value is 0.1 + - dict_path (str): the `url` or the path to the pre-defined + abbreviation json file. The key is a word / phrase we want to replace. + The value is an abbreviated word of the corresponding key. + """ return { "dict_path": "https://raw.githubusercontent.com/GEM-benchmark/NL-Augmenter/main/transformations" + "/abbreviation_transformation/phrase_abbrev_dict.json", diff --git a/tests/forte/processors/data_augment/algorithms/abbreviation_replacement_op_test.py b/tests/forte/processors/data_augment/algorithms/abbreviation_replacement_op_test.py index 751056520..0bc8a612a 100644 --- a/tests/forte/processors/data_augment/algorithms/abbreviation_replacement_op_test.py +++ b/tests/forte/processors/data_augment/algorithms/abbreviation_replacement_op_test.py @@ -40,7 +40,9 @@ def test_replace(self): augmented_data_pack = self.abre.perform_augmentation(data_pack) - augmented_token = list(augmented_data_pack.get('ft.onto.base_ontology.Token'))[0] + augmented_token = list( + augmented_data_pack.get("ft.onto.base_ontology.Token") + )[0] self.assertIn( augmented_token.text, From a3df17de14c1905c6da64b1fea0702059e22f8d3 Mon Sep 17 00:00:00 2001 From: abbeyyyy Date: Mon, 11 Apr 2022 10:41:05 -0400 Subject: [PATCH 03/11] reformatting string line length --- .../algorithms/abbreviation_replacement_op.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py index 49cdc99e1..e9bbc93eb 100644 --- a/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py +++ b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py @@ -49,8 +49,9 @@ def __init__(self, configs: Union[Config, Dict[str, Any]]): self.dict_path = configs["dict_path"] else: self.dict_path = ( - "https://raw.githubusercontent.com/GEM-benchmark/NL-Augmenter/main/transformations" - "/abbreviation_transformation/phrase_abbrev_dict.json" + "https://raw.githubusercontent.com/GEM-benchmark/NL-Augmenter/" + + "main/transformations/abbreviation_transformation/" + + "phrase_abbrev_dict.json" ) try: @@ -91,11 +92,13 @@ def default_configs(cls) -> Dict[str, Any]: - prob (float): The probability of replacement, should fall in [0, 1]. Default value is 0.1 - dict_path (str): the `url` or the path to the pre-defined - abbreviation json file. The key is a word / phrase we want to replace. - The value is an abbreviated word of the corresponding key. + abbreviation json file. The key is a word / phrase we want + to replace. The value is an abbreviated word of the + corresponding key. """ return { - "dict_path": "https://raw.githubusercontent.com/GEM-benchmark/NL-Augmenter/main/transformations" - + "/abbreviation_transformation/phrase_abbrev_dict.json", + "dict_path": "https://raw.githubusercontent.com/GEM-benchmark/" + + "NL-Augmenter/main/transformations/" + + "abbreviation_transformation/phrase_abbrev_dict.json", "prob": 0.5, } From f7795491485ce5a05f5de0856acfa6dcba45a486 Mon Sep 17 00:00:00 2001 From: abbeyyyy Date: Mon, 11 Apr 2022 19:34:52 -0400 Subject: [PATCH 04/11] fix docstring --- .../algorithms/abbreviation_replacement_op.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py index e9bbc93eb..d53537184 100644 --- a/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py +++ b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py @@ -36,9 +36,9 @@ class AbbreviationReplacementOp(SingleAnnotationAugmentOp): Args: configs: - - prob (float): The probability of replacement, + - prob: The probability of replacement, should fall in [0, 1]. - - dict_path (str): the `url` or the path to the pre-defined + - dict_path: the `url` or the path to the pre-defined abbreviation json file. The key is a word / phrase we want to replace. The value is an abbreviated word of the corresponding key. """ @@ -68,7 +68,7 @@ def single_annotation_augment( This function replaces a word from an abbreviation dictionary. Args: - input_anno (Annotation): The input annotation. + input_anno: The input annotation. Returns: A tuple, where the first element is a boolean value indicating whether the replacement happens, and the second element is the @@ -89,9 +89,9 @@ def default_configs(cls) -> Dict[str, Any]: Returns: A dictionary with the default config for this processor. Following are the keys for this dictionary: - - prob (float): The probability of replacement, + - prob: The probability of replacement, should fall in [0, 1]. Default value is 0.1 - - dict_path (str): the `url` or the path to the pre-defined + - dict_path: the `url` or the path to the pre-defined abbreviation json file. The key is a word / phrase we want to replace. The value is an abbreviated word of the corresponding key. From 247ef590ec47895f77a6d45d1cd448d4786900dc Mon Sep 17 00:00:00 2001 From: abbeyyyy <43121769+abbeyyyy@users.noreply.github.com> Date: Wed, 27 Apr 2022 00:09:33 -0400 Subject: [PATCH 05/11] fix documentation / changed the replaced annotation to phrase --- .../algorithms/abbreviation_replacement_op.py | 59 ++++++++++--------- .../abbreviation_replacement_op_test.py | 19 +++--- 2 files changed, 41 insertions(+), 37 deletions(-) diff --git a/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py index d53537184..6523963a1 100644 --- a/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py +++ b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py @@ -1,4 +1,4 @@ -# Copyright 2020 The Forte Authors. All Rights Reserved. +# Copyright 2022 The Forte Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,10 +15,10 @@ import random import json -from typing import Tuple, Union, Dict, Any +from typing import Tuple, Dict, Any import requests -from forte.data.ontology import Annotation +from ft.onto.base_ontology import Phrase from forte.processors.data_augment.algorithms.single_annotation_op import ( SingleAnnotationAugmentOp, ) @@ -32,7 +32,8 @@ class AbbreviationReplacementOp(SingleAnnotationAugmentOp): r""" This class is a replacement op utilizing a pre-defined - abbreviation to replace words. + abbreviation to replace words, to replace the input phrase + with an abbreviation. Args: configs: @@ -43,32 +44,29 @@ class AbbreviationReplacementOp(SingleAnnotationAugmentOp): The value is an abbreviated word of the corresponding key. """ - def __init__(self, configs: Union[Config, Dict[str, Any]]): + def __init__(self, configs: Config): super().__init__(configs) - if "dict_path" in configs.keys(): - self.dict_path = configs["dict_path"] - else: - self.dict_path = ( - "https://raw.githubusercontent.com/GEM-benchmark/NL-Augmenter/" - + "main/transformations/abbreviation_transformation/" - + "phrase_abbrev_dict.json" - ) + + dict_path = configs["dict_path"] try: - r = requests.get(self.dict_path) - self.data = json.loads(r.text) + r = requests.get(dict_path) + self.data = r.json() except requests.exceptions.RequestException: - with open(self.dict_path, encoding="utf8") as json_file: + with open(dict_path, encoding="utf8") as json_file: self.data = json.load(json_file) def single_annotation_augment( - self, input_anno: Annotation + self, input_phrase: Phrase ) -> Tuple[bool, str]: r""" - This function replaces a word from an abbreviation dictionary. + This function replaces a phrase from an abbreviation dictionary + with `prob` as the probability of replacement. + If the input phrase does not have a corresponding phrase in the + dictionary, no replacement will happen, return False. Args: - input_anno: The input annotation. + input_phrase: The input phrase. Returns: A tuple, where the first element is a boolean value indicating whether the replacement happens, and the second element is the @@ -76,12 +74,12 @@ def single_annotation_augment( """ # If the replacement does not happen, return False. if random.random() > self.configs.prob: - return False, input_anno.text - if input_anno.text in self.data.keys(): - result: str = self.data[input_anno.text] + return False, input_phrase.text + if input_phrase.text in self.data.keys(): + result: str = self.data[input_phrase.text] return True, result else: - return False, input_anno.text + return False, input_phrase.text @classmethod def default_configs(cls) -> Dict[str, Any]: @@ -90,15 +88,20 @@ def default_configs(cls) -> Dict[str, Any]: A dictionary with the default config for this processor. Following are the keys for this dictionary: - prob: The probability of replacement, - should fall in [0, 1]. Default value is 0.1 + should fall in [0, 1]. Default value is 0.5. - dict_path: the `url` or the path to the pre-defined abbreviation json file. The key is a word / phrase we want to replace. The value is an abbreviated word of the - corresponding key. + corresponding key. Default dictionary is from a web-scraped + slang dictionary ("https://github.com/abbeyyyy/JsonFiles/ + blob/main/abbreviate.json"). """ return { - "dict_path": "https://raw.githubusercontent.com/GEM-benchmark/" - + "NL-Augmenter/main/transformations/" - + "abbreviation_transformation/phrase_abbrev_dict.json", + "augment_entry": "ft.onto.base_ontology.Phrase", + "other_entry_policy": { + "ft.onto.base_ontology.Phrase": "auto_align", + }, + "dict_path": "https://raw.githubusercontent.com/abbeyyyy/" + "JsonFiles/main/abbreviate.json", "prob": 0.5, } diff --git a/tests/forte/processors/data_augment/algorithms/abbreviation_replacement_op_test.py b/tests/forte/processors/data_augment/algorithms/abbreviation_replacement_op_test.py index 0bc8a612a..a0e84f8e7 100644 --- a/tests/forte/processors/data_augment/algorithms/abbreviation_replacement_op_test.py +++ b/tests/forte/processors/data_augment/algorithms/abbreviation_replacement_op_test.py @@ -1,4 +1,4 @@ -# Copyright 2020 The Forte Authors. All Rights Reserved. +# Copyright 2022 The Forte Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ import unittest from forte.data.data_pack import DataPack -from ft.onto.base_ontology import Token +from ft.onto.base_ontology import Phrase from forte.processors.data_augment.algorithms.abbreviation_replacement_op import ( AbbreviationReplacementOp, ) @@ -27,25 +27,26 @@ class TestAbbreviationReplacementOp(unittest.TestCase): def setUp(self): self.abre = AbbreviationReplacementOp( configs={ + "dict_path": "https://raw.githubusercontent.com/abbeyyyy/" + "JsonFiles/main/abbreviate.json", "prob": 1.0, } ) def test_replace(self): data_pack = DataPack() - text = "see you later" + text = "I will see you later!" data_pack.set_text(text) - token = Token(data_pack, 0, len(text)) - data_pack.add_entry(token) + phrase = Phrase(data_pack, 7, len(text) - 1) + data_pack.add_entry(phrase) augmented_data_pack = self.abre.perform_augmentation(data_pack) - - augmented_token = list( - augmented_data_pack.get("ft.onto.base_ontology.Token") + augmented_phrase = list( + augmented_data_pack.get("ft.onto.base_ontology.Phrase") )[0] self.assertIn( - augmented_token.text, + augmented_phrase.text, ["syl8r", "cul83r", "cul8r"], ) From cc471abc980614f42d972b633611e922bd103d6c Mon Sep 17 00:00:00 2001 From: abbeyyyy <43121769+abbeyyyy@users.noreply.github.com> Date: Wed, 27 Apr 2022 01:12:45 -0400 Subject: [PATCH 06/11] fix argument --- .../algorithms/abbreviation_replacement_op.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py index 6523963a1..710c8341b 100644 --- a/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py +++ b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py @@ -18,7 +18,7 @@ from typing import Tuple, Dict, Any import requests -from ft.onto.base_ontology import Phrase +from forte.data.ontology import Annotation from forte.processors.data_augment.algorithms.single_annotation_op import ( SingleAnnotationAugmentOp, ) @@ -57,7 +57,7 @@ def __init__(self, configs: Config): self.data = json.load(json_file) def single_annotation_augment( - self, input_phrase: Phrase + self, input_anno: Annotation ) -> Tuple[bool, str]: r""" This function replaces a phrase from an abbreviation dictionary @@ -66,7 +66,7 @@ def single_annotation_augment( dictionary, no replacement will happen, return False. Args: - input_phrase: The input phrase. + input_anno: The input annotation, could be a word or phrase. Returns: A tuple, where the first element is a boolean value indicating whether the replacement happens, and the second element is the @@ -74,12 +74,12 @@ def single_annotation_augment( """ # If the replacement does not happen, return False. if random.random() > self.configs.prob: - return False, input_phrase.text - if input_phrase.text in self.data.keys(): - result: str = self.data[input_phrase.text] + return False, input_anno.text + if input_anno.text in self.data.keys(): + result: str = self.data[input_anno.text] return True, result else: - return False, input_phrase.text + return False, input_anno.text @classmethod def default_configs(cls) -> Dict[str, Any]: From d295df0451c0722b202eebf57285a5ef1da44087 Mon Sep 17 00:00:00 2001 From: abbeyyyy <43121769+abbeyyyy@users.noreply.github.com> Date: Fri, 6 May 2022 12:06:49 -0400 Subject: [PATCH 07/11] Add test and docs --- docs/code/data_aug.rst | 5 ++ .../algorithms/abbreviation_replacement_op.py | 6 ++- .../abbreviation_replacement_op_test.py | 51 +++++++++++++++---- 3 files changed, 50 insertions(+), 12 deletions(-) diff --git a/docs/code/data_aug.rst b/docs/code/data_aug.rst index ecdbb5294..38ff385b9 100644 --- a/docs/code/data_aug.rst +++ b/docs/code/data_aug.rst @@ -159,6 +159,11 @@ Data Augmentation Ops .. autoclass:: forte.processors.data_augment.algorithms.eda_ops.RandomDeletionDataAugmentOp :members: +:hidden:`AbbreviationReplacementOp` +------------------------------------------ +.. autoclass:: forte.processors.data_augment.algorithms.AbbreviationReplacementOp + :members: + Data Augmentation Models ======================================== diff --git a/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py index 710c8341b..53724db67 100644 --- a/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py +++ b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py @@ -32,8 +32,10 @@ class AbbreviationReplacementOp(SingleAnnotationAugmentOp): r""" This class is a replacement op utilizing a pre-defined - abbreviation to replace words, to replace the input phrase - with an abbreviation. + abbreviation dictionary to replace word or phrase + with an abbreviation. The abbreviation dictionary can + be user-defined, we also provide a default dictionary. + `prob` indicates the probability of replacement. Args: configs: diff --git a/tests/forte/processors/data_augment/algorithms/abbreviation_replacement_op_test.py b/tests/forte/processors/data_augment/algorithms/abbreviation_replacement_op_test.py index a0e84f8e7..6e905a72d 100644 --- a/tests/forte/processors/data_augment/algorithms/abbreviation_replacement_op_test.py +++ b/tests/forte/processors/data_augment/algorithms/abbreviation_replacement_op_test.py @@ -34,22 +34,53 @@ def setUp(self): ) def test_replace(self): - data_pack = DataPack() - text = "I will see you later!" - data_pack.set_text(text) - phrase = Phrase(data_pack, 7, len(text) - 1) - data_pack.add_entry(phrase) - - augmented_data_pack = self.abre.perform_augmentation(data_pack) - augmented_phrase = list( - augmented_data_pack.get("ft.onto.base_ontology.Phrase") + data_pack_1 = DataPack() + text_1 = "I will see you later!" + data_pack_1.set_text(text_1) + phrase_1 = Phrase(data_pack_1, 7, len(text_1) - 1) + data_pack_1.add_entry(phrase_1) + + augmented_data_pack_1 = self.abre.perform_augmentation(data_pack_1) + augmented_phrase_1 = list( + augmented_data_pack_1.get("ft.onto.base_ontology.Phrase") )[0] self.assertIn( - augmented_phrase.text, + augmented_phrase_1.text, ["syl8r", "cul83r", "cul8r"], ) + # Empty phrase + data_pack_2 = DataPack() + data_pack_2.set_text(text_1) + phrase_2 = Phrase(data_pack_2, 0, 0) + data_pack_2.add_entry(phrase_2) + + augmented_data_pack_2 = self.abre.perform_augmentation(data_pack_2) + augmented_phrase_2 = list( + augmented_data_pack_2.get("ft.onto.base_ontology.Phrase") + )[0] + + self.assertIn( + augmented_phrase_2.text, + [""], + ) + + # no abbreviation exist + data_pack_3 = DataPack() + data_pack_3.set_text(text_1) + phrase_3 = Phrase(data_pack_3, 2, 6) + data_pack_3.add_entry(phrase_3) + + augmented_data_pack_3 = self.abre.perform_augmentation(data_pack_3) + augmented_phrase_3 = list( + augmented_data_pack_3.get("ft.onto.base_ontology.Phrase") + )[0] + + self.assertIn( + augmented_phrase_3.text, + ["will"], + ) if __name__ == "__main__": unittest.main() From e657d499c2eedff54487e89ffc2f19f6e1bc2676 Mon Sep 17 00:00:00 2001 From: abbeyyyy <43121769+abbeyyyy@users.noreply.github.com> Date: Tue, 10 May 2022 11:28:22 -0400 Subject: [PATCH 08/11] fix docs --- docs/code/data_aug.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/code/data_aug.rst b/docs/code/data_aug.rst index 38ff385b9..9bd925b32 100644 --- a/docs/code/data_aug.rst +++ b/docs/code/data_aug.rst @@ -161,7 +161,7 @@ Data Augmentation Ops :hidden:`AbbreviationReplacementOp` ------------------------------------------ -.. autoclass:: forte.processors.data_augment.algorithms.AbbreviationReplacementOp +.. autoclass:: forte.processors.data_augment.algorithms.abbreviation_replacement_op.AbbreviationReplacementOp :members: Data Augmentation Models From 8ae01127bc47aea4225d72d9d6942c8be3e9b655 Mon Sep 17 00:00:00 2001 From: abbeyyyy <43121769+abbeyyyy@users.noreply.github.com> Date: Tue, 10 May 2022 11:54:05 -0400 Subject: [PATCH 09/11] fix docs --- .../algorithms/abbreviation_replacement_op.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py index 53724db67..db351ad1a 100644 --- a/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py +++ b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py @@ -41,9 +41,11 @@ class AbbreviationReplacementOp(SingleAnnotationAugmentOp): configs: - prob: The probability of replacement, should fall in [0, 1]. + - dict_path: the `url` or the path to the pre-defined - abbreviation json file. The key is a word / phrase we want to replace. - The value is an abbreviated word of the corresponding key. + abbreviation json file. The key is a word / phrase we want to + replace. The value is an abbreviated word of the corresponding key. + """ def __init__(self, configs: Config): @@ -69,6 +71,7 @@ def single_annotation_augment( Args: input_anno: The input annotation, could be a word or phrase. + Returns: A tuple, where the first element is a boolean value indicating whether the replacement happens, and the second element is the @@ -89,14 +92,17 @@ def default_configs(cls) -> Dict[str, Any]: Returns: A dictionary with the default config for this processor. Following are the keys for this dictionary: + - prob: The probability of replacement, should fall in [0, 1]. Default value is 0.5. + - dict_path: the `url` or the path to the pre-defined abbreviation json file. The key is a word / phrase we want to replace. The value is an abbreviated word of the corresponding key. Default dictionary is from a web-scraped slang dictionary ("https://github.com/abbeyyyy/JsonFiles/ blob/main/abbreviate.json"). + """ return { "augment_entry": "ft.onto.base_ontology.Phrase", From 512b1a0eb040388dcdf85782cc5b207b43812399 Mon Sep 17 00:00:00 2001 From: abbeyyyy <43121769+abbeyyyy@users.noreply.github.com> Date: Tue, 10 May 2022 13:40:17 -0400 Subject: [PATCH 10/11] fix docs error --- .../algorithms/abbreviation_replacement_op.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py index db351ad1a..97e2beb08 100644 --- a/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py +++ b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py @@ -36,16 +36,6 @@ class AbbreviationReplacementOp(SingleAnnotationAugmentOp): with an abbreviation. The abbreviation dictionary can be user-defined, we also provide a default dictionary. `prob` indicates the probability of replacement. - - Args: - configs: - - prob: The probability of replacement, - should fall in [0, 1]. - - - dict_path: the `url` or the path to the pre-defined - abbreviation json file. The key is a word / phrase we want to - replace. The value is an abbreviated word of the corresponding key. - """ def __init__(self, configs: Config): @@ -76,6 +66,7 @@ def single_annotation_augment( A tuple, where the first element is a boolean value indicating whether the replacement happens, and the second element is the replaced string. + """ # If the replacement does not happen, return False. if random.random() > self.configs.prob: From a62c86487f9ee3dd0774f1cc73fc1a20a8b9d241 Mon Sep 17 00:00:00 2001 From: abbeyyyy <43121769+abbeyyyy@users.noreply.github.com> Date: Tue, 10 May 2022 14:06:14 -0400 Subject: [PATCH 11/11] Update abbreviation_replacement_op.py --- .../data_augment/algorithms/abbreviation_replacement_op.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py index 97e2beb08..79bed2b9c 100644 --- a/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py +++ b/forte/processors/data_augment/algorithms/abbreviation_replacement_op.py @@ -91,8 +91,8 @@ def default_configs(cls) -> Dict[str, Any]: abbreviation json file. The key is a word / phrase we want to replace. The value is an abbreviated word of the corresponding key. Default dictionary is from a web-scraped - slang dictionary ("https://github.com/abbeyyyy/JsonFiles/ - blob/main/abbreviate.json"). + slang dictionary + ("https://github.com/abbeyyyy/JsonFiles/blob/main/abbreviate.json"). """ return {