
Commit e07673f

style: format all files according to black

Parent: c7358da


81 files changed: +1113 −994 lines
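Every hunk below is mechanical reformatting: single quotes become double quotes, multi-line argument lists and list literals that fit within the line-length limit are collapsed, trailing commas are added to multi-line calls, and a blank line is inserted after class docstrings. As a rough illustration, the same transformation can be reproduced with black's Python API (a minimal sketch only; `black.format_str` and `black.Mode` are assumed from the installed black release, and the project itself would normally just run the `black` command over the source tree):

import black

# Hypothetical snippet mirroring the __all__ change in
# pythainlp/augment/word2vec/__init__.py below.
before = (
    "__all__ = [\n"
    "    'Word2VecAug',\n"
    "    'Thai2fitAug',\n"
    "    'LTW2VAug'\n"
    "]\n"
)

# black normalizes the quotes and collapses the list onto one line
# because it fits within the default 88-character limit.
after = black.format_str(before, mode=black.Mode())
print(after)  # __all__ = ["Word2VecAug", "Thai2fitAug", "LTW2VAug"]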

bin/act (12.8 MB)

Binary file not shown.

pythainlp/augment/lm/fasttext.py

Lines changed: 5 additions & 6 deletions
@@ -12,13 +12,14 @@ class FastTextAug:
 
     :param str model_path: path of model file
     """
+
     def __init__(self, model_path: str):
         """
         :param str model_path: path of model file
         """
-        if model_path.endswith('.bin'):
+        if model_path.endswith(".bin"):
             self.model = FastText_gensim.load_facebook_vectors(model_path)
-        elif model_path.endswith('.vec'):
+        elif model_path.endswith(".vec"):
             self.model = KeyedVectors.load_word2vec_format(model_path)
         else:
             self.model = FastText_gensim.load(model_path)
@@ -33,7 +34,7 @@ def tokenize(self, text: str) -> List[str]:
         :return: list of word
         :rtype: List[str]
         """
-        return word_tokenize(text, engine='icu')
+        return word_tokenize(text, engine="icu")
 
     def modify_sent(self, sent: str, p: float = 0.7) -> List[List[str]]:
         """
@@ -44,9 +45,7 @@ def modify_sent(self, sent: str, p: float = 0.7) -> List[List[str]]:
         list_sent_new = []
         for i in sent:
             if i in self.dict_wv:
-                w = [
-                    j for j, v in self.model.most_similar(i) if v >= p
-                ]
+                w = [j for j, v in self.model.most_similar(i) if v >= p]
                 if w == []:
                     list_sent_new.append([i])
                 else:

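For orientation, the class reformatted above is typically driven as follows (a minimal sketch, not part of the commit; the model path is hypothetical, and only the constructor, tokenize(), and modify_sent() shown in the hunks are assumed):

from pythainlp.augment.lm.fasttext import FastTextAug

# Hypothetical path: .bin and .vec extensions are dispatched as in __init__ above;
# anything else is treated as a gensim-saved FastText model.
aug = FastTextAug("cc.th.300.bin")

tokens = aug.tokenize("ผมรักประเทศไทย")      # ICU word tokenization, per tokenize()
candidates = aug.modify_sent(tokens, p=0.7)  # per-token replacements with similarity >= p,
                                             # as the loop over sent in the hunk suggests
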
pythainlp/augment/lm/wangchanberta.py

Lines changed: 13 additions & 15 deletions
@@ -14,26 +14,26 @@ def __init__(self):
         self.model_name = "airesearch/wangchanberta-base-att-spm-uncased"
         self.target_tokenizer = CamembertTokenizer
         self.tokenizer = CamembertTokenizer.from_pretrained(
-            self.model_name,
-            revision='main')
+            self.model_name, revision="main"
+        )
         self.tokenizer.additional_special_tokens = [
-            '<s>NOTUSED',
-            '</s>NOTUSED',
-            '<_>'
+            "<s>NOTUSED",
+            "</s>NOTUSED",
+            "<_>",
         ]
         self.fill_mask = pipeline(
-            task='fill-mask',
+            task="fill-mask",
             tokenizer=self.tokenizer,
-            model=f'{self.model_name}',
-            revision='main'
+            model=f"{self.model_name}",
+            revision="main",
         )
         self.MASK_TOKEN = self.tokenizer.mask_token
 
     def generate(self, sentence: str, num_replace_tokens: int = 3):
         self.sent2 = []
         self.input_text = sentence
         sent = [
-            i for i in self.tokenizer.tokenize(self.input_text) if i != '▁'
+            i for i in self.tokenizer.tokenize(self.input_text) if i != "▁"
         ]
         if len(sent) < num_replace_tokens:
             num_replace_tokens = len(sent)
@@ -42,18 +42,16 @@ def generate(self, sentence: str, num_replace_tokens: int = 3):
             replace_token = [
                 sent.pop(random.randrange(len(sent))) for _ in range(1)
             ][0]
-            masked_text = masked_text+self.MASK_TOKEN
+            masked_text = masked_text + self.MASK_TOKEN
             self.sent2 += [
-                str(j['sequence']).replace('<s> ', '').replace('</s>', '')
+                str(j["sequence"]).replace("<s> ", "").replace("</s>", "")
                 for j in self.fill_mask(masked_text)
-                if j['sequence'] not in self.sent2
+                if j["sequence"] not in self.sent2
             ]
             masked_text = self.input_text
         return self.sent2
 
-    def augment(
-        self, sentence: str, num_replace_tokens: int = 3
-    ) -> List[str]:
+    def augment(self, sentence: str, num_replace_tokens: int = 3) -> List[str]:
         """
         Text Augment from wangchanberta
 

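The __init__ reformatted above wires WangchanBERTa into a Hugging Face fill-mask pipeline; stripped of the class, the same setup looks roughly like this (a sketch under the assumption that transformers provides CamembertTokenizer and pipeline exactly as used in the diff):

from transformers import CamembertTokenizer, pipeline

model_name = "airesearch/wangchanberta-base-att-spm-uncased"
tokenizer = CamembertTokenizer.from_pretrained(model_name, revision="main")
fill_mask = pipeline(
    task="fill-mask",
    tokenizer=tokenizer,
    model=model_name,
    revision="main",
)

# generate() above appends the mask token and keeps each candidate "sequence".
masked_text = "ผมชอบไป" + tokenizer.mask_token
for j in fill_mask(masked_text):
    print(j["sequence"])
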
pythainlp/augment/word2vec/__init__.py

Lines changed: 1 addition & 5 deletions
@@ -3,11 +3,7 @@
 Word2Vec
 """
 
-__all__ = [
-    "Word2VecAug",
-    "Thai2fitAug",
-    "LTW2VAug"
-]
+__all__ = ["Word2VecAug", "Thai2fitAug", "LTW2VAug"]
 
 from pythainlp.augment.word2vec.core import Word2VecAug
 from pythainlp.augment.word2vec.thai2fit import Thai2fitAug

pythainlp/augment/word2vec/bpemb_wv.py

Lines changed: 3 additions & 1 deletion
@@ -10,8 +10,10 @@ class BPEmbAug:
     BPEmb:
     `github.com/bheinzerling/bpemb <https://github.com/bheinzerling/bpemb>`_
     """
+
     def __init__(self, lang: str = "th", vs: int = 100000, dim: int = 300):
         from bpemb import BPEmb
+
         self.bpemb_temp = BPEmb(lang=lang, dim=dim, vs=vs)
         self.model = self.bpemb_temp.emb
         self.load_w2v()
@@ -58,6 +60,6 @@ def augment(
         for i in self.temp:
             self.t = ""
             for j in i:
-                self.t += j.replace('▁', '')
+                self.t += j.replace("▁", "")
             self.temp_new.append(self.t)
         return self.temp_new

pythainlp/augment/word2vec/core.py

Lines changed: 4 additions & 8 deletions
@@ -13,12 +13,13 @@ def __init__(
         :param str type: moodel type (file, binary)
         """
         import gensim.models.keyedvectors as word2vec
+
         self.tokenizer = tokenize
         if type == "file":
             self.model = word2vec.KeyedVectors.load_word2vec_format(model)
         elif type == "binary":
             self.model = word2vec.KeyedVectors.load_word2vec_format(
-                model, binary=True, unicode_errors='ignore'
+                model, binary=True, unicode_errors="ignore"
             )
         else:
             self.model = model
@@ -33,9 +34,7 @@ def modify_sent(self, sent: str, p: float = 0.7) -> List[List[str]]:
         list_sent_new = []
         for i in sent:
             if i in self.dict_wv:
-                w = [
-                    j for j, v in self.model.most_similar(i) if v >= p
-                ]
+                w = [j for j, v in self.model.most_similar(i) if v >= p]
                 if w == []:
                     list_sent_new.append([i])
                 else:
@@ -45,10 +44,7 @@ def modify_sent(self, sent: str, p: float = 0.7) -> List[List[str]]:
         return list_sent_new
 
     def augment(
-        self,
-        sentence: str,
-        n_sent: int = 1,
-        p: float = 0.7
+        self, sentence: str, n_sent: int = 1, p: float = 0.7
     ) -> List[Tuple[str]]:
         """
         :param str sentence: text sentence

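Word2VecAug is the shared engine behind the ltw2v and thai2fit wrappers below: it takes a word-vector file (or an already loaded model), a tokenizer callable, and a type of "file" or "binary". A minimal sketch with a hypothetical vector file:

from pythainlp.augment.word2vec.core import Word2VecAug
from pythainlp.tokenize import word_tokenize

# "my_thai_vectors.vec" is a placeholder; type="file" loads it with
# KeyedVectors.load_word2vec_format, type="binary" adds binary=True.
aug = Word2VecAug("my_thai_vectors.vec", word_tokenize, type="file")
print(aug.augment("ผมชอบกินข้าว", n_sent=2, p=0.7))  # list of augmented token tuples
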
pythainlp/augment/word2vec/ltw2v.py

Lines changed: 4 additions & 6 deletions
@@ -12,16 +12,17 @@ class LTW2VAug:
     LTW2V:
     `github.com/PyThaiNLP/large-thaiword2vec <https://github.com/PyThaiNLP/large-thaiword2vec>`_
     """
+
     def __init__(self):
-        self.ltw2v_wv = get_corpus_path('ltw2v')
+        self.ltw2v_wv = get_corpus_path("ltw2v")
         self.load_w2v()
 
     def tokenizer(self, text: str) -> List[str]:
         """
         :param str text: thai text
         :rtype: List[str]
         """
-        return word_tokenize(text, engine='newmm')
+        return word_tokenize(text, engine="newmm")
 
     def load_w2v(self):  # insert substitute
         """
@@ -30,10 +31,7 @@ def load_w2v(self):  # insert substitute
         self.aug = Word2VecAug(self.ltw2v_wv, self.tokenizer, type="binary")
 
     def augment(
-        self,
-        sentence: str,
-        n_sent: int = 1,
-        p: float = 0.7
+        self, sentence: str, n_sent: int = 1, p: float = 0.7
     ) -> List[Tuple[str]]:
         """
         Text Augment using word2vec from Thai2Fit

pythainlp/augment/word2vec/thai2fit.py

Lines changed: 3 additions & 5 deletions
@@ -12,8 +12,9 @@ class Thai2fitAug:
     Thai2Fit:
     `github.com/cstorm125/thai2fit <https://github.com/cstorm125/thai2fit>`_
     """
+
     def __init__(self):
-        self.thai2fit_wv = get_corpus_path('thai2fit_wv')
+        self.thai2fit_wv = get_corpus_path("thai2fit_wv")
         self.load_w2v()
 
     def tokenizer(self, text: str) -> List[str]:
@@ -30,10 +31,7 @@ def load_w2v(self):
         self.aug = Word2VecAug(self.thai2fit_wv, self.tokenizer, type="binary")
 
     def augment(
-        self,
-        sentence: str,
-        n_sent: int = 1,
-        p: float = 0.7
+        self, sentence: str, n_sent: int = 1, p: float = 0.7
    ) -> List[Tuple[str]]:
         """
         Text Augment using word2vec from Thai2Fit

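Both wrappers reformatted here expose the same augment(sentence, n_sent, p) surface over Word2VecAug. A minimal usage sketch (the constructor fetches the thai2fit word vectors via get_corpus_path, as in __init__ above; LTW2VAug works the same way with the ltw2v vectors):

from pythainlp.augment.word2vec import Thai2fitAug

aug = Thai2fitAug()  # downloads and loads thai2fit_wv on first use
print(aug.augment("ผมกินข้าว", n_sent=2, p=0.7))  # list of augmented token tuples
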
pythainlp/augment/wordnet.py

Lines changed: 6 additions & 8 deletions
@@ -107,7 +107,7 @@ def postype2wordnet(pos: str, corpus: str):
     **Options for corpus**
         * *orchid* - Orchid Corpus
     """
-    if corpus not in ['orchid']:
+    if corpus not in ["orchid"]:
        return None
     return orchid[pos]
 
@@ -116,14 +116,12 @@ class WordNetAug:
     """
     Text Augment using wordnet
     """
+
     def __init__(self):
         pass
 
     def find_synonyms(
-        self,
-        word: str,
-        pos: str = None,
-        postag_corpus: str = "orchid"
+        self, word: str, pos: str = None, postag_corpus: str = "orchid"
     ) -> List[str]:
         """
         Find synonyms from wordnet
@@ -139,13 +137,13 @@ def find_synonyms(
             self.list_synsets = wordnet.synsets(word)
         else:
             self.p2w_pos = postype2wordnet(pos, postag_corpus)
-            if self.p2w_pos != '':
+            if self.p2w_pos != "":
                 self.list_synsets = wordnet.synsets(word, pos=self.p2w_pos)
             else:
                 self.list_synsets = wordnet.synsets(word)
 
         for self.synset in wordnet.synsets(word):
-            for self.syn in self.synset.lemma_names(lang='tha'):
+            for self.syn in self.synset.lemma_names(lang="tha"):
                 self.synonyms.append(self.syn)
 
         self.synonyms_without_duplicates = list(
@@ -159,7 +157,7 @@ def augment(
         tokenize: object = word_tokenize,
         max_syn_sent: int = 6,
         postag: bool = True,
-        postag_corpus: str = "orchid"
+        postag_corpus: str = "orchid",
     ) -> List[List[str]]:
         """
         Text Augment using wordnet

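WordNetAug needs no model download; the hunks above show it collecting Thai lemmas through lemma_names(lang="tha"). A minimal sketch based only on the signatures in this diff (the word and sentence are illustrative):

from pythainlp.augment.wordnet import WordNetAug

aug = WordNetAug()
print(aug.find_synonyms("เดิน"))  # Thai synonyms gathered from wordnet.synsets(...)
print(aug.augment("ผมเดินไปโรงเรียน", max_syn_sent=3, postag=True))
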
pythainlp/benchmarks/word_tokenization.py

Lines changed: 1 addition & 1 deletion
@@ -199,7 +199,7 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
         "word_level": {
             "correctly_tokenised_words": correctly_tokenised_words,
             "total_words_in_sample": np.sum(sample),
-            "total_words_in_ref_sample": np.sum(ref_sample)
+            "total_words_in_ref_sample": np.sum(ref_sample),
         },
         "global": {
             "tokenisation_indicators": "".join(tokenization_indicators)
