
Commit e07673f

style: format all files according to black

Parent: c7358da


81 files changed: +1113 −994 lines
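Every hunk below is mechanical reformatting: single quotes become double quotes, multi-line argument lists and list literals that fit within the line-length limit are collapsed, trailing commas are added to multi-line calls, and a blank line is inserted after class docstrings. As a rough illustration, the same transformation can be reproduced with black's Python API (a minimal sketch only; `black.format_str` and `black.Mode` are assumed from the installed black release, and the project itself would normally just run the `black` command over the source tree):

import black

# Hypothetical snippet mirroring the __all__ change in
# pythainlp/augment/word2vec/__init__.py below.
before = (
    "__all__ = [\n"
    "    'Word2VecAug',\n"
    "    'Thai2fitAug',\n"
    "    'LTW2VAug'\n"
    "]\n"
)

# black normalizes the quotes and collapses the list onto one line
# because it fits within the default 88-character limit.
after = black.format_str(before, mode=black.Mode())
print(after)  # __all__ = ["Word2VecAug", "Thai2fitAug", "LTW2VAug"]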

bin/act (12.8 MB)

Binary file not shown.

pythainlp/augment/lm/fasttext.py

Lines changed: 5 additions & 6 deletions
@@ -12,13 +12,14 @@ class FastTextAug:
 
     :param str model_path: path of model file
     """
+
     def __init__(self, model_path: str):
         """
         :param str model_path: path of model file
         """
-        if model_path.endswith('.bin'):
+        if model_path.endswith(".bin"):
             self.model = FastText_gensim.load_facebook_vectors(model_path)
-        elif model_path.endswith('.vec'):
+        elif model_path.endswith(".vec"):
             self.model = KeyedVectors.load_word2vec_format(model_path)
         else:
             self.model = FastText_gensim.load(model_path)
@@ -33,7 +34,7 @@ def tokenize(self, text: str) -> List[str]:
         :return: list of word
         :rtype: List[str]
         """
-        return word_tokenize(text, engine='icu')
+        return word_tokenize(text, engine="icu")
 
     def modify_sent(self, sent: str, p: float = 0.7) -> List[List[str]]:
         """
@@ -44,9 +45,7 @@ def modify_sent(self, sent: str, p: float = 0.7) -> List[List[str]]:
         list_sent_new = []
         for i in sent:
             if i in self.dict_wv:
-                w = [
-                    j for j, v in self.model.most_similar(i) if v >= p
-                ]
+                w = [j for j, v in self.model.most_similar(i) if v >= p]
                 if w == []:
                     list_sent_new.append([i])
                 else:

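For orientation, the class reformatted above is typically driven as follows (a minimal sketch, not part of the commit; the model path is hypothetical, and only the constructor, tokenize(), and modify_sent() shown in the hunks are assumed):

from pythainlp.augment.lm.fasttext import FastTextAug

# Hypothetical path: .bin and .vec extensions are dispatched as in __init__ above;
# anything else is treated as a gensim-saved FastText model.
aug = FastTextAug("cc.th.300.bin")

tokens = aug.tokenize("ผมรักประเทศไทย")      # ICU word tokenization, per tokenize()
candidates = aug.modify_sent(tokens, p=0.7)  # per-token replacements with similarity >= p,
                                             # as the loop over sent in the hunk suggests
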
pythainlp/augment/lm/wangchanberta.py

Lines changed: 13 additions & 15 deletions
@@ -14,26 +14,26 @@ def __init__(self):
         self.model_name = "airesearch/wangchanberta-base-att-spm-uncased"
         self.target_tokenizer = CamembertTokenizer
         self.tokenizer = CamembertTokenizer.from_pretrained(
-            self.model_name,
-            revision='main')
+            self.model_name, revision="main"
+        )
         self.tokenizer.additional_special_tokens = [
-            '<s>NOTUSED',
-            '</s>NOTUSED',
-            '<_>'
+            "<s>NOTUSED",
+            "</s>NOTUSED",
+            "<_>",
         ]
         self.fill_mask = pipeline(
-            task='fill-mask',
+            task="fill-mask",
             tokenizer=self.tokenizer,
-            model=f'{self.model_name}',
-            revision='main'
+            model=f"{self.model_name}",
+            revision="main",
         )
         self.MASK_TOKEN = self.tokenizer.mask_token
 
     def generate(self, sentence: str, num_replace_tokens: int = 3):
         self.sent2 = []
         self.input_text = sentence
         sent = [
-            i for i in self.tokenizer.tokenize(self.input_text) if i != '▁'
+            i for i in self.tokenizer.tokenize(self.input_text) if i != "▁"
         ]
         if len(sent) < num_replace_tokens:
             num_replace_tokens = len(sent)
@@ -42,18 +42,16 @@ def generate(self, sentence: str, num_replace_tokens: int = 3):
             replace_token = [
                 sent.pop(random.randrange(len(sent))) for _ in range(1)
             ][0]
-            masked_text = masked_text+self.MASK_TOKEN
+            masked_text = masked_text + self.MASK_TOKEN
             self.sent2 += [
-                str(j['sequence']).replace('<s> ', '').replace('</s>', '')
+                str(j["sequence"]).replace("<s> ", "").replace("</s>", "")
                 for j in self.fill_mask(masked_text)
-                if j['sequence'] not in self.sent2
+                if j["sequence"] not in self.sent2
             ]
             masked_text = self.input_text
         return self.sent2
 
-    def augment(
-        self, sentence: str, num_replace_tokens: int = 3
-    ) -> List[str]:
+    def augment(self, sentence: str, num_replace_tokens: int = 3) -> List[str]:
         """
         Text Augment from wangchanberta
 

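The __init__ reformatted above wires WangchanBERTa into a Hugging Face fill-mask pipeline; stripped of the class, the same setup looks roughly like this (a sketch under the assumption that transformers provides CamembertTokenizer and pipeline exactly as used in the diff):

from transformers import CamembertTokenizer, pipeline

model_name = "airesearch/wangchanberta-base-att-spm-uncased"
tokenizer = CamembertTokenizer.from_pretrained(model_name, revision="main")
fill_mask = pipeline(
    task="fill-mask",
    tokenizer=tokenizer,
    model=model_name,
    revision="main",
)

# generate() above appends the mask token and keeps each candidate "sequence".
masked_text = "ผมชอบไป" + tokenizer.mask_token
for j in fill_mask(masked_text):
    print(j["sequence"])
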
pythainlp/augment/word2vec/__init__.py

Lines changed: 1 addition & 5 deletions
@@ -3,11 +3,7 @@
 Word2Vec
 """
 
-__all__ = [
-    "Word2VecAug",
-    "Thai2fitAug",
-    "LTW2VAug"
-]
+__all__ = ["Word2VecAug", "Thai2fitAug", "LTW2VAug"]
 
 from pythainlp.augment.word2vec.core import Word2VecAug
 from pythainlp.augment.word2vec.thai2fit import Thai2fitAug

pythainlp/augment/word2vec/bpemb_wv.py

Lines changed: 3 additions & 1 deletion
@@ -10,8 +10,10 @@ class BPEmbAug:
     BPEmb:
     `github.com/bheinzerling/bpemb <https://github.com/bheinzerling/bpemb>`_
     """
+
     def __init__(self, lang: str = "th", vs: int = 100000, dim: int = 300):
         from bpemb import BPEmb
+
         self.bpemb_temp = BPEmb(lang=lang, dim=dim, vs=vs)
         self.model = self.bpemb_temp.emb
         self.load_w2v()
@@ -58,6 +60,6 @@ def augment(
         for i in self.temp:
             self.t = ""
             for j in i:
-                self.t += j.replace('▁', '')
+                self.t += j.replace("▁", "")
             self.temp_new.append(self.t)
         return self.temp_new

pythainlp/augment/word2vec/core.py

Lines changed: 4 additions & 8 deletions
@@ -13,12 +13,13 @@ def __init__(
         :param str type: moodel type (file, binary)
         """
         import gensim.models.keyedvectors as word2vec
+
         self.tokenizer = tokenize
         if type == "file":
             self.model = word2vec.KeyedVectors.load_word2vec_format(model)
         elif type == "binary":
             self.model = word2vec.KeyedVectors.load_word2vec_format(
-                model, binary=True, unicode_errors='ignore'
+                model, binary=True, unicode_errors="ignore"
             )
         else:
             self.model = model
@@ -33,9 +34,7 @@ def modify_sent(self, sent: str, p: float = 0.7) -> List[List[str]]:
         list_sent_new = []
         for i in sent:
             if i in self.dict_wv:
-                w = [
-                    j for j, v in self.model.most_similar(i) if v >= p
-                ]
+                w = [j for j, v in self.model.most_similar(i) if v >= p]
                 if w == []:
                     list_sent_new.append([i])
                 else:
@@ -45,10 +44,7 @@ def modify_sent(self, sent: str, p: float = 0.7) -> List[List[str]]:
         return list_sent_new
 
     def augment(
-        self,
-        sentence: str,
-        n_sent: int = 1,
-        p: float = 0.7
+        self, sentence: str, n_sent: int = 1, p: float = 0.7
     ) -> List[Tuple[str]]:
         """
         :param str sentence: text sentence

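Word2VecAug is the shared engine behind the ltw2v and thai2fit wrappers below: it takes a word-vector file (or an already loaded model), a tokenizer callable, and a type of "file" or "binary". A minimal sketch with a hypothetical vector file:

from pythainlp.augment.word2vec.core import Word2VecAug
from pythainlp.tokenize import word_tokenize

# "my_thai_vectors.vec" is a placeholder; type="file" loads it with
# KeyedVectors.load_word2vec_format, type="binary" adds binary=True.
aug = Word2VecAug("my_thai_vectors.vec", word_tokenize, type="file")
print(aug.augment("ผมชอบกินข้าว", n_sent=2, p=0.7))  # list of augmented token tuples
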
pythainlp/augment/word2vec/ltw2v.py

Lines changed: 4 additions & 6 deletions
@@ -12,16 +12,17 @@ class LTW2VAug:
     LTW2V:
     `github.com/PyThaiNLP/large-thaiword2vec <https://github.com/PyThaiNLP/large-thaiword2vec>`_
     """
+
     def __init__(self):
-        self.ltw2v_wv = get_corpus_path('ltw2v')
+        self.ltw2v_wv = get_corpus_path("ltw2v")
         self.load_w2v()
 
     def tokenizer(self, text: str) -> List[str]:
         """
         :param str text: thai text
         :rtype: List[str]
         """
-        return word_tokenize(text, engine='newmm')
+        return word_tokenize(text, engine="newmm")
 
     def load_w2v(self):  # insert substitute
         """
@@ -30,10 +31,7 @@ def load_w2v(self):  # insert substitute
         self.aug = Word2VecAug(self.ltw2v_wv, self.tokenizer, type="binary")
 
     def augment(
-        self,
-        sentence: str,
-        n_sent: int = 1,
-        p: float = 0.7
+        self, sentence: str, n_sent: int = 1, p: float = 0.7
     ) -> List[Tuple[str]]:
         """
         Text Augment using word2vec from Thai2Fit

pythainlp/augment/word2vec/thai2fit.py

Lines changed: 3 additions & 5 deletions
@@ -12,8 +12,9 @@ class Thai2fitAug:
     Thai2Fit:
     `github.com/cstorm125/thai2fit <https://github.com/cstorm125/thai2fit>`_
     """
+
     def __init__(self):
-        self.thai2fit_wv = get_corpus_path('thai2fit_wv')
+        self.thai2fit_wv = get_corpus_path("thai2fit_wv")
         self.load_w2v()
 
     def tokenizer(self, text: str) -> List[str]:
@@ -30,10 +31,7 @@ def load_w2v(self):
         self.aug = Word2VecAug(self.thai2fit_wv, self.tokenizer, type="binary")
 
     def augment(
-        self,
-        sentence: str,
-        n_sent: int = 1,
-        p: float = 0.7
+        self, sentence: str, n_sent: int = 1, p: float = 0.7
    ) -> List[Tuple[str]]:
         """
         Text Augment using word2vec from Thai2Fit

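Both wrappers reformatted here expose the same augment(sentence, n_sent, p) surface over Word2VecAug. A minimal usage sketch (the constructor fetches the thai2fit word vectors via get_corpus_path, as in __init__ above; LTW2VAug works the same way with the ltw2v vectors):

from pythainlp.augment.word2vec import Thai2fitAug

aug = Thai2fitAug()  # downloads and loads thai2fit_wv on first use
print(aug.augment("ผมกินข้าว", n_sent=2, p=0.7))  # list of augmented token tuples
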
pythainlp/augment/wordnet.py

Lines changed: 6 additions & 8 deletions
@@ -107,7 +107,7 @@ def postype2wordnet(pos: str, corpus: str):
     **Options for corpus**
         * *orchid* - Orchid Corpus
     """
-    if corpus not in ['orchid']:
+    if corpus not in ["orchid"]:
        return None
     return orchid[pos]
 
@@ -116,14 +116,12 @@ class WordNetAug:
     """
     Text Augment using wordnet
     """
+
     def __init__(self):
         pass
 
     def find_synonyms(
-        self,
-        word: str,
-        pos: str = None,
-        postag_corpus: str = "orchid"
+        self, word: str, pos: str = None, postag_corpus: str = "orchid"
     ) -> List[str]:
         """
         Find synonyms from wordnet
@@ -139,13 +137,13 @@ def find_synonyms(
             self.list_synsets = wordnet.synsets(word)
         else:
             self.p2w_pos = postype2wordnet(pos, postag_corpus)
-            if self.p2w_pos != '':
+            if self.p2w_pos != "":
                 self.list_synsets = wordnet.synsets(word, pos=self.p2w_pos)
             else:
                 self.list_synsets = wordnet.synsets(word)
 
         for self.synset in wordnet.synsets(word):
-            for self.syn in self.synset.lemma_names(lang='tha'):
+            for self.syn in self.synset.lemma_names(lang="tha"):
                 self.synonyms.append(self.syn)
 
         self.synonyms_without_duplicates = list(
@@ -159,7 +157,7 @@ def augment(
         tokenize: object = word_tokenize,
         max_syn_sent: int = 6,
         postag: bool = True,
-        postag_corpus: str = "orchid"
+        postag_corpus: str = "orchid",
     ) -> List[List[str]]:
         """
         Text Augment using wordnet

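WordNetAug needs no model download; the hunks above show it collecting Thai lemmas through lemma_names(lang="tha"). A minimal sketch based only on the signatures in this diff (the word and sentence are illustrative):

from pythainlp.augment.wordnet import WordNetAug

aug = WordNetAug()
print(aug.find_synonyms("เดิน"))  # Thai synonyms gathered from wordnet.synsets(...)
print(aug.augment("ผมเดินไปโรงเรียน", max_syn_sent=3, postag=True))
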
pythainlp/benchmarks/word_tokenization.py

Lines changed: 1 addition & 1 deletion
@@ -199,7 +199,7 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
         "word_level": {
             "correctly_tokenised_words": correctly_tokenised_words,
             "total_words_in_sample": np.sum(sample),
-            "total_words_in_ref_sample": np.sum(ref_sample)
+            "total_words_in_ref_sample": np.sum(ref_sample),
         },
         "global": {
             "tokenisation_indicators": "".join(tokenization_indicators)
