Skip to content

Commit 488ddbe

Browse files
DPigeon authored and DPigeon committed
Fixing the duplicates in isalpha for model 2
1 parent 33b69a0 commit 488ddbe

File tree

4 files changed

+9
-8
lines changed

4 files changed

+9
-8
lines changed

corpus_testing.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,9 @@ def determite_vocabulary(self): # Used to get all the information needed in the
4444
char_size = len(self.low_letters) + len(self.up_letters)
4545
letters = (self.low_letters + self.up_letters).copy() # Merge the two
4646
elif self.v == '2':
47-
list_isalpha = self.alpha_voc
48-
char_size = len(list_isalpha)
49-
letters = list_isalpha.copy() # Merge the three
47+
new_list = list(dict.fromkeys(self.alpha_voc)) # Removes duplicates from the list (must be unique character)
48+
char_size = len(new_list)
49+
letters = new_list.copy()
5050

5151
info = dict()
5252
info['char_size'] = char_size

corpus_training.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,9 @@ def determite_vocabulary(self): # Used to get all the information needed in the
4444
char_size = len(self.low_letters) + len(self.up_letters)
4545
letters = (self.low_letters + self.up_letters).copy() # Merge the two
4646
elif self.v == '2':
47-
list_isalpha = self.alpha_voc
48-
char_size = len(list_isalpha)
49-
letters = list_isalpha.copy() # Merge the three
47+
new_list = list(dict.fromkeys(self.alpha_voc)) # Removes duplicates from the list (must be unique character)
48+
char_size = len(new_list)
49+
letters = new_list.copy()
5050

5151
info = dict()
5252
info['char_size'] = char_size

input/input.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1 1 0.9 training_set testing_set
1+
2 1 0.5 training_set testing_set

naive_bayes.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,15 @@ def class_probability(self, i, tweets, characters): # example: compute P('eu')
2020
if tweets[t].get_language() == i.value:
2121
count_doc_i = count_doc_i + 1
2222
prob_i = count_doc_i / count_all_doc
23+
#print(prob_i)
2324
return prob_i
2425

2526
def cond_probability(self, i, j, tweets, letters, characters, lang, d): # example: compute P('a'|eu) = count('a', eu) / sum(count('a', eu))
2627
count_j_i = 0
2728
sum_j_i = 0
2829
prob_j_i = 0
2930
for t in range(len(characters)):
30-
if tweets[t].get_language() == i.value:
31+
if tweets[t].get_language() == i.value: # blocks here
3132
count_j_i = count_j_i + characters[t].count(
3233
letters[j]) # Getting the number of characters in each languages class
3334
sum_j_i = sum_j_i + len(characters[t]) # Getting the sum of all characters in each language class

0 commit comments

Comments
 (0)