Fix duplicate characters in the isalpha vocabulary for model 2

DPigeon · DPigeon · commit 488ddbe938f4 · 2020-04-05T14:22:29.000-04:00
diff --git a/corpus_testing.py b/corpus_testing.py
@@ -44,9 +44,9 @@ def determite_vocabulary(self): # Used to get all the information needed in the
             char_size = len(self.low_letters) + len(self.up_letters)
             letters = (self.low_letters + self.up_letters).copy()  # Merge the two
         elif self.v == '2':
-            list_isalpha = self.alpha_voc
-            char_size = len(list_isalpha)
-            letters = list_isalpha.copy() # Merge the three
+            new_list = list(dict.fromkeys(self.alpha_voc)) # Removes duplicates from the list (must be unique character)
+            char_size = len(new_list)
+            letters = new_list.copy()
 
         info = dict()
         info['char_size'] = char_size
diff --git a/corpus_training.py b/corpus_training.py
@@ -44,9 +44,9 @@ def determite_vocabulary(self): # Used to get all the information needed in the
             char_size = len(self.low_letters) + len(self.up_letters)
             letters = (self.low_letters + self.up_letters).copy()  # Merge the two
         elif self.v == '2':
-            list_isalpha = self.alpha_voc
-            char_size = len(list_isalpha)
-            letters = list_isalpha.copy() # Merge the three
+            new_list = list(dict.fromkeys(self.alpha_voc)) # Removes duplicates from the list (must be unique character)
+            char_size = len(new_list)
+            letters = new_list.copy()
 
         info = dict()
         info['char_size'] = char_size
diff --git a/input/input.txt b/input/input.txt
@@ -1 +1 @@
-1 1 0.9 training_set testing_set
+2 1 0.5 training_set testing_set
diff --git a/naive_bayes.py b/naive_bayes.py
@@ -20,14 +20,15 @@ def class_probability(self, i, tweets, characters):  # example: compute P('eu')
             if tweets[t].get_language() == i.value:
                 count_doc_i = count_doc_i + 1
         prob_i = count_doc_i / count_all_doc
+        #print(prob_i)
         return prob_i
 
     def cond_probability(self, i, j, tweets, letters, characters, lang, d):  # example: compute P('a'|eu) = count('a', eu) / sum(count('a', eu))
         count_j_i = 0
         sum_j_i = 0
         prob_j_i = 0
         for t in range(len(characters)):
-            if tweets[t].get_language() == i.value:
+            if tweets[t].get_language() == i.value: # blocks here
                 count_j_i = count_j_i + characters[t].count(
                     letters[j])  # Getting the number of characters in each languages class
                 sum_j_i = sum_j_i + len(characters[t])  # Getting the sum of all characters in each language class

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1 +1 @@` |
| `1` | | `-1 1 0.9 training_set testing_set` |
| | `1` | `+2 1 0.5 training_set testing_set` |