-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclassifier.py
115 lines (90 loc) · 4.09 KB
/
classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import corpus_training
import corpus_testing
import input_parser
import output_parser
import naive_bayes
import n_gram
import tweet
import language
# The main file: module-level configuration for the classifier run.
inputPath = "input/input.txt"  # path to the file holding hyper-parameters and set locations
model = 2 # if 1 --> Model 1, if 2 --> Model 2 (n-gram vs. naive Bayes; see main())
def merge_two_dicts(x, y):
    """Merge two dicts into a new dict, summing the values of matching keys.

    The result starts from language.to_dict(0) (presumably all language keys
    mapped to 0 -- TODO confirm), then each key present in ``x`` gets
    ``x[key] + y[key]``.

    Note: the previous implementation zipped ``x.items()`` with ``y.items()``,
    which pairs entries *positionally* -- if the two dicts were built in
    different insertion orders, values of unrelated keys were summed.
    Summing per key fixes that while giving the same result when the
    orders agree.
    """
    z = language.to_dict(0)
    for key in x:
        z[key] = x[key] + y[key]
    return z
def generate_trace_file(v, n, d, output, tweets, scores, model_type):
    """Write one trace line per tweet comparing predicted vs. actual language.

    Parameters:
        v, n, d: hyper-parameters (vocabulary, n-gram size, smoothing) passed
            through to the output writer.
        output: writer object exposing create_trace_file(...).
        tweets: tweet objects exposing get_id() and get_language().
        scores: per-tweet dicts mapping class objects (with a .value attribute,
            presumably enum members -- TODO confirm) to numeric scores;
            parallel to ``tweets``.
        model_type: tag identifying which model produced the scores.
    """
    for tw, dict_scores in zip(tweets, scores):
        # Class whose score is highest; its score equals max(values) by definition.
        likely_class = max(dict_scores, key=dict_scores.get).value
        max_score = max(dict_scores.values())
        correct_class = tw.get_language()
        label = "correct" if likely_class == correct_class else "wrong"
        output.create_trace_file(model_type, v, n, d, tw.get_id(),
                                 likely_class, max_score, correct_class, label)
def _make_tweets(ids, usernames, languages, messages):
    """Build a list of Tweet objects from four parallel attribute lists."""
    return [
        tweet.Tweet(tweet_id, username, lang, message)
        for tweet_id, username, lang, message in zip(ids, usernames, languages, messages)
    ]


def main():
    """Parse the input file, run the selected model, and write trace/evaluation files.

    Which model runs is controlled by the module-level ``model`` constant
    (1 = n-gram, 2 = naive Bayes).
    """
    the_input = input_parser.InputParser(inputPath)
    # Hyper-parameters: vocabulary, n-gram size, smoothing value.
    v = the_input.get_vocabulary()
    n = the_input.get_n_gram_size()
    d = the_input.get_smoothing_value()
    # NOTE(review): read from the input file but not used below -- kept for parity.
    training_file = the_input.get_training_file()
    testing_file = the_input.get_testing_file()
    # Read the training set and build its tweets.
    the_input.read_set_file("training")
    training_tweets = _make_tweets(
        the_input.get_tweet_training_ids(),
        the_input.get_tweet_training_usernames(),
        the_input.get_tweet_training_languages(),
        the_input.get_tweet_training_messages(),
    )
    # Read the testing set and build its tweets.
    the_input.read_set_file("testing")
    testing_tweets = _make_tweets(
        the_input.get_tweet_testing_ids(),
        the_input.get_tweet_testing_usernames(),
        the_input.get_tweet_testing_languages(),
        the_input.get_tweet_testing_messages(),
    )
    if model == 1:
        # MODEL 1: character n-gram model.
        output = output_parser.OutputParser()
        output.init_trace_file("normal", v, n, d)  # trace file for Model 1
        ngram = n_gram.Ngram(n, training_tweets, d, v)
        ngram.test_all(testing_tweets)
        ngram_scores = ngram.get_scores()
        generate_trace_file(v, n, d, output, testing_tweets, ngram_scores, "normal")
        output.create_evaluation_file("normal", v, n, d)
        print("Writing the evaluation file for Model 1...")
    if model == 2:
        # MODEL 2: naive Bayes over a corpus built from the training tweets.
        output = output_parser.OutputParser()
        output.init_trace_file("other", v, n, d)  # trace file for Model 2
        corpus_training_info = corpus_training.Corpus(v, training_tweets)
        model_2 = naive_bayes.NaiveBayes(v, d, training_tweets, corpus_training_info)
        corpus_testing_info = corpus_testing.Corpus(v, testing_tweets)
        model_2.test(v, testing_tweets, corpus_testing_info)
        naive_scores = model_2.init_dict(model_2.get_scores())
        generate_trace_file(v, n, d, output, testing_tweets, naive_scores, "other")
        output.create_evaluation_file("other", v, n, d)
        print("Writing the evaluation file for Model 2...")
    print("Completed both classification!")
# Guard the entry point so importing this module does not trigger a full run.
if __name__ == "__main__":
    main()