# markov-model-builder.py
#
# Builds a first-order, character-level Markov model over the letters a-z from
# a text file and saves the initial-letter and letter-transition likelihoods.
import numpy as np
import sys
from tqdm import tqdm
# Code point of 'a'. Subtracting it from ord(c) turns a lowercase letter into a
# zero-based array index ('a' -> 0, 'b' -> 1, ...).
INDEX_OFFSET = ord("a")
def train(file):
    """Estimate a first-order letter Markov model from lines of text.

    Every line is lower-cased and stripped of surrounding whitespace. The
    first character of a line is counted as an initial state, and every pair
    of adjacent characters is counted as a transition "c is followed by
    c_follow". The counts are then converted to likelihoods with normalize().

    Only the 26 letters 'a'-'z' are modelled:
      * blank lines are skipped (indexing line[0] on them would raise
        IndexError);
      * characters outside 'a'-'z' are ignored, together with any transition
        they take part in. Without this check such characters either raise
        IndexError or, for code points just below 'a', produce a negative
        index that silently wraps around to the wrong letter.

    Args:
        file: Iterable of text lines, e.g. an open text file.

    Returns:
        Tuple (initial_state_likelihood_vector, follow_likelihood_matrix):
        the normalized length-26 vector of initial-letter counts and the
        normalized 26x26 matrix of transition counts (row = current letter,
        column = following letter).
    """
    alphabet_size = 26

    # Count with integers: a float32 accumulator stops changing on "+= 1" once
    # it reaches 2**24, which would silently cap counts on large inputs.
    initial_state_counts = np.zeros(alphabet_size, dtype=np.int64)
    follow_counts = np.zeros([alphabet_size, alphabet_size], dtype=np.int64)

    # iterate through lines in training set
    for line in tqdm(file):
        line = line.lower().strip()
        if not line:
            continue  # nothing to learn from a blank line

        indices = [char_to_index(c) for c in line]

        # increment count for the starting char, if it is a modelled letter
        first = indices[0]
        if 0 <= first < alphabet_size:
            initial_state_counts[first] += 1

        # iterate through all chars of the line and their respective follow char
        for current, following in zip(indices, indices[1:]):
            if 0 <= current < alphabet_size and 0 <= following < alphabet_size:
                follow_counts[current, following] += 1

    # convert counts to likelihoods (the matrix is normalized row-wise);
    # float32 keeps the dtype of the saved model unchanged
    initial_state_likelihood_vector = normalize(initial_state_counts.astype(np.float32))
    follow_likelihood_matrix = normalize(follow_counts.astype(np.float32))
    return initial_state_likelihood_vector, follow_likelihood_matrix
def char_to_index(c):
    """Return the array index for character `c`.

    The index is the code point of `c` minus INDEX_OFFSET. No range check is
    performed here, so the result can be negative or larger than the alphabet
    for characters the model does not cover.
    """
    code_point = ord(c)
    return code_point - INDEX_OFFSET
def normalize(x):
    """Convert an array of counts to likelihoods.

    A 1-D array is divided by its total. An array with more dimensions is
    normalized along its first axis: each x[i] is divided by the sum of all
    entries of x[i] (row-wise for a matrix).

    A vector or row whose total is 0 is returned as all zeros instead of NaN
    (0 / 0), so states that never occur in the training data do not poison
    the model with NaNs.

    The input array is left unmodified; a new array is returned.

    Args:
        x: NumPy array of counts with at least one dimension.

    Returns:
        New array with the same shape as `x`. Floating-point inputs keep
        their dtype; other inputs produce float64.
    """
    if x.ndim == 1:
        # normalize initial state vector by its overall total
        totals = x.sum()
    else:
        # one total per first-axis entry, kept broadcastable against x
        totals = x.sum(axis=tuple(range(1, x.ndim)), keepdims=True)

    out_dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float64
    likelihoods = np.zeros(x.shape, dtype=out_dtype)
    # `where` skips the division wherever the total is 0, leaving the
    # pre-filled zeros in place.
    np.divide(x, totals, out=likelihoods, where=totals != 0)
    return likelihoods
def verify_args_or_die(argv):
    """Terminate with a usage message unless exactly two arguments are given.

    Args:
        argv: Command-line arguments without the script name.

    Raises:
        SystemExit: with status 2 when len(argv) != 2. The usage text is
            written to stderr so it does not mix with regular output, and the
            non-zero status lets calling scripts detect the failure.
    """
    if len(argv) == 2:
        return
    print("Usage: markov-model-builder.py <input_filename> <output_filename_no_extension>", file=sys.stderr)
    # sys.exit rather than the `exit` builtin, which is only injected by the
    # `site` module and is meant for interactive use.
    sys.exit(2)
def main(argv):
    """Train a model from the input file and save it as two NumPy files.

    Args:
        argv: Command-line arguments without the script name:
            [input_filename, output_filename_no_extension].
    """
    # check and parse cmd args
    verify_args_or_die(argv)
    input_filename, output_filename = argv[0], argv[1]

    # load training file
    print("reading input file")
    with open(input_filename, mode="r") as infile:
        # train() iterates over the file object, so it has to run while the
        # file is still open.
        print("training Markov model")
        initial_states, follow = train(infile)

    # export learned model parts: initial char likelihoods, then follow likelihoods
    print("saving model")
    model_parts = (("_initial", initial_states), ("_follow", follow))
    for suffix, array in model_parts:
        np.save(output_filename + suffix, array)
    print("done")
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    cli_args = sys.argv[1:]  # drop the script name
    main(cli_args)