@@ -17,17 +17,18 @@ def __init__(self, hidden_sizes=None, activation_func=None, error_func=None, lr=
         self.error_func = error_func
         self.lr = lr
         self.output_size = output_size
-        self.init_weights(input_size, hidden_sizes, output_size)
+        self.init_weights(input_size, hidden_sizes, output_size)  # initialize random weights

     def init_weights(self, input_size, hidden_sizes, output_size):
-        # get all dimensions in the network
+        # get all layer sizes in the network
         layer_sizes = np.concatenate((input_size, hidden_sizes, output_size), axis=None).astype(int)

         for i in range(self.layer_num):
-            stdv = 1. / math.sqrt(layer_sizes[i])
-            self.net['w_' + str(i + 1)] = np.random.uniform(-stdv, stdv, (layer_sizes[i], layer_sizes[i + 1])).astype(
+            std = 1. / math.sqrt(layer_sizes[i])
+            # use float32 to avoid overflow in the upcoming calculations
+            self.net['w_' + str(i + 1)] = np.random.uniform(-std, std, (layer_sizes[i], layer_sizes[i + 1])).astype(
                 'float32')
-            self.net['b_' + str(i + 1)] = np.random.uniform(-stdv, stdv, layer_sizes[i + 1]).astype('float32')
+            self.net['b_' + str(i + 1)] = np.random.uniform(-std, std, layer_sizes[i + 1]).astype('float32')

     # Activation functions - Start
     def sigmoid(self, z):
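For reference, a minimal standalone sketch of the same fan-in-scaled uniform initialization, assuming toy layer sizes [4, 8, 3] (all names and sizes here are illustrative, not from the repo):

import math
import numpy as np

# assumed toy dimensions: 4 inputs, one hidden layer of 8 units, 3 outputs
layer_sizes = np.concatenate(([4], [8], [3]), axis=None).astype(int)
net = {}
for i in range(len(layer_sizes) - 1):
    std = 1. / math.sqrt(layer_sizes[i])  # bound scaled by the fan-in of layer i
    net['w_' + str(i + 1)] = np.random.uniform(-std, std, (layer_sizes[i], layer_sizes[i + 1])).astype('float32')
    net['b_' + str(i + 1)] = np.random.uniform(-std, std, layer_sizes[i + 1]).astype('float32')
print(net['w_1'].shape, net['b_1'].shape)  # (4, 8) (8,)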
@@ -40,7 +41,7 @@ def tanh(self, z):
     def relu(self, z):
         return np.maximum(0, z).astype('float32')

-    # Activation functions and derivatives - End
+    # Activation functions - End

     # Activation functions derivatives - Start
     def d_sigmoid(self, a):
@@ -55,60 +56,53 @@ def d_relu(self, z):
     # Activation functions derivatives - End

     def softmax(self, z):
-        shifted = z - np.max(z, axis=1, keepdims=True)
-        z = np.sum(np.exp(shifted), axis=1, keepdims=True)
-        log_probs = shifted - np.log(z)
+        shift_z = z - np.max(z, axis=1, keepdims=True)  # shift for stable softmax
+        exp_z = np.sum(np.exp(shift_z), axis=1, keepdims=True)
+        log_probs = shift_z - np.log(exp_z)
         probs = np.exp(log_probs)
         return log_probs, probs

     # Error functions - Start
-    def sum_neg_log_likelihood(self, z, y):
-        log_probs, probs = self.softmax(z)
-        n = z.shape[0]
+    def sum_neg_log_likelihood(self, y, probs, log_probs, n):
         loss = -np.sum(log_probs[np.arange(n), y]) / n
         d_x = probs.copy()
-        d_x[np.arange(n), y] -= 1
-        d_x /= n
+        d_x[np.arange(n), y] = d_x[np.arange(n), y] - 1
+        d_x = d_x / n
         return loss, d_x

-    def mean_squared_err(self, z, y):
-        _, probs = self.softmax(z)
-        n = z.shape[0]
+    def sum_squared_err(self, y, probs, n):
         one_hot_y = np.zeros((n, self.output_size), dtype='float32')
         one_hot_y[np.arange(n), y] = 1.
-        loss = np.sum(np.power(one_hot_y - probs, 2)) / n
-        d_x = -2 * (one_hot_y - probs) / n
+        loss = np.sum(np.power(one_hot_y - probs, 2))
+        d_x = -2 * (one_hot_y - probs)
         return loss, d_x

-    def sum_squared_err(self, z, y):
-        _, probs = self.softmax(z)
-        n = z.shape[0]
+    def mean_squared_err(self, y, probs, n):
         one_hot_y = np.zeros((n, self.output_size), dtype='float32')
         one_hot_y[np.arange(n), y] = 1.
-        loss = np.sum(np.power(one_hot_y - probs, 2))
-        d_x = -2 * (one_hot_y - probs)
+        loss = np.sum(np.power(one_hot_y - probs, 2)) / n
+        d_x = -2 * (one_hot_y - probs) / n
         return loss, d_x

     # Error functions - End

     # Forward - Start
-    def forward_pass(self, X, valid=False):
+    def forward_pass(self, X):
         inputs = X
-        self.caches = []
+        self.layer_history = []  # keep forward pass information for backward pass

-        for i in range(self.layer_num - 1):
-            inputs, cache = self.activated_forward(inputs, self.net['w_' + str(i + 1)], self.net['b_' + str(i + 1)])
-            self.caches.append(cache)
+        for i in range(self.layer_num - 1):  # apply forward pass and activation for each layer except last one
+            inputs, history = self.activated_forward(inputs, self.net['w_' + str(i + 1)], self.net['b_' + str(i + 1)])
+            self.layer_history.append(history)

-        scores, cache = self.forward(inputs, self.net['w_' + str(self.layer_num)], self.net['b_' + str(self.layer_num)])
-        if not valid:
-            self.caches.append(cache)
+        scores, history = self.forward(inputs, self.net['w_' + str(self.layer_num)],
+                                       self.net['b_' + str(self.layer_num)])
+        self.layer_history.append(history)
         return scores

     def forward(self, x, w, b):
-        z = x.reshape(x.shape[0], -1).dot(w) + b
-        cache = (x, w, b)
-        return z, cache
+        z = x.reshape(x.shape[0], -1).dot(w) + b  # linear formula computation
+        return z, (x, w, b)

     def activate(self, z):
         if self.activation_func == 'sigmoid':
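For reference, a standalone check (not part of this commit; the function name here is illustrative) that the shift-based softmax above matches a naive softmax on small logits and stays finite where the naive version overflows:

import numpy as np

def stable_softmax(z):
    shift_z = z - np.max(z, axis=1, keepdims=True)   # same shift as in the diff
    exp_z = np.sum(np.exp(shift_z), axis=1, keepdims=True)
    log_probs = shift_z - np.log(exp_z)
    return log_probs, np.exp(log_probs)

z = np.array([[1.0, 2.0, 3.0], [1000.0, 1000.5, 999.0]])
_, probs = stable_softmax(z)
with np.errstate(over='ignore', invalid='ignore'):
    naive = np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)  # second row overflows to nan
print(np.allclose(probs[0], naive[0]))  # True
print(probs.sum(axis=1))                # [1. 1.] -- both rows remain valid distributions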
@@ -120,35 +114,41 @@ def activate(self, z):
         return activated

     def activated_forward(self, x, w, b):
-        z, fwd_cache = self.forward(x, w, b)
+        z, fwd_history = self.forward(x, w, b)
         activated = self.activate(z)
-        return activated, (fwd_cache, z, activated)
+        return activated, (fwd_history, z, activated)

     # Forward - End

     # Backward - Start
     def backward_pass(self, scores, y):
         gradients = {}
+        log_probs, probs = self.softmax(scores)
+        n = scores.shape[0]
+
+        # get loss and derivative of error wrt output
         if self.error_func == 'log':
-            loss, d_o = self.sum_neg_log_likelihood(scores, y)
+            loss, d_o = self.sum_neg_log_likelihood(y, probs, log_probs, n)
         elif self.error_func == 'sse':
-            loss, d_o = self.sum_squared_err(scores, y)
+            loss, d_o = self.sum_squared_err(y, probs, n)
         elif self.error_func == 'mse':
-            loss, d_o = self.mean_squared_err(scores, y)
+            loss, d_o = self.mean_squared_err(y, probs, n)

-        d_o, d_w, d_b = self.backward(d_o, self.caches.pop())
+        # apply backward pass to compute gradients
+        d_o, d_w, d_b = self.backward(d_o, self.layer_history.pop())
         gradients['w_' + str(self.layer_num)] = d_w
         gradients['b_' + str(self.layer_num)] = d_b

         for i in range(self.layer_num - 2, -1, -1):
-            d_o, d_w, d_b = self.activated_backward(d_o, self.caches.pop())
+            d_o, d_w, d_b = self.activated_backward(d_o, self.layer_history.pop())
             gradients['w_' + str(i + 1)] = d_w
             gradients['b_' + str(i + 1)] = d_b

         return loss, gradients

-    def backward(self, d_o, cache):
-        x, w, b = cache
+    def backward(self, d_o, history):
+        x, w, b = history
+        # compute gradients of input, weight and bias
         d_x = d_o.dot(w.T).reshape(x.shape)
         d_w = x.reshape(x.shape[0], -1).T.dot(d_o)
         d_b = np.sum(d_o, axis=0)
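For reference, a quick finite-difference check (illustrative only, not part of the repo) of the d_w formula used in backward(), for a hypothetical 4-in / 3-out linear layer:

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((5, 4))    # batch of 5 inputs
w = rng.standard_normal((4, 3))
b = rng.standard_normal(3)
d_o = rng.standard_normal((5, 3))  # upstream gradient from the next layer

# analytic gradient, same formula as backward()
d_w = x.reshape(x.shape[0], -1).T.dot(d_o)

# numerical gradient of f(w) = sum((x @ w + b) * d_o) with respect to w[0, 0]
eps = 1e-6
w_plus, w_minus = w.copy(), w.copy()
w_plus[0, 0] += eps
w_minus[0, 0] -= eps
num = (np.sum((x.dot(w_plus) + b) * d_o) - np.sum((x.dot(w_minus) + b) * d_o)) / (2 * eps)
print(np.isclose(num, d_w[0, 0]))  # True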
@@ -161,20 +161,20 @@ def d_activate(self, d_o, z, a):
             d_x = self.d_tanh(a)
         elif self.activation_func == 'relu':
             d_x = self.d_relu(z)
-        return d_x * d_o
+        return d_x * d_o  # apply chain rule

-    def activated_backward(self, d_o, cache):
-        fwd_cache, z_cache, a_cache = cache
-        d_a = self.d_activate(d_o, z_cache, a_cache)
-        return self.backward(d_a, fwd_cache)
+    def activated_backward(self, d_o, history):
+        fwd_history, z_history, a_history = history
+        d_a = self.d_activate(d_o, z_history, a_history)
+        return self.backward(d_a, fwd_history)

     def update_weights(self, gradients):
-        for param, w in self.net.items():
-            updated_w = self.sgd(w, gradients[param])
+        for param, w in self.net.items():  # update each parameter in the network
+            updated_w = self.gradient_descent(w, gradients[param])
             self.net[param] = updated_w

-    def sgd(self, w, d_w):
-        w -= self.lr * d_w
+    def gradient_descent(self, w, d_w):
+        w = w - self.lr * d_w  # apply gradient descent to update the weights
         return w

     # Backward - End
@@ -185,7 +185,8 @@ def train(self, X, y):
         return loss, gradients

     def predict(self, X):
-        return self.forward_pass(X, valid=True)
+        scores = self.forward_pass(X)
+        return np.argmax(scores, axis=1)  # predict the label with max score

     def extract_model(self):
         name = '%dnn_lr=%0.3f_err=%s_act=%s_vgg.pkl' % (self.layer_num, self.lr, self.error_func, self.activation_func)
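For reference, a toy illustration (made-up scores, not from the repo) of how the new predict() turns raw forward-pass scores of shape (n_samples, output_size) into class labels:

import numpy as np

scores = np.array([[0.1, 2.3, -0.5, 0.9],
                   [1.7, 0.2, 0.4, 0.0],
                   [-1.0, -0.2, 3.1, 0.5]])
print(np.argmax(scores, axis=1))  # [1 0 2] -> predicted label per sample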