@@ -17,17 +17,18 @@ def __init__(self, hidden_sizes=None, activation_func=None, error_func=None, lr=
         self.error_func = error_func
         self.lr = lr
         self.output_size = output_size
-        self.init_weights(input_size, hidden_sizes, output_size)
+        self.init_weights(input_size, hidden_sizes, output_size)  # initialize random weights

     def init_weights(self, input_size, hidden_sizes, output_size):
-        # get all dimensions in the network
+        # get all layer sizes in the network
         layer_sizes = np.concatenate((input_size, hidden_sizes, output_size), axis=None).astype(int)

         for i in range(self.layer_num):
-            stdv = 1. / math.sqrt(layer_sizes[i])
-            self.net['w_' + str(i + 1)] = np.random.uniform(-stdv, stdv, (layer_sizes[i], layer_sizes[i + 1])).astype(
+            std = 1. / math.sqrt(layer_sizes[i])
+            # use float32 to avoid overflow in the upcoming calculations
+            self.net['w_' + str(i + 1)] = np.random.uniform(-std, std, (layer_sizes[i], layer_sizes[i + 1])).astype(
                 'float32')
-            self.net['b_' + str(i + 1)] = np.random.uniform(-stdv, stdv, layer_sizes[i + 1]).astype('float32')
+            self.net['b_' + str(i + 1)] = np.random.uniform(-std, std, layer_sizes[i + 1]).astype('float32')

     # Activation functions - Start
     def sigmoid(self, z):
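For reference, a minimal standalone sketch of the same fan-in-scaled uniform initialization, assuming toy layer sizes [4, 8, 3] (all names and sizes here are illustrative, not from the repo):

import math
import numpy as np

# assumed toy dimensions: 4 inputs, one hidden layer of 8 units, 3 outputs
layer_sizes = np.concatenate(([4], [8], [3]), axis=None).astype(int)
net = {}
for i in range(len(layer_sizes) - 1):
    std = 1. / math.sqrt(layer_sizes[i])  # bound scaled by the fan-in of layer i
    net['w_' + str(i + 1)] = np.random.uniform(-std, std, (layer_sizes[i], layer_sizes[i + 1])).astype('float32')
    net['b_' + str(i + 1)] = np.random.uniform(-std, std, layer_sizes[i + 1]).astype('float32')
print(net['w_1'].shape, net['b_1'].shape)  # (4, 8) (8,)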
@@ -40,7 +41,7 @@ def tanh(self, z):
     def relu(self, z):
         return np.maximum(0, z).astype('float32')

-    # Activation functions and derivatives - End
+    # Activation functions - End

     # Activation functions derivatives - Start
     def d_sigmoid(self, a):
@@ -55,60 +56,53 @@ def d_relu(self, z):
     # Activation functions derivatives - End

     def softmax(self, z):
-        shifted = z - np.max(z, axis=1, keepdims=True)
-        z = np.sum(np.exp(shifted), axis=1, keepdims=True)
-        log_probs = shifted - np.log(z)
+        shift_z = z - np.max(z, axis=1, keepdims=True)  # shift for stable softmax
+        exp_z = np.sum(np.exp(shift_z), axis=1, keepdims=True)
+        log_probs = shift_z - np.log(exp_z)
         probs = np.exp(log_probs)
         return log_probs, probs

     # Error functions - Start
-    def sum_neg_log_likelihood(self, z, y):
-        log_probs, probs = self.softmax(z)
-        n = z.shape[0]
+    def sum_neg_log_likelihood(self, y, probs, log_probs, n):
         loss = -np.sum(log_probs[np.arange(n), y]) / n
         d_x = probs.copy()
-        d_x[np.arange(n), y] -= 1
-        d_x /= n
+        d_x[np.arange(n), y] = d_x[np.arange(n), y] - 1
+        d_x = d_x / n
         return loss, d_x

-    def mean_squared_err(self, z, y):
-        _, probs = self.softmax(z)
-        n = z.shape[0]
+    def sum_squared_err(self, y, probs, n):
         one_hot_y = np.zeros((n, self.output_size), dtype='float32')
         one_hot_y[np.arange(n), y] = 1.
-        loss = np.sum(np.power(one_hot_y - probs, 2)) / n
-        d_x = -2 * (one_hot_y - probs) / n
+        loss = np.sum(np.power(one_hot_y - probs, 2))
+        d_x = -2 * (one_hot_y - probs)
         return loss, d_x

-    def sum_squared_err(self, z, y):
-        _, probs = self.softmax(z)
-        n = z.shape[0]
+    def mean_squared_err(self, y, probs, n):
         one_hot_y = np.zeros((n, self.output_size), dtype='float32')
         one_hot_y[np.arange(n), y] = 1.
-        loss = np.sum(np.power(one_hot_y - probs, 2))
-        d_x = -2 * (one_hot_y - probs)
+        loss = np.sum(np.power(one_hot_y - probs, 2)) / n
+        d_x = -2 * (one_hot_y - probs) / n
         return loss, d_x

     # Error functions - End

     # Forward - Start
-    def forward_pass(self, X, valid=False):
+    def forward_pass(self, X):
         inputs = X
-        self.caches = []
+        self.layer_history = []  # keep forward pass information for backward pass

-        for i in range(self.layer_num - 1):
-            inputs, cache = self.activated_forward(inputs, self.net['w_' + str(i + 1)], self.net['b_' + str(i + 1)])
-            self.caches.append(cache)
+        for i in range(self.layer_num - 1):  # apply forward pass and activation for each layer except last one
+            inputs, history = self.activated_forward(inputs, self.net['w_' + str(i + 1)], self.net['b_' + str(i + 1)])
+            self.layer_history.append(history)

-        scores, cache = self.forward(inputs, self.net['w_' + str(self.layer_num)], self.net['b_' + str(self.layer_num)])
-        if not valid:
-            self.caches.append(cache)
+        scores, history = self.forward(inputs, self.net['w_' + str(self.layer_num)],
+                                       self.net['b_' + str(self.layer_num)])
+        self.layer_history.append(history)
         return scores

     def forward(self, x, w, b):
-        z = x.reshape(x.shape[0], -1).dot(w) + b
-        cache = (x, w, b)
-        return z, cache
+        z = x.reshape(x.shape[0], -1).dot(w) + b  # linear formula computation
+        return z, (x, w, b)

     def activate(self, z):
         if self.activation_func == 'sigmoid':
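For reference, a standalone check (not part of this commit; the function name here is illustrative) that the shift-based softmax above matches a naive softmax on small logits and stays finite where the naive version overflows:

import numpy as np

def stable_softmax(z):
    shift_z = z - np.max(z, axis=1, keepdims=True)   # same shift as in the diff
    exp_z = np.sum(np.exp(shift_z), axis=1, keepdims=True)
    log_probs = shift_z - np.log(exp_z)
    return log_probs, np.exp(log_probs)

z = np.array([[1.0, 2.0, 3.0], [1000.0, 1000.5, 999.0]])
_, probs = stable_softmax(z)
with np.errstate(over='ignore', invalid='ignore'):
    naive = np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)  # second row overflows to nan
print(np.allclose(probs[0], naive[0]))  # True
print(probs.sum(axis=1))                # [1. 1.] -- both rows remain valid distributions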
@@ -120,35 +114,41 @@ def activate(self, z):
         return activated

     def activated_forward(self, x, w, b):
-        z, fwd_cache = self.forward(x, w, b)
+        z, fwd_history = self.forward(x, w, b)
         activated = self.activate(z)
-        return activated, (fwd_cache, z, activated)
+        return activated, (fwd_history, z, activated)

     # Forward - End

     # Backward - Start
     def backward_pass(self, scores, y):
         gradients = {}
+        log_probs, probs = self.softmax(scores)
+        n = scores.shape[0]
+
+        # get loss and derivative of error wrt output
         if self.error_func == 'log':
-            loss, d_o = self.sum_neg_log_likelihood(scores, y)
+            loss, d_o = self.sum_neg_log_likelihood(y, probs, log_probs, n)
         elif self.error_func == 'sse':
-            loss, d_o = self.sum_squared_err(scores, y)
+            loss, d_o = self.sum_squared_err(y, probs, n)
         elif self.error_func == 'mse':
-            loss, d_o = self.mean_squared_err(scores, y)
+            loss, d_o = self.mean_squared_err(y, probs, n)

-        d_o, d_w, d_b = self.backward(d_o, self.caches.pop())
+        # apply backward pass to compute gradients
+        d_o, d_w, d_b = self.backward(d_o, self.layer_history.pop())
         gradients['w_' + str(self.layer_num)] = d_w
         gradients['b_' + str(self.layer_num)] = d_b

         for i in range(self.layer_num - 2, -1, -1):
-            d_o, d_w, d_b = self.activated_backward(d_o, self.caches.pop())
+            d_o, d_w, d_b = self.activated_backward(d_o, self.layer_history.pop())
             gradients['w_' + str(i + 1)] = d_w
             gradients['b_' + str(i + 1)] = d_b

         return loss, gradients

-    def backward(self, d_o, cache):
-        x, w, b = cache
+    def backward(self, d_o, history):
+        x, w, b = history
+        # compute gradients of input, weight and bias
         d_x = d_o.dot(w.T).reshape(x.shape)
         d_w = x.reshape(x.shape[0], -1).T.dot(d_o)
         d_b = np.sum(d_o, axis=0)
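For reference, a quick finite-difference check (illustrative only, not part of the repo) of the d_w formula used in backward(), for a hypothetical 4-in / 3-out linear layer:

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((5, 4))    # batch of 5 inputs
w = rng.standard_normal((4, 3))
b = rng.standard_normal(3)
d_o = rng.standard_normal((5, 3))  # upstream gradient from the next layer

# analytic gradient, same formula as backward()
d_w = x.reshape(x.shape[0], -1).T.dot(d_o)

# numerical gradient of f(w) = sum((x @ w + b) * d_o) with respect to w[0, 0]
eps = 1e-6
w_plus, w_minus = w.copy(), w.copy()
w_plus[0, 0] += eps
w_minus[0, 0] -= eps
num = (np.sum((x.dot(w_plus) + b) * d_o) - np.sum((x.dot(w_minus) + b) * d_o)) / (2 * eps)
print(np.isclose(num, d_w[0, 0]))  # True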
@@ -161,20 +161,20 @@ def d_activate(self, d_o, z, a):
             d_x = self.d_tanh(a)
         elif self.activation_func == 'relu':
             d_x = self.d_relu(z)
-        return d_x * d_o
+        return d_x * d_o  # apply chain rule

-    def activated_backward(self, d_o, cache):
-        fwd_cache, z_cache, a_cache = cache
-        d_a = self.d_activate(d_o, z_cache, a_cache)
-        return self.backward(d_a, fwd_cache)
+    def activated_backward(self, d_o, history):
+        fwd_history, z_history, a_history = history
+        d_a = self.d_activate(d_o, z_history, a_history)
+        return self.backward(d_a, fwd_history)

     def update_weights(self, gradients):
-        for param, w in self.net.items():
-            updated_w = self.sgd(w, gradients[param])
+        for param, w in self.net.items():  # update each parameter in the network
+            updated_w = self.gradient_descent(w, gradients[param])
             self.net[param] = updated_w

-    def sgd(self, w, d_w):
-        w -= self.lr * d_w
+    def gradient_descent(self, w, d_w):
+        w = w - self.lr * d_w  # apply gradient descent to update the weights
         return w

     # Backward - End
@@ -185,7 +185,8 @@ def train(self, X, y):
         return loss, gradients

     def predict(self, X):
-        return self.forward_pass(X, valid=True)
+        scores = self.forward_pass(X)
+        return np.argmax(scores, axis=1)  # predict the label with max score

     def extract_model(self):
         name = '%dnn_lr=%0.3f_err=%s_act=%s_vgg.pkl' % (self.layer_num, self.lr, self.error_func, self.activation_func)
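For reference, a toy illustration (made-up scores, not from the repo) of how the new predict() turns raw forward-pass scores of shape (n_samples, output_size) into class labels:

import numpy as np

scores = np.array([[0.1, 2.3, -0.5, 0.9],
                   [1.7, 0.2, 0.4, 0.0],
                   [-1.0, -0.2, 3.1, 0.5]])
print(np.argmax(scores, axis=1))  # [1 0 2] -> predicted label per sample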