From 1714297c31c2854560f02f3155a3eed5d0f5a1c9 Mon Sep 17 00:00:00 2001
From: g
Date: Fri, 11 Dec 2015 09:44:08 -0300
Subject: [PATCH] Improve CNN support

---
 vdiscover/Cluster.py | 205 ++++++++++++++++++++-----------------------
 vpredictor           |  41 +++++----
 2 files changed, 120 insertions(+), 126 deletions(-)

diff --git a/vdiscover/Cluster.py b/vdiscover/Cluster.py
index 9d27372..28be12e 100644
--- a/vdiscover/Cluster.py
+++ b/vdiscover/Cluster.py
@@ -31,6 +31,81 @@
 from Utils import *
 from Pipeline import *
 
+def mk_cnn(mode, max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes, weights=None):
+
+    #print mode, max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes
+    from keras.preprocessing import sequence
+    from keras.optimizers import RMSprop
+    from keras.models import Sequential
+    from keras.layers.core import Dense, Dropout, Activation, Flatten
+    from keras.layers.embeddings import Embedding
+    from keras.layers.convolutional import Convolution1D, MaxPooling1D
+
+    print('Build model...')
+    model = Sequential()
+
+    # we start off with an efficient embedding layer which maps
+    # our vocab indices into embedding_dims dimensions
+    if mode == "train":
+        model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
+    elif mode == "test":
+        model.add(Embedding(max_features, embedding_dims, input_length=maxlen, weights=weights[0]))
+
+    model.add(Dropout(0.25))
+
+    # we add a Convolution1D, which will learn nb_filters
+    # word group filters of size filter_length:
+    if mode == "train":
+        model.add(Convolution1D(nb_filter=nb_filters,
+                                filter_length=filter_length,
+                                border_mode='valid',
+                                activation='relu',
+                                subsample_length=1))
+    elif mode == "test":
+        model.add(Convolution1D(nb_filter=nb_filters,
+                                filter_length=filter_length,
+                                border_mode='valid',
+                                activation='relu',
+                                subsample_length=1,
+                                weights=weights[2]))
+
+    # we use standard max pooling (halving the output of the previous layer):
+    model.add(MaxPooling1D(pool_length=2))
+
+    # We flatten the output of the conv layer, so that we can add a vanilla dense layer:
+    model.add(Flatten())
+
+    # Computing the output shape of a conv layer can be tricky;
+    # for a good tutorial, see: http://cs231n.github.io/convolutional-networks/
+    output_size = nb_filters * (((maxlen - filter_length) / 1) + 1) / 2
+    #print output_size, hidden_dims
+
+    # We add a vanilla hidden layer:
+    if mode == "train":
+        model.add(Dense(hidden_dims))
+    elif mode == "test":
+        model.add(Dense(hidden_dims, weights=weights[5]))
+
+    if mode == "train":
+        model.add(Dropout(0.25))
+        model.add(Activation('relu'))
+
+        # We project onto nb_classes output units and squash them with a softmax:
+        model.add(Dense(nb_classes))
+        model.add(Activation('softmax'))
+
+        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', class_mode="categorical")
+
+    elif mode == "test":
+        # No classifier head: compiled only so the trunk can be used for prediction.
+        model.compile(loss='mean_squared_error', optimizer='rmsprop')
+
+    return model
+
+
 def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
 
     f = open(model_file+".pre")
@@ -51,7 +126,7 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
 
     maxlen = window_size
     embedding_dims = 20
-    nb_filters = 50
+    nb_filters = 250
     filter_length = 3
     hidden_dims = 250
 
@@ -62,51 +137,10 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
 
     #y = train_programs
     X_train, y_train, labels = preprocessor.preprocess_traces(train_features, y_data=train_classes, labels=train_programs)
-
-    from keras.preprocessing import sequence
-    from keras.optimizers import RMSprop
-    from keras.models import Sequential
-    from keras.layers.core import Dense, Dropout, Activation, Flatten
-    from keras.layers.embeddings import Embedding
-    from keras.layers.convolutional import Convolution1D, MaxPooling1D
-
-    print('Build model...')
-    new_model = Sequential()
-
-    # we start off with an efficient embedding layer which maps
-    # our vocab indices into embedding_dims dimensions
-    new_model.add(Embedding(max_features, embedding_dims, weights=layers[0]))
-    new_model.add(Dropout(0.25))
-
-    # we add a Convolution1D, which will learn nb_filters
-    # word group filters of size filter_length:
-    new_model.add(Convolution1D(input_dim=embedding_dims,
-                                nb_filter=nb_filters,
-                                filter_length=filter_length,
-                                border_mode="valid",
-                                activation="relu",
-                                subsample_length=1,
-                                weights=layers[2]))
-
-    # we use standard max pooling (halving the output of the previous layer):
-    new_model.add(MaxPooling1D(pool_length=2))
-
-    # We flatten the output of the conv layer, so that we can add a vanilla dense layer:
-    new_model.add(Flatten())
-
-    # Computing the output shape of a conv layer can be tricky;
-    # for a good tutorial, see: http://cs231n.github.io/convolutional-networks/
-    output_size = nb_filters * (((maxlen - filter_length) / 1) + 1) / 2
-
-    # We add a vanilla hidden layer:
-    new_model.add(Dense(output_size, hidden_dims, weights=layers[5]))
-    #new_model.add(Dropout(0.25))
-    #new_model.add(Activation('relu'))
-
-    new_model.compile(loss='mean_squared_error', optimizer='rmsprop')
+    new_model = mk_cnn("test", max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, None, weights=layers)
 
     train_dict = dict()
-    train_dict[ftype] = new_model._predict(X_train)
+    train_dict[ftype] = new_model.predict(X_train)
 
     model = make_cluster_pipeline_subtraces(ftype)
     X_red = model.fit_transform(train_dict)
@@ -118,15 +152,15 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
         x = gauss(0,0.1) + x
         y = gauss(0,0.1) + y
         plt.scatter(x, y, c='r')
-        #plt.text(x, y+0.02, prog.split("/")[-1])
+        plt.text(x, y+0.02, prog.split("/")[-1])
 
-    if valid_file is not None: 
+    if valid_file is not None:
 
         valid_programs, valid_features, valid_classes = read_traces(valid_file, None, cut=10, maxsize=window_size) #None)
         valid_dict = dict()
 
         X_valid, _, valid_labels = preprocessor.preprocess_traces(valid_features, y_data=None, labels=valid_programs)
-        valid_dict[ftype] = new_model._predict(X_valid)
+        valid_dict[ftype] = new_model.predict(X_valid)
 
         X_red = model.transform(valid_dict)
         for prog,[x,y] in zip(valid_labels, X_red):
@@ -135,7 +169,8 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
             plt.scatter(x, y, c='b')
             plt.text(x, y+0.02, prog.split("/")[-1])
 
-    plt.savefig("plot.png")
+    plt.show()
+    #plt.savefig("plot.png")
 
     return None
@@ -167,7 +202,7 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
 
     plt.savefig("plot.png")
     #plt.show()
-    
+
     return zip(labels, cluster_labels)
     #csvwriter = open_csv(train_file+".clusters")
     #for (label, cluster_label) in zip(labels, cluster_labels):
@@ -176,7 +211,7 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
     #print "Clusters dumped!"
 
-def TrainDeepRepr(model_file, train_file, valid_file, ftype, nsamples):
+def TrainCnn(model_file, train_file, valid_file, ftype, nsamples):
 
     csvreader = open_csv(train_file)
@@ -192,7 +227,7 @@ def TrainDeepRepr(model_file, train_file, valid_file, ftype, nsamples):
     nb_filters = 250
     filter_length = 3
     hidden_dims = 250
-    nb_epoch = 1
+    nb_epoch = 100
 
     train_programs, train_features, train_classes = read_traces(train_file, nsamples, cut=None)
     train_size = len(train_features)
@@ -205,70 +240,22 @@ def TrainDeepRepr(model_file, train_file, valid_file, ftype, nsamples):
     max_features = len(tokenizer.word_counts)
 
     preprocessor = DeepReprPreprocessor(tokenizer, window_size, batch_size)
-    X_train,y_train = preprocessor.preprocess(train_features, 3000)
+    X_train,y_train = preprocessor.preprocess(train_features, 50000)
     nb_classes = len(preprocessor.classes)
     print preprocessor.classes
 
-    #print X_train[0], len(X_train[0])
-    #print X_train[1], len(X_train[1])
-
-    #print set(y_train)
-    #assert(0)
-
-    from keras.preprocessing import sequence
-    from keras.optimizers import RMSprop
-    from keras.models import Sequential
-    from keras.layers.core import Dense, Dropout, Activation, Flatten
-    from keras.layers.embeddings import Embedding
-    from keras.layers.convolutional import Convolution1D, MaxPooling1D
-
-    print('Build model...')
-    model = Sequential()
-
-    # we start off with an efficient embedding layer which maps
-    # our vocab indices into embedding_dims dimensions
-    model.add(Embedding(max_features, embedding_dims))
-    model.add(Dropout(0.25))
-
-    # we add a Convolution1D, which will learn nb_filters
-    # word group filters of size filter_length:
-    model.add(Convolution1D(input_dim=embedding_dims,
-                            nb_filter=nb_filters,
-                            filter_length=filter_length,
-                            border_mode="valid",
-                            activation="relu",
-                            subsample_length=1))
-
-    # we use standard max pooling (halving the output of the previous layer):
-    model.add(MaxPooling1D(pool_length=2))
-
-    # We flatten the output of the conv layer, so that we can add a vanilla dense layer:
-    model.add(Flatten())
-
-    # Computing the output shape of a conv layer can be tricky;
-    # for a good tutorial, see: http://cs231n.github.io/convolutional-networks/
-    output_size = nb_filters * (((maxlen - filter_length) / 1) + 1) / 2
-
-    # We add a vanilla hidden layer:
-    model.add(Dense(output_size, hidden_dims))
-    model.add(Dropout(0.25))
-    model.add(Activation('relu'))
-
-    # We project onto a single unit output layer, and squash it with a sigmoid:
-    model.add(Dense(hidden_dims, nb_classes))
-    model.add(Activation('softmax'))
-
-    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', class_mode="categorical")
+    model = mk_cnn("train", max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes)
     model.fit(X_train, y_train, validation_split=0.1, batch_size=batch_size, nb_epoch=nb_epoch, show_accuracy=True)
 
     model.mypreprocessor = preprocessor
-    model_file = "cluster-weights.hdf5"
+    #model_file = model_file + ".wei"
     #modelfile = open_model(model_file)
-    print "Saving model to",model_file
-    model.save_weights(model_file)
+    print "Saving model to",model_file + ".wei"
+    model.save_weights(model_file + ".wei")
 
-    model_file = "cluster-preprocessor.pklz"
-    modelfile = open_model(model_file)
-    print "Saving preprocessor to",model_file
+    #model_file = model_file + ".pre"
+    modelfile = open_model(model_file + ".pre")
+    print "Saving preprocessor to",model_file + ".pre"
     #model.save_weights(model_file)
     modelfile.write(pickle.dumps(preprocessor, protocol=2))
@@ -306,11 +293,11 @@ def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples):
 
             plt.text(x, y+0.02, prog.split("/")[-1])
 
-    if valid_file is not None: 
+    if valid_file is not None:
 
         valid_programs, valid_features, valid_classes = read_traces(valid_file, None)
         valid_dict = dict()
         valid_dict[ftype] = valid_features
-        
+
         X_red = model.transform(valid_dict)
         for prog,[x,y],cl in zip(valid_programs, X_red, valid_classes):
             x = gauss(0,0.1) + x
diff --git a/vpredictor b/vpredictor
index 2812e3a..6479a5d 100755
--- a/vpredictor
+++ b/vpredictor
@@ -29,6 +29,7 @@ sys.setrecursionlimit(1024*1024*1024)
 
 from vdiscover.Pipeline import *
 from vdiscover.Recall import Recall
+from vdiscover.Cluster import TrainCnn
 from vdiscover.Train import Train
 
 if __name__ == "__main__":
@@ -52,7 +53,7 @@ if __name__ == "__main__":
     parser.add_argument("--test-aggr",
                         help="Test a model using infile (recall only)",
                         action="store_true", default=False)
-    
+
     parser.add_argument("--static",
                         help="Use static features",
@@ -70,7 +71,7 @@ if __name__ == "__main__":
                         help="Cluster input traces using BOW",
                         action="store_true", default=False)
 
-    parser.add_argument("--cluster-conv",
+    parser.add_argument("--cluster-cnn",
                         help="Cluster input traces using a convolutional model",
                         action="store_true", default=False)
@@ -78,9 +79,13 @@ if __name__ == "__main__":
                         help="Train a Random Forest using infile",
                         action="store_true", default=False)
 
-    parser.add_argument("--train-lstm",
-                        help="Train a LSTM using infile (warning: very experimental and slow)",
-                        action="store_true", default=False)
+    #parser.add_argument("--train-lstm",
+    #                    help="Train an LSTM using infile (warning: very experimental and slow)",
+    #                    action="store_true", default=False)
+
+    #parser.add_argument("--train-cnn",
+    #                    help="Train a CNN using infile",
+    #                    action="store_true", default=False)
 
     parser.add_argument("--n-samples", type=int,
                         help="Select a number of samples from infile (train only)",
@@ -91,18 +96,20 @@ if __name__ == "__main__":
                         type=str, default="/dev/stdout")
 
     options = parser.parse_args()
-    in_file = options.infile 
+    in_file = options.infile
     valid_file = options.valid
-    
+
     test_simple = options.test
     test_aggr = options.test_aggr
 
     training_mode_rf = options.train_rf
-    training_mode_lstm = options.train_lstm
+    #training_mode_lstm = options.train_lstm
+    #training_mode_cnn = options.train_cnn
+
     training_mode_cluster_bow = options.cluster_bow
-    training_mode_cluster_conv = options.cluster_conv
+    training_mode_cluster_cnn = options.cluster_cnn
 
-    training_mode = training_mode_rf or training_mode_lstm or training_mode_cluster_bow or training_mode_cluster_conv
+    training_mode = training_mode_rf or training_mode_cluster_bow or training_mode_cluster_cnn
 
     probability_mode = options.prob
     nsamples = options.n_samples
@@ -124,21 +131,21 @@ if __name__ == "__main__":
     if training_mode:
         if training_mode_rf:
             Train(out_file, in_file, valid_file, "rf", ftype, nsamples)
-        #elif training_mode_:
-        #    Train(out_file, in_file, valid_file, "lstm", ftype, nsamples)
+        #elif training_mode_cnn:
         elif training_mode_cluster_bow:
             from vdiscover.Cluster import ClusterScikit
             #Cluster(in_file, valid_file, ftype, nsamples)
             ClusterScikit(None, in_file, valid_file, ftype, nsamples)
-        elif training_mode_cluster_conv:
+        elif training_mode_cluster_cnn:
             from vdiscover.Cluster import ClusterConv
             #Cluster(in_file, valid_file, ftype, nsamples)
 
             if (model_file is None):
-                print "Clustering using a convolutional model requires a pre-trained model"
-                exit(-1)
-
+                TrainCnn(out_file, in_file, valid_file, ftype, nsamples)
+                #print "Clustering using a convolutional model requires a pre-trained model"
+                exit(0)
+
             ClusterConv(model_file, in_file, valid_file, ftype, nsamples, None)
@@ -146,7 +153,7 @@ if __name__ == "__main__":
         if model_file is None:
             print "VDiscover requires a pre-trained model to predict"
             exit(-1)
-        
+
         test_mode = None
         if test_simple:
             test_mode = "simple"
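
Note on the output-size comment in mk_cnn: the formula only documents what Flatten() will produce, since in this Keras version the Dense layer infers its input size. A worked example under assumed values (maxlen comes from the pickled preprocessor's window_size at runtime, so 100 is hypothetical; nb_filters=250 and filter_length=3 are the defaults set in this patch), in Python 2 like the rest of the code:

    nb_filters = 250      # patch default
    filter_length = 3     # patch default
    maxlen = 100          # assumed window_size, for illustration only

    # 'valid' convolution with stride (subsample_length) 1:
    conv_steps = ((maxlen - filter_length) / 1) + 1   # 98 output positions
    # MaxPooling1D(pool_length=2) halves the sequence length:
    pooled_steps = conv_steps / 2                     # 49
    # Flatten() emits nb_filters values per remaining position:
    output_size = nb_filters * pooled_steps           # 250 * 49 = 12250
    print output_size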
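The two mk_cnn modes are meant to be used as a pair: "train" builds the full classifier, while "test" rebuilds only the feature-extraction trunk from saved weights, which is why it reads weights[0], weights[2] and weights[5]. A minimal sketch of that round trip, reusing TrainCnn's variables; the get_weights() extraction is an assumption for illustration (the patch itself obtains layers from the saved model files):

    # Layer indices in the "train" Sequential stack:
    #   0: Embedding  1: Dropout  2: Convolution1D  3: MaxPooling1D
    #   4: Flatten    5: Dense(hidden_dims)  6+: classifier head
    model = mk_cnn("train", max_features, maxlen, embedding_dims,
                   nb_filters, filter_length, hidden_dims, nb_classes)
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch)

    layers = [l.get_weights() for l in model.layers]  # assumed extraction step
    extractor = mk_cnn("test", max_features, maxlen, embedding_dims,
                       nb_filters, filter_length, hidden_dims, None, weights=layers)
    features = extractor.predict(X_train)  # hidden activations fed to clustering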