From 1714297c31c2854560f02f3155a3eed5d0f5a1c9 Mon Sep 17 00:00:00 2001
From: g
Date: Fri, 11 Dec 2015 09:44:08 -0300
Subject: [PATCH] Improve CNN support

---
 vdiscover/Cluster.py | 205 ++++++++++++++++++++-----------------------
 vpredictor           |  41 +++++----
 2 files changed, 120 insertions(+), 126 deletions(-)

diff --git a/vdiscover/Cluster.py b/vdiscover/Cluster.py
index 9d27372..28be12e 100644
--- a/vdiscover/Cluster.py
+++ b/vdiscover/Cluster.py
@@ -31,6 +31,81 @@
 from Utils import *
 from Pipeline import *
 
+def mk_cnn(mode, max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes, weights=None):
+
+    #print mode, max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes
+    from keras.preprocessing import sequence
+    from keras.optimizers import RMSprop
+    from keras.models import Sequential
+    from keras.layers.core import Dense, Dropout, Activation, Flatten
+    from keras.layers.embeddings import Embedding
+    from keras.layers.convolutional import Convolution1D, MaxPooling1D
+
+    print('Build model...')
+    model = Sequential()
+
+    # we start off with an efficient embedding layer which maps
+    # our vocab indices into embedding_dims dimensions
+    if mode == "train":
+        model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
+    elif mode == "test":
+        model.add(Embedding(max_features, embedding_dims, input_length=maxlen, weights=weights[0]))
+
+    model.add(Dropout(0.25))
+
+    # we add a Convolution1D, which will learn nb_filters
+    # word group filters of size filter_length:
+    if mode == "train":
+        model.add(Convolution1D(nb_filter=nb_filters,
+                                filter_length=filter_length,
+                                border_mode='valid',
+                                activation='relu',
+                                subsample_length=1))
+    elif mode == "test":
+        model.add(Convolution1D(nb_filter=nb_filters,
+                                filter_length=filter_length,
+                                border_mode='valid',
+                                activation='relu',
+                                subsample_length=1,
+                                weights=weights[2]))
+
+    # we use standard max pooling (halving the output of the previous layer):
+    model.add(MaxPooling1D(pool_length=2))
+
+    # We flatten the output of the conv layer, so that we can add a vanilla dense layer:
+    model.add(Flatten())
+
+    # Computing the output shape of a conv layer can be tricky;
+    # for a good tutorial, see: http://cs231n.github.io/convolutional-networks/
+    output_size = nb_filters * (((maxlen - filter_length) / 1) + 1) / 2
+    #print output_size, hidden_dims
+
+    # We add a vanilla hidden layer:
+    if mode == "train":
+        model.add(Dense(hidden_dims))
+    elif mode == "test":
+        model.add(Dense(hidden_dims, weights=weights[5]))
+
+    if mode == "train":
+        model.add(Dropout(0.25))
+        model.add(Activation('relu'))
+
+        # We project onto nb_classes output units and squash them with a softmax:
+        model.add(Dense(nb_classes))
+        model.add(Activation('softmax'))
+
+        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', class_mode="categorical")
+
+    elif mode == "test":
+        # No classifier head: compiled only so the trunk can be used for prediction.
+        model.compile(loss='mean_squared_error', optimizer='rmsprop')
+
+    return model
+
+
 def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
 
     f = open(model_file+".pre")
@@ -51,7 +126,7 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
 
     maxlen = window_size
     embedding_dims = 20
-    nb_filters = 50
+    nb_filters = 250
     filter_length = 3
     hidden_dims = 250
 
@@ -62,51 +137,10 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
 
     #y = train_programs
     X_train, y_train, labels = preprocessor.preprocess_traces(train_features, y_data=train_classes, labels=train_programs)
-
-    from keras.preprocessing import sequence
-    from keras.optimizers import RMSprop
-    from keras.models import Sequential
-    from keras.layers.core import Dense, Dropout, Activation, Flatten
-    from keras.layers.embeddings import Embedding
-    from keras.layers.convolutional import Convolution1D, MaxPooling1D
-
-    print('Build model...')
-    new_model = Sequential()
-
-    # we start off with an efficient embedding layer which maps
-    # our vocab indices into embedding_dims dimensions
-    new_model.add(Embedding(max_features, embedding_dims, weights=layers[0]))
-    new_model.add(Dropout(0.25))
-
-    # we add a Convolution1D, which will learn nb_filters
-    # word group filters of size filter_length:
-    new_model.add(Convolution1D(input_dim=embedding_dims,
-                                nb_filter=nb_filters,
-                                filter_length=filter_length,
-                                border_mode="valid",
-                                activation="relu",
-                                subsample_length=1,
-                                weights=layers[2]))
-
-    # we use standard max pooling (halving the output of the previous layer):
-    new_model.add(MaxPooling1D(pool_length=2))
-
-    # We flatten the output of the conv layer, so that we can add a vanilla dense layer:
-    new_model.add(Flatten())
-
-    # Computing the output shape of a conv layer can be tricky;
-    # for a good tutorial, see: http://cs231n.github.io/convolutional-networks/
-    output_size = nb_filters * (((maxlen - filter_length) / 1) + 1) / 2
-
-    # We add a vanilla hidden layer:
-    new_model.add(Dense(output_size, hidden_dims, weights=layers[5]))
-    #new_model.add(Dropout(0.25))
-    #new_model.add(Activation('relu'))
-
-    new_model.compile(loss='mean_squared_error', optimizer='rmsprop')
+    new_model = mk_cnn("test", max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, None, weights=layers)
 
     train_dict = dict()
-    train_dict[ftype] = new_model._predict(X_train)
+    train_dict[ftype] = new_model.predict(X_train)
 
     model = make_cluster_pipeline_subtraces(ftype)
     X_red = model.fit_transform(train_dict)
@@ -118,15 +152,15 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
         x = gauss(0,0.1) + x
         y = gauss(0,0.1) + y
         plt.scatter(x, y, c='r')
-        #plt.text(x, y+0.02, prog.split("/")[-1])
+        plt.text(x, y+0.02, prog.split("/")[-1])
 
-    if valid_file is not None: 
+    if valid_file is not None:
 
         valid_programs, valid_features, valid_classes = read_traces(valid_file, None, cut=10, maxsize=window_size) #None)
         valid_dict = dict()
 
         X_valid, _, valid_labels = preprocessor.preprocess_traces(valid_features, y_data=None, labels=valid_programs)
-        valid_dict[ftype] = new_model._predict(X_valid)
+        valid_dict[ftype] = new_model.predict(X_valid)
 
         X_red = model.transform(valid_dict)
         for prog,[x,y] in zip(valid_labels, X_red):
@@ -135,7 +169,8 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
             plt.scatter(x, y, c='b')
             plt.text(x, y+0.02, prog.split("/")[-1])
 
-    plt.savefig("plot.png")
+    plt.show()
+    #plt.savefig("plot.png")
 
     return None
@@ -167,7 +202,7 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
 
     plt.savefig("plot.png")
     #plt.show()
-    
+
     return zip(labels, cluster_labels)
     #csvwriter = open_csv(train_file+".clusters")
     #for (label, cluster_label) in zip(labels, cluster_labels):
@@ -176,7 +211,7 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
     #print "Clusters dumped!"
 
-def TrainDeepRepr(model_file, train_file, valid_file, ftype, nsamples):
+def TrainCnn(model_file, train_file, valid_file, ftype, nsamples):
 
     csvreader = open_csv(train_file)
@@ -192,7 +227,7 @@ def TrainDeepRepr(model_file, train_file, valid_file, ftype, nsamples):
     nb_filters = 250
     filter_length = 3
     hidden_dims = 250
-    nb_epoch = 1
+    nb_epoch = 100
 
     train_programs, train_features, train_classes = read_traces(train_file, nsamples, cut=None)
     train_size = len(train_features)
@@ -205,70 +240,22 @@ def TrainDeepRepr(model_file, train_file, valid_file, ftype, nsamples):
     max_features = len(tokenizer.word_counts)
 
     preprocessor = DeepReprPreprocessor(tokenizer, window_size, batch_size)
-    X_train,y_train = preprocessor.preprocess(train_features, 3000)
+    X_train,y_train = preprocessor.preprocess(train_features, 50000)
     nb_classes = len(preprocessor.classes)
     print preprocessor.classes
 
-    #print X_train[0], len(X_train[0])
-    #print X_train[1], len(X_train[1])
-
-    #print set(y_train)
-    #assert(0)
-
-    from keras.preprocessing import sequence
-    from keras.optimizers import RMSprop
-    from keras.models import Sequential
-    from keras.layers.core import Dense, Dropout, Activation, Flatten
-    from keras.layers.embeddings import Embedding
-    from keras.layers.convolutional import Convolution1D, MaxPooling1D
-
-    print('Build model...')
-    model = Sequential()
-
-    # we start off with an efficient embedding layer which maps
-    # our vocab indices into embedding_dims dimensions
-    model.add(Embedding(max_features, embedding_dims))
-    model.add(Dropout(0.25))
-
-    # we add a Convolution1D, which will learn nb_filters
-    # word group filters of size filter_length:
-    model.add(Convolution1D(input_dim=embedding_dims,
-                            nb_filter=nb_filters,
-                            filter_length=filter_length,
-                            border_mode="valid",
-                            activation="relu",
-                            subsample_length=1))
-
-    # we use standard max pooling (halving the output of the previous layer):
-    model.add(MaxPooling1D(pool_length=2))
-
-    # We flatten the output of the conv layer, so that we can add a vanilla dense layer:
-    model.add(Flatten())
-
-    # Computing the output shape of a conv layer can be tricky;
-    # for a good tutorial, see: http://cs231n.github.io/convolutional-networks/
-    output_size = nb_filters * (((maxlen - filter_length) / 1) + 1) / 2
-
-    # We add a vanilla hidden layer:
-    model.add(Dense(output_size, hidden_dims))
-    model.add(Dropout(0.25))
-    model.add(Activation('relu'))
-
-    # We project onto a single unit output layer, and squash it with a sigmoid:
-    model.add(Dense(hidden_dims, nb_classes))
-    model.add(Activation('softmax'))
-
-    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', class_mode="categorical")
+    model = mk_cnn("train", max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes)
     model.fit(X_train, y_train, validation_split=0.1, batch_size=batch_size, nb_epoch=nb_epoch, show_accuracy=True)
 
     model.mypreprocessor = preprocessor
-    model_file = "cluster-weights.hdf5"
+    #model_file = model_file + ".wei"
     #modelfile = open_model(model_file)
-    print "Saving model to",model_file
-    model.save_weights(model_file)
+    print "Saving model to",model_file + ".wei"
+    model.save_weights(model_file + ".wei")
 
-    model_file = "cluster-preprocessor.pklz"
-    modelfile = open_model(model_file)
-    print "Saving preprocessor to",model_file
+    #model_file = model_file + ".pre"
+    modelfile = open_model(model_file + ".pre")
+    print "Saving preprocessor to",model_file + ".pre"
     #model.save_weights(model_file)
     modelfile.write(pickle.dumps(preprocessor, protocol=2))
@@ -306,11 +293,11 @@ def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples):
 
             plt.text(x, y+0.02, prog.split("/")[-1])
 
-    if valid_file is not None: 
+    if valid_file is not None:
 
         valid_programs, valid_features, valid_classes = read_traces(valid_file, None)
         valid_dict = dict()
         valid_dict[ftype] = valid_features
-        
+
         X_red = model.transform(valid_dict)
         for prog,[x,y],cl in zip(valid_programs, X_red, valid_classes):
             x = gauss(0,0.1) + x
diff --git a/vpredictor b/vpredictor
index 2812e3a..6479a5d 100755
--- a/vpredictor
+++ b/vpredictor
@@ -29,6 +29,7 @@ sys.setrecursionlimit(1024*1024*1024)
 
 from vdiscover.Pipeline import *
 from vdiscover.Recall import Recall
+from vdiscover.Cluster import TrainCnn
 from vdiscover.Train import Train
 
 if __name__ == "__main__":
@@ -52,7 +53,7 @@ if __name__ == "__main__":
     parser.add_argument("--test-aggr",
                         help="Test a model using infile (recall only)",
                         action="store_true", default=False)
-    
+
     parser.add_argument("--static",
                         help="Use static features",
@@ -70,7 +71,7 @@ if __name__ == "__main__":
                         help="Cluster input traces using BOW",
                         action="store_true", default=False)
 
-    parser.add_argument("--cluster-conv",
+    parser.add_argument("--cluster-cnn",
                         help="Cluster input traces using a convolutional model",
                         action="store_true", default=False)
@@ -78,9 +79,13 @@ if __name__ == "__main__":
                         help="Train a Random Forest using infile",
                         action="store_true", default=False)
 
-    parser.add_argument("--train-lstm",
-                        help="Train a LSTM using infile (warning: very experimental and slow)",
-                        action="store_true", default=False)
+    #parser.add_argument("--train-lstm",
+    #                    help="Train an LSTM using infile (warning: very experimental and slow)",
+    #                    action="store_true", default=False)
+
+    #parser.add_argument("--train-cnn",
+    #                    help="Train a CNN using infile",
+    #                    action="store_true", default=False)
 
     parser.add_argument("--n-samples", type=int,
                         help="Select a number of samples from infile (train only)",
@@ -91,18 +96,20 @@ if __name__ == "__main__":
                         type=str, default="/dev/stdout")
 
     options = parser.parse_args()
-    in_file = options.infile 
+    in_file = options.infile
     valid_file = options.valid
-    
+
     test_simple = options.test
     test_aggr = options.test_aggr
 
     training_mode_rf = options.train_rf
-    training_mode_lstm = options.train_lstm
+    #training_mode_lstm = options.train_lstm
+    #training_mode_cnn = options.train_cnn
+
     training_mode_cluster_bow = options.cluster_bow
-    training_mode_cluster_conv = options.cluster_conv
+    training_mode_cluster_cnn = options.cluster_cnn
 
-    training_mode = training_mode_rf or training_mode_lstm or training_mode_cluster_bow or training_mode_cluster_conv
+    training_mode = training_mode_rf or training_mode_cluster_bow or training_mode_cluster_cnn
 
     probability_mode = options.prob
     nsamples = options.n_samples
@@ -124,21 +131,21 @@ if __name__ == "__main__":
     if training_mode:
         if training_mode_rf:
             Train(out_file, in_file, valid_file, "rf", ftype, nsamples)
-        #elif training_mode_:
-        #    Train(out_file, in_file, valid_file, "lstm", ftype, nsamples)
+        #elif training_mode_cnn:
         elif training_mode_cluster_bow:
             from vdiscover.Cluster import ClusterScikit
             #Cluster(in_file, valid_file, ftype, nsamples)
             ClusterScikit(None, in_file, valid_file, ftype, nsamples)
-        elif training_mode_cluster_conv:
+        elif training_mode_cluster_cnn:
             from vdiscover.Cluster import ClusterConv
             #Cluster(in_file, valid_file, ftype, nsamples)
 
             if (model_file is None):
-                print "Clustering using a convolutional model requires a pre-trained model"
-                exit(-1)
-
+                TrainCnn(out_file, in_file, valid_file, ftype, nsamples)
+                #print "Clustering using a convolutional model requires a pre-trained model"
+                exit(0)
+
             ClusterConv(model_file, in_file, valid_file, ftype, nsamples, None)
@@ -146,7 +153,7 @@ if __name__ == "__main__":
         if model_file is None:
             print "VDiscover requires a pre-trained model to predict"
             exit(-1)
-        
+
         test_mode = None
         if test_simple:
             test_mode = "simple"
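
Note on the output-size comment in mk_cnn: the formula only documents what Flatten() will produce, since in this Keras version the Dense layer infers its input size. A worked example under assumed values (maxlen comes from the pickled preprocessor's window_size at runtime, so 100 is hypothetical; nb_filters=250 and filter_length=3 are the defaults set in this patch), in Python 2 like the rest of the code:

    nb_filters = 250      # patch default
    filter_length = 3     # patch default
    maxlen = 100          # assumed window_size, for illustration only

    # 'valid' convolution with stride (subsample_length) 1:
    conv_steps = ((maxlen - filter_length) / 1) + 1   # 98 output positions
    # MaxPooling1D(pool_length=2) halves the sequence length:
    pooled_steps = conv_steps / 2                     # 49
    # Flatten() emits nb_filters values per remaining position:
    output_size = nb_filters * pooled_steps           # 250 * 49 = 12250
    print output_size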
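The two mk_cnn modes are meant to be used as a pair: "train" builds the full classifier, while "test" rebuilds only the feature-extraction trunk from saved weights, which is why it reads weights[0], weights[2] and weights[5]. A minimal sketch of that round trip, reusing TrainCnn's variables; the get_weights() extraction is an assumption for illustration (the patch itself obtains layers from the saved model files):

    # Layer indices in the "train" Sequential stack:
    #   0: Embedding  1: Dropout  2: Convolution1D  3: MaxPooling1D
    #   4: Flatten    5: Dense(hidden_dims)  6+: classifier head
    model = mk_cnn("train", max_features, maxlen, embedding_dims,
                   nb_filters, filter_length, hidden_dims, nb_classes)
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch)

    layers = [l.get_weights() for l in model.layers]  # assumed extraction step
    extractor = mk_cnn("test", max_features, maxlen, embedding_dims,
                       nb_filters, filter_length, hidden_dims, None, weights=layers)
    features = extractor.predict(X_train)  # hidden activations fed to clustering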