
Commit

improved cnn support
g committed Dec 11, 2015
1 parent d17d282 commit 1714297
Showing 2 changed files with 120 additions and 126 deletions.
205 changes: 96 additions & 109 deletions vdiscover/Cluster.py
@@ -31,6 +31,81 @@
from Utils import *
from Pipeline import *

def mk_cnn(mode, max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes, weights=None):

    #print mode, max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes
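    # (added note, an inference from the call sites below rather than something stated in
    # this commit: `weights` appears to be a per-layer list of weight arrays taken from a
    # previously trained model, indexed 0 = Embedding, 2 = Convolution1D, 5 = Dense.)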
    from keras.preprocessing import sequence
    from keras.optimizers import RMSprop
    from keras.models import Sequential
    from keras.layers.core import Dense, Dropout, Activation, Flatten
    from keras.layers.embeddings import Embedding
    from keras.layers.convolutional import Convolution1D, MaxPooling1D

    print('Build model...')
    model = Sequential()

    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    if mode == "train":
        model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
    elif mode == "test":
        model.add(Embedding(max_features, embedding_dims, input_length=maxlen, weights=weights[0]))

    model.add(Dropout(0.25))

    # we add a Convolution1D, which will learn nb_filters
    # word group filters of size filter_length:
    if mode == "train":
        model.add(Convolution1D(nb_filter=nb_filters,
                                filter_length=filter_length,
                                border_mode='valid',
                                activation='relu',
                                subsample_length=1))

    elif mode == "test":
        model.add(Convolution1D(nb_filter=nb_filters,
                                filter_length=filter_length,
                                border_mode='valid',
                                activation='relu',
                                subsample_length=1,
                                weights=weights[2]))


    # we use standard max pooling (halving the output of the previous layer):
    model.add(MaxPooling1D(pool_length=2))

    # We flatten the output of the conv layer, so that we can add a vanilla dense layer:
    model.add(Flatten())

    # Computing the output shape of a conv layer can be tricky;
    # for a good tutorial, see: http://cs231n.github.io/convolutional-networks/
    output_size = nb_filters * (((maxlen - filter_length) / 1) + 1) / 2
    #print output_size, hidden_dims
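    # Worked example with hypothetical numbers (not taken from this commit): with
    # maxlen = 100, filter_length = 3 and nb_filters = 250, the 'valid' convolution
    # produces (100 - 3)/1 + 1 = 98 positions per filter, max pooling halves that to 49,
    # so the flattened vector feeding the dense layer has 250 * 49 = 12250 entries.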

    # We add a vanilla hidden layer:
    if mode == "train":
        model.add(Dense(hidden_dims))
    if mode == "test":
        model.add(Dense(hidden_dims, weights=weights[5]))

    if mode == "train":

        model.add(Dropout(0.25))
        model.add(Activation('relu'))

        # We project the features onto nb_classes output units and squash with a softmax:
        model.add(Dense(nb_classes))

        model.add(Activation('softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', class_mode="categorical")

    elif mode == "test":
        model.compile(loss='mean_squared_error', optimizer='rmsprop')


    return model
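
Usage sketch (not part of this commit; the hyperparameter values are hypothetical and
`trained_layer_weights` is an assumed name for a per-layer weight list):

    # train mode: build a fresh network and fit it on labelled traces
    model = mk_cnn("train", max_features=1000, maxlen=100, embedding_dims=20,
                   nb_filters=250, filter_length=3, hidden_dims=250, nb_classes=10)

    # test mode: rebuild the truncated network with previously trained weights so it can
    # be used as a fixed feature extractor (compiled with MSE, no classifier head)
    feat_model = mk_cnn("test", max_features=1000, maxlen=100, embedding_dims=20,
                        nb_filters=250, filter_length=3, hidden_dims=250,
                        nb_classes=None, weights=trained_layer_weights)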


def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):

f = open(model_file+".pre")
@@ -51,7 +126,7 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
maxlen = window_size

embedding_dims = 20
nb_filters = 50
nb_filters = 250
filter_length = 3
hidden_dims = 250

@@ -62,51 +137,10 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):

#y = train_programs
X_train, y_train, labels = preprocessor.preprocess_traces(train_features, y_data=train_classes, labels=train_programs)

from keras.preprocessing import sequence
from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D

print('Build model...')
new_model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
new_model.add(Embedding(max_features, embedding_dims, weights=layers[0]))
new_model.add(Dropout(0.25))

# we add a Convolution1D, which will learn nb_filters
# word group filters of size filter_length:
new_model.add(Convolution1D(input_dim=embedding_dims,
nb_filter=nb_filters,
filter_length=filter_length,
border_mode="valid",
activation="relu",
subsample_length=1,
weights=layers[2]))

# we use standard max pooling (halving the output of the previous layer):
new_model.add(MaxPooling1D(pool_length=2))

# We flatten the output of the conv layer, so that we can add a vanilla dense layer:
new_model.add(Flatten())

# Computing the output shape of a conv layer can be tricky;
# for a good tutorial, see: http://cs231n.github.io/convolutional-networks/
output_size = nb_filters * (((maxlen - filter_length) / 1) + 1) / 2

# We add a vanilla hidden layer:
new_model.add(Dense(output_size, hidden_dims, weights=layers[5]))
#new_model.add(Dropout(0.25))
#new_model.add(Activation('relu'))

new_model.compile(loss='mean_squared_error', optimizer='rmsprop')
new_model = mk_cnn("test", max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, None, weights=layers)

train_dict = dict()
train_dict[ftype] = new_model._predict(X_train)
train_dict[ftype] = new_model.predict(X_train)

model = make_cluster_pipeline_subtraces(ftype)
X_red = model.fit_transform(train_dict)
@@ -118,15 +152,15 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
x = gauss(0,0.1) + x
y = gauss(0,0.1) + y
plt.scatter(x, y, c='r')
#plt.text(x, y+0.02, prog.split("/")[-1])
plt.text(x, y+0.02, prog.split("/")[-1])


if valid_file is not None:
if valid_file is not None:
valid_programs, valid_features, valid_classes = read_traces(valid_file, None, cut=10, maxsize=window_size) #None)
valid_dict = dict()

X_valid, _, valid_labels = preprocessor.preprocess_traces(valid_features, y_data=None, labels=valid_programs)
valid_dict[ftype] = new_model._predict(X_valid)
valid_dict[ftype] = new_model._predict(X_valid)
X_red = model.transform(valid_dict)

for prog,[x,y] in zip(valid_labels, X_red):
@@ -135,7 +169,8 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
plt.scatter(x, y, c='b')
plt.text(x, y+0.02, prog.split("/")[-1])

plt.savefig("plot.png")
plt.show()
#plt.savefig("plot.png")
return None


@@ -167,7 +202,7 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):

plt.savefig("plot.png")
#plt.show()

return zip(labels, cluster_labels)
#csvwriter = open_csv(train_file+".clusters")
#for (label, cluster_label) in zip(labels, cluster_labels):
@@ -176,7 +211,7 @@ def TrainDeepRepr(model_file, train_file, valid_file, ftype, nsamples):
#print "Clusters dumped!"


def TrainDeepRepr(model_file, train_file, valid_file, ftype, nsamples):
def TrainCnn(model_file, train_file, valid_file, ftype, nsamples):

csvreader = open_csv(train_file)

@@ -192,7 +227,7 @@ def TrainDeepRepr(model_file, train_file, valid_file, ftype, nsamples):
nb_filters = 250
filter_length = 3
hidden_dims = 250
nb_epoch = 1
nb_epoch = 100

train_programs, train_features, train_classes = read_traces(train_file, nsamples, cut=None)
train_size = len(train_features)
@@ -205,70 +240,22 @@ def TrainDeepRepr(model_file, train_file, valid_file, ftype, nsamples):
max_features = len(tokenizer.word_counts)

preprocessor = DeepReprPreprocessor(tokenizer, window_size, batch_size)
X_train,y_train = preprocessor.preprocess(train_features, 3000)
X_train,y_train = preprocessor.preprocess(train_features, 50000)
nb_classes = len(preprocessor.classes)
print preprocessor.classes
#print X_train[0], len(X_train[0])
#print X_train[1], len(X_train[1])

#print set(y_train)
#assert(0)

from keras.preprocessing import sequence
from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features, embedding_dims))
model.add(Dropout(0.25))

# we add a Convolution1D, which will learn nb_filters
# word group filters of size filter_length:
model.add(Convolution1D(input_dim=embedding_dims,
nb_filter=nb_filters,
filter_length=filter_length,
border_mode="valid",
activation="relu",
subsample_length=1))

# we use standard max pooling (halving the output of the previous layer):
model.add(MaxPooling1D(pool_length=2))

# We flatten the output of the conv layer, so that we can add a vanilla dense layer:
model.add(Flatten())

# Computing the output shape of a conv layer can be tricky;
# for a good tutorial, see: http://cs231n.github.io/convolutional-networks/
output_size = nb_filters * (((maxlen - filter_length) / 1) + 1) / 2

# We add a vanilla hidden layer:
model.add(Dense(output_size, hidden_dims))
model.add(Dropout(0.25))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(hidden_dims, nb_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='rmsprop', class_mode="categorical")
model = mk_cnn("train", max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes)
model.fit(X_train, y_train, validation_split=0.1, batch_size=batch_size, nb_epoch=nb_epoch, show_accuracy=True)

model.mypreprocessor = preprocessor
model_file = "cluster-weights.hdf5"
#model_file = model_file + ".wei"
#modelfile = open_model(model_file)
print "Saving model to",model_file
model.save_weights(model_file)
print "Saving model to",model_file + ".wei"
model.save_weights(model_file + ".wei")

model_file = "cluster-preprocessor.pklz"
modelfile = open_model(model_file)
print "Saving preprocessor to",model_file
#model_file = model_file + ".pre"
modelfile = open_model(model_file + ".pre")
print "Saving preprocessor to",model_file + ".pre"
#model.save_weights(model_file)
modelfile.write(pickle.dumps(preprocessor, protocol=2))

@@ -306,11 +293,11 @@ def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples):
plt.text(x, y+0.02, prog.split("/")[-1])


if valid_file is not None:
if valid_file is not None:
valid_programs, valid_features, valid_classes = read_traces(valid_file, None)
valid_dict = dict()
valid_dict[ftype] = valid_features

X_red = model.transform(valid_dict)
for prog,[x,y],cl in zip(valid_programs, X_red, valid_classes):
x = gauss(0,0.1) + x
41 changes: 24 additions & 17 deletions vpredictor
@@ -29,6 +29,7 @@ sys.setrecursionlimit(1024*1024*1024)

from vdiscover.Pipeline import *
from vdiscover.Recall import Recall
from vdiscover.Cluster import TrainCnn
from vdiscover.Train import Train

if __name__ == "__main__":
@@ -52,7 +53,7 @@ if __name__ == "__main__":
parser.add_argument("--test-aggr",
help="Test a model using infile (recall only)",
action="store_true", default=False)


parser.add_argument("--static",
help="Use static features",
@@ -70,17 +71,21 @@ if __name__ == "__main__":
help="Cluster input traces using BOW",
action="store_true", default=False)

parser.add_argument("--cluster-conv",
parser.add_argument("--cluster-cnn",
help="Cluster input traces using a convolutional model",
action="store_true", default=False)

parser.add_argument("--train-rf",
help="Train a Random Forest using infile",
action="store_true", default=False)

parser.add_argument("--train-lstm",
help="Train a LSTM using infile (warning: very experimental and slow)",
action="store_true", default=False)
#parser.add_argument("--train-lstm",
# help="Train a LSTM using infile (warning: very experimental and slow)",
# action="store_true", default=False)

#parser.add_argument("--train-cnn",
# help="Train a CNN using infile",
# action="store_true", default=False)

parser.add_argument("--n-samples", type=int,
help="Select a number of samples from infile (train only)",
@@ -91,18 +96,20 @@ if __name__ == "__main__":
type=str, default="/dev/stdout")

options = parser.parse_args()
in_file = options.infile
in_file = options.infile
valid_file = options.valid

test_simple = options.test
test_aggr = options.test_aggr

training_mode_rf = options.train_rf
training_mode_lstm = options.train_lstm
#training_mode_lstm = options.train_lstm
#training_mode_cnn = options.train_cnn

training_mode_cluster_bow = options.cluster_bow
training_mode_cluster_conv = options.cluster_conv
training_mode_cluster_cnn = options.cluster_cnn

training_mode = training_mode_rf or training_mode_lstm or training_mode_cluster_bow or training_mode_cluster_conv
training_mode = training_mode_rf or training_mode_cluster_bow or training_mode_cluster_cnn

probability_mode = options.prob
nsamples = options.n_samples
@@ -124,29 +131,29 @@ if __name__ == "__main__":
if training_mode:
if training_mode_rf:
Train(out_file, in_file, valid_file, "rf", ftype, nsamples)
#elif training_mode_:
# Train(out_file, in_file, valid_file, "lstm", ftype, nsamples)
#elif training_mode_cnn:
elif training_mode_cluster_bow:
from vdiscover.Cluster import ClusterScikit

#Cluster(in_file, valid_file, ftype, nsamples)
ClusterScikit(None, in_file, valid_file, ftype, nsamples)
elif training_mode_cluster_conv:
elif training_mode_cluster_cnn:
from vdiscover.Cluster import ClusterConv

#Cluster(in_file, valid_file, ftype, nsamples)
if (model_file is None):
print "Clustering using a convolutional model requires a pre-trained model"
exit(-1)

TrainCnn(out_file, in_file, valid_file, ftype, nsamples)
#print "Clustering using a convolutional model requires a pre-trained model"
exit(0)

ClusterConv(model_file, in_file, valid_file, ftype, nsamples, None)


else:
if model_file is None:
print "VDiscover requires a pre-trained model to predict"
exit(-1)

test_mode = None
if test_simple:
test_mode = "simple"
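
A hypothetical invocation of the renamed clustering mode (only --cluster-cnn and
--n-samples are visible in this diff; the input-file argument and any other options
are assumptions, not part of the commit):

  ./vpredictor --cluster-cnn --n-samples 1000 traces.csv

As the dispatch above shows, this flag now runs TrainCnn directly and exits rather
than requiring a pre-trained model.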
