Commit

merge
gaa-cifasis committed Dec 11, 2015
2 parents 3c0345f + 1714297 commit 49dd846
Showing 3 changed files with 114 additions and 124 deletions.
124 changes: 17 additions & 107 deletions vdiscover/Cluster.py
@@ -64,7 +64,6 @@ def Cluster(X, labels)
return zip(labels, cluster_labels)
"""


def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):

f = open(model_file+".pre")
@@ -85,7 +84,7 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
maxlen = window_size

embedding_dims = 20
nb_filters = 50
nb_filters = 250
filter_length = 3
hidden_dims = 250

@@ -96,51 +95,10 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):

#y = train_programs
X_train, y_train, labels = preprocessor.preprocess_traces(train_features, y_data=train_classes, labels=train_programs)

from keras.preprocessing import sequence
from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D

print('Build model...')
new_model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
new_model.add(Embedding(max_features, embedding_dims, weights=layers[0]))
new_model.add(Dropout(0.25))

# we add a Convolution1D, which will learn nb_filters
# word group filters of size filter_length:
new_model.add(Convolution1D(input_dim=embedding_dims,
nb_filter=nb_filters,
filter_length=filter_length,
border_mode="valid",
activation="relu",
subsample_length=1,
weights=layers[2]))

# we use standard max pooling (halving the output of the previous layer):
new_model.add(MaxPooling1D(pool_length=2))

# We flatten the output of the conv layer, so that we can add a vanilla dense layer:
new_model.add(Flatten())

# Computing the output shape of a conv layer can be tricky;
# for a good tutorial, see: http://cs231n.github.io/convolutional-networks/
output_size = nb_filters * (((maxlen - filter_length) / 1) + 1) / 2

# We add a vanilla hidden layer:
new_model.add(Dense(output_size, hidden_dims, weights=layers[5]))
#new_model.add(Dropout(0.25))
#new_model.add(Activation('relu'))

new_model.compile(loss='mean_squared_error', optimizer='rmsprop')
new_model = mk_cnn("test", max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, None, weights=layers)

train_dict = dict()
train_dict[ftype] = new_model._predict(X_train)
train_dict[ftype] = new_model.predict(X_train)

model = make_cluster_pipeline_subtraces(ftype)
X_red_comp = model.fit_transform(train_dict)
@@ -155,6 +113,7 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
ncolors = len(colors)

for prog,[x,y] in zip(labels, X_red):
<<<<<<< HEAD
#x = gauss(0,0.1) + x
#y = gauss(0,0.1) + y
color = 'r' #colors[progs.index(prog)]
@@ -181,7 +140,6 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):

plt.show()
#plt.savefig("plot.png")
#return None

from sklearn.cluster import MeanShift, estimate_bandwidth

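Note: the hunk elided below applies MeanShift to the dimensionality-reduced points. As a generic reference for that scikit-learn pattern (a sketch only, not the code added or removed by this commit; the quantile and bin_seeding values are assumptions):

import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

X = np.asarray(X_red)                             # 2-D points from the pipeline above
bandwidth = estimate_bandwidth(X, quantile=0.2)   # pick a kernel width from the data
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
cluster_ids = ms.labels_                          # one cluster id per trace
n_clusters = len(set(cluster_ids))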
@@ -274,7 +232,7 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
#print "Clusters dumped!"


def TrainDeepRepr(model_file, train_file, valid_file, ftype, nsamples):
def TrainCnn(model_file, train_file, valid_file, ftype, nsamples):

csvreader = open_csv(train_file)

@@ -290,7 +248,7 @@ def TrainDeepRepr(model_file, train_file, valid_file, ftype, nsamples):
nb_filters = 250
filter_length = 3
hidden_dims = 250
nb_epoch = 1
nb_epoch = 100

train_programs, train_features, train_classes = read_traces(train_file, nsamples, cut=None)
train_size = len(train_features)
@@ -303,70 +261,22 @@ def TrainDeepRepr(model_file, train_file, valid_file, ftype, nsamples):
max_features = len(tokenizer.word_counts)

preprocessor = DeepReprPreprocessor(tokenizer, window_size, batch_size)
X_train,y_train = preprocessor.preprocess(train_features, 3000)
X_train,y_train = preprocessor.preprocess(train_features, 50000)
nb_classes = len(preprocessor.classes)
print preprocessor.classes
#print X_train[0], len(X_train[0])
#print X_train[1], len(X_train[1])

#print set(y_train)
#assert(0)

from keras.preprocessing import sequence
from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features, embedding_dims))
model.add(Dropout(0.25))

# we add a Convolution1D, which will learn nb_filters
# word group filters of size filter_length:
model.add(Convolution1D(input_dim=embedding_dims,
nb_filter=nb_filters,
filter_length=filter_length,
border_mode="valid",
activation="relu",
subsample_length=1))

# we use standard max pooling (halving the output of the previous layer):
model.add(MaxPooling1D(pool_length=2))

# We flatten the output of the conv layer, so that we can add a vanilla dense layer:
model.add(Flatten())

# Computing the output shape of a conv layer can be tricky;
# for a good tutorial, see: http://cs231n.github.io/convolutional-networks/
output_size = nb_filters * (((maxlen - filter_length) / 1) + 1) / 2

# We add a vanilla hidden layer:
model.add(Dense(output_size, hidden_dims))
model.add(Dropout(0.25))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(hidden_dims, nb_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='rmsprop', class_mode="categorical")

model = mk_cnn("train", max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes)
model.fit(X_train, y_train, validation_split=0.1, batch_size=batch_size, nb_epoch=nb_epoch, show_accuracy=True)

model.mypreprocessor = preprocessor
model_file = "cluster-weights.hdf5"
#model_file = model_file + ".wei"
#modelfile = open_model(model_file)
print "Saving model to",model_file
model.save_weights(model_file)
print "Saving model to",model_file + ".wei"
model.save_weights(model_file + ".wei")

model_file = "cluster-preprocessor.pklz"
modelfile = open_model(model_file)
print "Saving preprocessor to",model_file
#model_file = model_file + ".pre"
modelfile = open_model(model_file + ".pre")
print "Saving preprocessor to",model_file + ".pre"
#model.save_weights(model_file)
modelfile.write(pickle.dumps(preprocessor, protocol=2))
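Note: the block above writes two artifacts, the CNN weights (cluster-weights.hdf5.wei) and the pickled preprocessor (cluster-weights.hdf5.pre). A minimal restore sketch, not part of this commit; it assumes the .pre file is a plain uncompressed pickle and that the hyper-parameters match the training run (maxlen and max_features are data-dependent and shown with made-up values):

import pickle
from vdiscover.Pipeline import make_cluster_cnn

with open("cluster-weights.hdf5.pre", "rb") as f:
    preprocessor = pickle.load(f)

maxlen = 100                 # assumed window_size; must match training
max_features = 500           # assumed vocabulary size; must match training
embedding_dims, nb_filters, filter_length, hidden_dims = 20, 250, 3, 250
nb_classes = len(preprocessor.classes)

# Rebuild the identical architecture and let Keras restore the saved weights.
model = make_cluster_cnn("train", max_features, maxlen, embedding_dims,
                         nb_filters, filter_length, hidden_dims, nb_classes)
model.load_weights("cluster-weights.hdf5.wei")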

@@ -404,11 +314,11 @@ def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples):
plt.text(x, y+0.02, prog.split("/")[-1])


if valid_file is not None:
if valid_file is not None:
valid_programs, valid_features, valid_classes = read_traces(valid_file, None)
valid_dict = dict()
valid_dict[ftype] = valid_features

X_red = model.transform(valid_dict)
for prog,[x,y],cl in zip(valid_programs, X_red, valid_classes):
x = gauss(0,0.1) + x
73 changes: 73 additions & 0 deletions vdiscover/Pipeline.py
@@ -142,6 +142,79 @@ def make_cluster_pipeline_subtraces(ftype):
else:
assert(0)

def make_cluster_cnn(mode, max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes, weights=None):

#print mode, max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes
from keras.preprocessing import sequence
from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
if mode == "train":
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
elif mode == "test":
model.add(Embedding(max_features, embedding_dims, input_length=maxlen, weights=weights[0]))

model.add(Dropout(0.25))

# we add a Convolution1D, which will learn nb_filters
# word group filters of size filter_length:
if mode == "train":
model.add(Convolution1D(nb_filter=nb_filters,
filter_length=filter_length,
border_mode='valid',
activation='relu',
subsample_length=1))

elif mode == "test":
model.add(Convolution1D(nb_filter=nb_filters,
filter_length=filter_length,
border_mode='valid',
activation='relu',
subsample_length=1,
weights=weights[2]))


# we use standard max pooling (halving the output of the previous layer):
model.add(MaxPooling1D(pool_length=2))

# We flatten the output of the conv layer, so that we can add a vanilla dense layer:
model.add(Flatten())

# Computing the output shape of a conv layer can be tricky;
# for a good tutorial, see: http://cs231n.github.io/convolutional-networks/
output_size = nb_filters * (((maxlen - filter_length) / 1) + 1) / 2
#print output_size, hidden_dims

# We add a vanilla hidden layer:
if mode == "train":
model.add(Dense(hidden_dims))
if mode == "test":
model.add(Dense(hidden_dims, weights=weights[5]))

if mode == "train":

model.add(Dropout(0.25))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(nb_classes))

model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', class_mode="categorical")

elif mode == "test":
model.compile(loss='mean_squared_error', optimizer='rmsprop')


return model
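Note: to make the shape bookkeeping above concrete, a small sketch with assumed numbers (maxlen=100 and max_features=500 are made up; the rest are the defaults used in TrainCnn). A "valid" convolution over maxlen tokens yields maxlen - filter_length + 1 positions, pool_length=2 halves that, and Flatten therefore emits nb_filters times that many values:

maxlen, max_features = 100, 500                   # illustrative values only
embedding_dims, nb_filters, filter_length, hidden_dims = 20, 250, 3, 250

conv_len = (maxlen - filter_length) // 1 + 1      # 98 positions after the "valid" convolution
flat_size = nb_filters * (conv_len // 2)          # 250 * 49 = 12250 values after pooling + Flatten
print(flat_size)

# Training-mode network with a made-up class count; "test" mode would instead
# pass weights= taken from a previously trained model.
model = make_cluster_cnn("train", max_features, maxlen, embedding_dims,
                         nb_filters, filter_length, hidden_dims, nb_classes=10)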



41 changes: 24 additions & 17 deletions vpredictor
@@ -29,6 +29,7 @@ sys.setrecursionlimit(1024*1024*1024)

from vdiscover.Pipeline import *
from vdiscover.Recall import Recall
from vdiscover.Cluster import TrainCnn
from vdiscover.Train import Train

if __name__ == "__main__":
@@ -52,7 +53,7 @@ if __name__ == "__main__":
parser.add_argument("--test-aggr",
help="Test a model using infile (recall only)",
action="store_true", default=False)


parser.add_argument("--static",
help="Use static features",
@@ -70,17 +71,21 @@ if __name__ == "__main__":
help="Cluster input traces using BOW",
action="store_true", default=False)

parser.add_argument("--cluster-conv",
parser.add_argument("--cluster-cnn",
help="Cluster input traces using a convolutional model",
action="store_true", default=False)

parser.add_argument("--train-rf",
help="Train a Random Forest using infile",
action="store_true", default=False)

parser.add_argument("--train-lstm",
help="Train a LSTM using infile (warning: very experimental and slow)",
action="store_true", default=False)
#parser.add_argument("--train-lstm",
# help="Train a LSTM using infile (warning: very experimental and slow)",
# action="store_true", default=False)

#parser.add_argument("--train-cnn",
# help="Train a CNN using infile",
# action="store_true", default=False)

parser.add_argument("--n-samples", type=int,
help="Select a number of samples from infile (train only)",
@@ -91,18 +96,20 @@ if __name__ == "__main__":
type=str, default="/dev/stdout")

options = parser.parse_args()
in_file = options.infile
in_file = options.infile
valid_file = options.valid

test_simple = options.test
test_aggr = options.test_aggr

training_mode_rf = options.train_rf
training_mode_lstm = options.train_lstm
#training_mode_lstm = options.train_lstm
#training_mode_cnn = options.train_cnn

training_mode_cluster_bow = options.cluster_bow
training_mode_cluster_conv = options.cluster_conv
training_mode_cluster_cnn = options.cluster_cnn

training_mode = training_mode_rf or training_mode_lstm or training_mode_cluster_bow or training_mode_cluster_conv
training_mode = training_mode_rf or training_mode_cluster_bow or training_mode_cluster_cnn

probability_mode = options.prob
nsamples = options.n_samples
@@ -124,29 +131,29 @@ if __name__ == "__main__":
if training_mode:
if training_mode_rf:
Train(out_file, in_file, valid_file, "rf", ftype, nsamples)
#elif training_mode_:
# Train(out_file, in_file, valid_file, "lstm", ftype, nsamples)
#elif training_mode_cnn:
elif training_mode_cluster_bow:
from vdiscover.Cluster import ClusterScikit

#Cluster(in_file, valid_file, ftype, nsamples)
ClusterScikit(None, in_file, valid_file, ftype, nsamples)
elif training_mode_cluster_conv:
elif training_mode_cluster_cnn:
from vdiscover.Cluster import ClusterConv

#Cluster(in_file, valid_file, ftype, nsamples)
if (model_file is None):
print "Clustering using a convolutional model requires a pre-trained model"
exit(-1)

TrainCnn(out_file, in_file, valid_file, ftype, nsamples)
#print "Clustering using a convolutional model requires a pre-trained model"
exit(0)

ClusterConv(model_file, in_file, valid_file, ftype, nsamples, None)


else:
if model_file is None:
print "VDiscover requires a pre-trained model to predict"
exit(-1)

test_mode = None
if test_simple:
test_mode = "simple"
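Note: for context, the new dispatch above would be driven roughly as follows; traces.csv is a made-up file name and the assumption is that the input CSV is the positional infile argument (only flags defined in the argparse section above are used):

./vpredictor --cluster-cnn --n-samples 1000 traces.csv    (train the convolutional model; writes cluster-weights.hdf5.*)
./vpredictor --cluster-bow traces.csv                     (bag-of-words clustering of the same traces)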
