Commit 49dd846

Author: gaa-cifasis
Commit message: merge
Merge of 2 parents: 3c0345f + 1714297

3 files changed, +114 -124 lines


vdiscover/Cluster.py

Lines changed: 17 additions & 107 deletions
@@ -64,7 +64,6 @@ def Cluster(X, labels)
   return zip(labels, cluster_labels)
 """
 
-
 def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
 
   f = open(model_file+".pre")
@@ -85,7 +84,7 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
   maxlen = window_size
 
   embedding_dims = 20
-  nb_filters = 50
+  nb_filters = 250
   filter_length = 3
   hidden_dims = 250
 
@@ -96,51 +95,10 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
 
   #y = train_programs
   X_train, y_train, labels = preprocessor.preprocess_traces(train_features, y_data=train_classes, labels=train_programs)
-
-  from keras.preprocessing import sequence
-  from keras.optimizers import RMSprop
-  from keras.models import Sequential
-  from keras.layers.core import Dense, Dropout, Activation, Flatten
-  from keras.layers.embeddings import Embedding
-  from keras.layers.convolutional import Convolution1D, MaxPooling1D
-
-  print('Build model...')
-  new_model = Sequential()
-
-  # we start off with an efficient embedding layer which maps
-  # our vocab indices into embedding_dims dimensions
-  new_model.add(Embedding(max_features, embedding_dims, weights=layers[0]))
-  new_model.add(Dropout(0.25))
-
-  # we add a Convolution1D, which will learn nb_filters
-  # word group filters of size filter_length:
-  new_model.add(Convolution1D(input_dim=embedding_dims,
-                              nb_filter=nb_filters,
-                              filter_length=filter_length,
-                              border_mode="valid",
-                              activation="relu",
-                              subsample_length=1,
-                              weights=layers[2]))
-
-  # we use standard max pooling (halving the output of the previous layer):
-  new_model.add(MaxPooling1D(pool_length=2))
-
-  # We flatten the output of the conv layer, so that we can add a vanilla dense layer:
-  new_model.add(Flatten())
-
-  # Computing the output shape of a conv layer can be tricky;
-  # for a good tutorial, see: http://cs231n.github.io/convolutional-networks/
-  output_size = nb_filters * (((maxlen - filter_length) / 1) + 1) / 2
-
-  # We add a vanilla hidden layer:
-  new_model.add(Dense(output_size, hidden_dims, weights=layers[5]))
-  #new_model.add(Dropout(0.25))
-  #new_model.add(Activation('relu'))
-
-  new_model.compile(loss='mean_squared_error', optimizer='rmsprop')
+  new_model = mk_cnn("test", max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, None, weights=layers)
 
   train_dict = dict()
-  train_dict[ftype] = new_model._predict(X_train)
+  train_dict[ftype] = new_model.predict(X_train)
 
   model = make_cluster_pipeline_subtraces(ftype)
   X_red_comp = model.fit_transform(train_dict)
@@ -155,6 +113,7 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
   ncolors = len(colors)
 
   for prog,[x,y] in zip(labels, X_red):
+<<<<<<< HEAD
     #x = gauss(0,0.1) + x
     #y = gauss(0,0.1) + y
     color = 'r' #colors[progs.index(prog)]
@@ -181,7 +140,6 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
 
   plt.show()
   #plt.savefig("plot.png")
-  #return None
 
   from sklearn.cluster import MeanShift, estimate_bandwidth
 
@@ -274,7 +232,7 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
   #print "Clusters dumped!"
 
 
-def TrainDeepRepr(model_file, train_file, valid_file, ftype, nsamples):
+def TrainCnn(model_file, train_file, valid_file, ftype, nsamples):
 
   csvreader = open_csv(train_file)
 
@@ -290,7 +248,7 @@ def TrainDeepRepr(model_file, train_file, valid_file, ftype, nsamples):
   nb_filters = 250
   filter_length = 3
   hidden_dims = 250
-  nb_epoch = 1
+  nb_epoch = 100
 
   train_programs, train_features, train_classes = read_traces(train_file, nsamples, cut=None)
   train_size = len(train_features)
@@ -303,70 +261,22 @@ def TrainDeepRepr(model_file, train_file, valid_file, ftype, nsamples):
   max_features = len(tokenizer.word_counts)
 
   preprocessor = DeepReprPreprocessor(tokenizer, window_size, batch_size)
-  X_train,y_train = preprocessor.preprocess(train_features, 3000)
+  X_train,y_train = preprocessor.preprocess(train_features, 50000)
   nb_classes = len(preprocessor.classes)
   print preprocessor.classes
-  #print X_train[0], len(X_train[0])
-  #print X_train[1], len(X_train[1])
-
-  #print set(y_train)
-  #assert(0)
-
-  from keras.preprocessing import sequence
-  from keras.optimizers import RMSprop
-  from keras.models import Sequential
-  from keras.layers.core import Dense, Dropout, Activation, Flatten
-  from keras.layers.embeddings import Embedding
-  from keras.layers.convolutional import Convolution1D, MaxPooling1D
-
-  print('Build model...')
-  model = Sequential()
-
-  # we start off with an efficient embedding layer which maps
-  # our vocab indices into embedding_dims dimensions
-  model.add(Embedding(max_features, embedding_dims))
-  model.add(Dropout(0.25))
-
-  # we add a Convolution1D, which will learn nb_filters
-  # word group filters of size filter_length:
-  model.add(Convolution1D(input_dim=embedding_dims,
-                          nb_filter=nb_filters,
-                          filter_length=filter_length,
-                          border_mode="valid",
-                          activation="relu",
-                          subsample_length=1))
-
-  # we use standard max pooling (halving the output of the previous layer):
-  model.add(MaxPooling1D(pool_length=2))
-
-  # We flatten the output of the conv layer, so that we can add a vanilla dense layer:
-  model.add(Flatten())
-
-  # Computing the output shape of a conv layer can be tricky;
-  # for a good tutorial, see: http://cs231n.github.io/convolutional-networks/
-  output_size = nb_filters * (((maxlen - filter_length) / 1) + 1) / 2
-
-  # We add a vanilla hidden layer:
-  model.add(Dense(output_size, hidden_dims))
-  model.add(Dropout(0.25))
-  model.add(Activation('relu'))
-
-  # We project onto a single unit output layer, and squash it with a sigmoid:
-  model.add(Dense(hidden_dims, nb_classes))
-  model.add(Activation('softmax'))
-
-  model.compile(loss='categorical_crossentropy', optimizer='rmsprop', class_mode="categorical")
+
+  model = mk_cnn("train", max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes)
   model.fit(X_train, y_train, validation_split=0.1, batch_size=batch_size, nb_epoch=nb_epoch, show_accuracy=True)
 
   model.mypreprocessor = preprocessor
-  model_file = "cluster-weights.hdf5"
+  #model_file = model_file + ".wei"
   #modelfile = open_model(model_file)
-  print "Saving model to",model_file
-  model.save_weights(model_file)
+  print "Saving model to",model_file + ".wei"
+  model.save_weights(model_file + ".wei")
 
-  model_file = "cluster-preprocessor.pklz"
-  modelfile = open_model(model_file)
-  print "Saving preprocessor to",model_file
+  #model_file = model_file + ".pre"
+  modelfile = open_model(model_file + ".pre")
+  print "Saving preprocessor to",model_file + ".pre"
   #model.save_weights(model_file)
   modelfile.write(pickle.dumps(preprocessor, protocol=2))
 
@@ -404,11 +314,11 @@ def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples):
     plt.text(x, y+0.02, prog.split("/")[-1])
 
 
-  if valid_file is not None:
+  if valid_file is not None:
     valid_programs, valid_features, valid_classes = read_traces(valid_file, None)
     valid_dict = dict()
     valid_dict[ftype] = valid_features
-
+
     X_red = model.transform(valid_dict)
     for prog,[x,y],cl in zip(valid_programs, X_red, valid_classes):
       x = gauss(0,0.1) + x
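
Note (not part of the commit): TrainCnn saves its outputs next to the model_file prefix, writing model_file + ".wei" (Keras weights) and model_file + ".pre" (the pickled preprocessor), and ClusterConv opens the ".pre" file before rebuilding the network. A minimal sketch of that train-then-cluster flow; the prefix, trace file, feature type and sample count below are assumptions, not values from the diff:

  # Sketch only: chain the renamed TrainCnn with ClusterConv via a shared prefix.
  from vdiscover.Cluster import TrainCnn, ClusterConv

  model_prefix = "cluster-model"   # hypothetical output prefix
  traces_csv   = "traces.csv"      # hypothetical CSV of traces
  ftype        = "dynamic"         # assumed feature type

  # Writes cluster-model.wei (weights) and cluster-model.pre (preprocessor).
  TrainCnn(model_prefix, traces_csv, None, ftype, None)

  # Reloads the saved preprocessor/weights to embed and cluster the same traces.
  ClusterConv(model_prefix, traces_csv, None, ftype, None, None)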

vdiscover/Pipeline.py

Lines changed: 73 additions & 0 deletions
@@ -142,6 +142,79 @@ def make_cluster_pipeline_subtraces(ftype):
   else:
     assert(0)
 
+def make_cluster_cnn(mode, max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes, weights=None):
+
+  #print mode, max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes
+  from keras.preprocessing import sequence
+  from keras.optimizers import RMSprop
+  from keras.models import Sequential
+  from keras.layers.core import Dense, Dropout, Activation, Flatten
+  from keras.layers.embeddings import Embedding
+  from keras.layers.convolutional import Convolution1D, MaxPooling1D
+
+  print('Build model...')
+  model = Sequential()
+
+  # we start off with an efficient embedding layer which maps
+  # our vocab indices into embedding_dims dimensions
+  if mode == "train":
+    model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
+  elif mode == "test":
+    model.add(Embedding(max_features, embedding_dims, input_length=maxlen, weights=weights[0]))
+
+  model.add(Dropout(0.25))
+
+  # we add a Convolution1D, which will learn nb_filters
+  # word group filters of size filter_length:
+  if mode == "train":
+    model.add(Convolution1D(nb_filter=nb_filters,
+                            filter_length=filter_length,
+                            border_mode='valid',
+                            activation='relu',
+                            subsample_length=1))
+
+  elif mode == "test":
+    model.add(Convolution1D(nb_filter=nb_filters,
+                            filter_length=filter_length,
+                            border_mode='valid',
+                            activation='relu',
+                            subsample_length=1,
+                            weights=weights[2]))
+
+
+  # we use standard max pooling (halving the output of the previous layer):
+  model.add(MaxPooling1D(pool_length=2))
+
+  # We flatten the output of the conv layer, so that we can add a vanilla dense layer:
+  model.add(Flatten())
+
+  # Computing the output shape of a conv layer can be tricky;
+  # for a good tutorial, see: http://cs231n.github.io/convolutional-networks/
+  output_size = nb_filters * (((maxlen - filter_length) / 1) + 1) / 2
+  #print output_size, hidden_dims
+
+  # We add a vanilla hidden layer:
+  if mode == "train":
+    model.add(Dense(hidden_dims))
+  if mode == "test":
+    model.add(Dense(hidden_dims, weights=weights[5]))
+
+  if mode == "train":
+
+    model.add(Dropout(0.25))
+    model.add(Activation('relu'))
+
+    # We project onto a single unit output layer, and squash it with a sigmoid:
+    model.add(Dense(nb_classes))
+
+    model.add(Activation('softmax'))
+    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', class_mode="categorical")
+
+  elif mode == "test":
+    model.compile(loss='mean_squared_error', optimizer='rmsprop')
+
+
+  return model
 
 
 
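
Note (not part of the commit): make_cluster_cnn builds the same Keras architecture in two modes. "train" compiles a softmax classifier over nb_classes, while "test" rebuilds the network from a list of previously extracted layer weights (weights[0] for the embedding, weights[2] for the convolution, weights[5] for the dense layer) and compiles it with a mean-squared-error loss so it can serve as an encoder. A minimal usage sketch; the dimension values are assumptions:

  # Sketch only: the concrete sizes below are hypothetical.
  from vdiscover.Pipeline import make_cluster_cnn

  max_features, maxlen = 500, 100
  embedding_dims, nb_filters, filter_length, hidden_dims = 20, 250, 3, 250

  # "train" mode: full classifier head, categorical cross-entropy.
  clf = make_cluster_cnn("train", max_features, maxlen, embedding_dims,
                         nb_filters, filter_length, hidden_dims, 10)

  # "test" mode: rebuild from saved layer weights, no classification head.
  # layers = ...  # weight list extracted from the trained model
  # enc = make_cluster_cnn("test", max_features, maxlen, embedding_dims,
  #                        nb_filters, filter_length, hidden_dims, None,
  #                        weights=layers)

With the assumed sizes above, the flattened convolution output works out to nb_filters * (((maxlen - filter_length) / 1) + 1) / 2 = 250 * 98 / 2 = 12250 units, although the helper now only computes output_size for the commented-out print and lets Keras infer the Dense input dimension. The Cluster.py hunks above refer to this helper as mk_cnn.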

vpredictor

Lines changed: 24 additions & 17 deletions
@@ -29,6 +29,7 @@ sys.setrecursionlimit(1024*1024*1024)
 
 from vdiscover.Pipeline import *
 from vdiscover.Recall import Recall
+from vdiscover.Cluster import TrainCnn
 from vdiscover.Train import Train
 
 if __name__ == "__main__":
@@ -52,7 +53,7 @@ if __name__ == "__main__":
   parser.add_argument("--test-aggr",
                       help="Test a model using infile (recall only)",
                       action="store_true", default=False)
-
+
 
   parser.add_argument("--static",
                       help="Use static features",
@@ -70,17 +71,21 @@ if __name__ == "__main__":
                       help="Cluster input traces using BOW",
                       action="store_true", default=False)
 
-  parser.add_argument("--cluster-conv",
+  parser.add_argument("--cluster-cnn",
                       help="Cluster input traces using a convolutional model",
                       action="store_true", default=False)
 
   parser.add_argument("--train-rf",
                       help="Train a Random Forest using infile",
                       action="store_true", default=False)
 
-  parser.add_argument("--train-lstm",
-                      help="Train a LSTM using infile (warning: very experimental and slow)",
-                      action="store_true", default=False)
+  #parser.add_argument("--train-lstm",
+  #                    help="Train a LSTM using infile (warning: very experimental and slow)",
+  #                    action="store_true", default=False)
+
+  #parser.add_argument("--train-cnn",
+  #                    help="Train a CNN using infile",
+  #                    action="store_true", default=False)
 
   parser.add_argument("--n-samples", type=int,
                       help="Select a number of samples from infile (train only)",
@@ -91,18 +96,20 @@ if __name__ == "__main__":
                       type=str, default="/dev/stdout")
 
   options = parser.parse_args()
-  in_file = options.infile
+  in_file = options.infile
   valid_file = options.valid
-
+
   test_simple = options.test
   test_aggr = options.test_aggr
 
   training_mode_rf = options.train_rf
-  training_mode_lstm = options.train_lstm
+  #training_mode_lstm = options.train_lstm
+  #training_mode_cnn = options.train_cnn
+
   training_mode_cluster_bow = options.cluster_bow
-  training_mode_cluster_conv = options.cluster_conv
+  training_mode_cluster_cnn = options.cluster_cnn
 
-  training_mode = training_mode_rf or training_mode_lstm or training_mode_cluster_bow or training_mode_cluster_conv
+  training_mode = training_mode_rf or training_mode_cluster_bow or training_mode_cluster_cnn
 
   probability_mode = options.prob
   nsamples = options.n_samples
@@ -124,29 +131,29 @@ if __name__ == "__main__":
   if training_mode:
     if training_mode_rf:
       Train(out_file, in_file, valid_file, "rf", ftype, nsamples)
-    #elif training_mode_:
-    #  Train(out_file, in_file, valid_file, "lstm", ftype, nsamples)
+    #elif training_mode_cnn:
     elif training_mode_cluster_bow:
       from vdiscover.Cluster import ClusterScikit
 
       #Cluster(in_file, valid_file, ftype, nsamples)
       ClusterScikit(None, in_file, valid_file, ftype, nsamples)
-    elif training_mode_cluster_conv:
+    elif training_mode_cluster_cnn:
       from vdiscover.Cluster import ClusterConv
 
       #Cluster(in_file, valid_file, ftype, nsamples)
      if (model_file is None):
-        print "Clustering using a convolutional model requires a pre-trained model"
-        exit(-1)
-
+        TrainCnn(out_file, in_file, valid_file, ftype, nsamples)
+        #print "Clustering using a convolutional model requires a pre-trained model"
+        exit(0)
+
      ClusterConv(model_file, in_file, valid_file, ftype, nsamples, None)
 
 
   else:
     if model_file is None:
       print "VDiscover requires a pre-trained model to predict"
       exit(-1)
-
+
     test_mode = None
     if test_simple:
       test_mode = "simple"
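
Note (not part of the commit): with these changes the --cluster-cnn flag doubles as a trainer. When no pre-trained model is supplied, vpredictor trains a CNN from the input traces via TrainCnn and exits; otherwise it clusters the traces with the existing model via ClusterConv. A compressed sketch of that branch, restating the logic above:

  # Sketch only: simplified restatement of the --cluster-cnn dispatch.
  if training_mode_cluster_cnn:
    from vdiscover.Cluster import ClusterConv

    if model_file is None:
      TrainCnn(out_file, in_file, valid_file, ftype, nsamples)  # train a CNN and stop
      exit(0)

    ClusterConv(model_file, in_file, valid_file, ftype, nsamples, None)  # cluster traces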
