Skip to content

Commit d17d282

Browse files
author
gaa-cifasis
committed
added validation option in clustering
1 parent 2b20227 commit d17d282

File tree

2 files changed

+45
-6
lines changed

2 files changed

+45
-6
lines changed

vd

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -57,7 +57,7 @@ if __name__ == "__main__":
5757
outfile = options.out
5858
#fuzzer = options.fuzzer
5959
cmd = options.cmd
60-
vectorizer = options.vectorizer
60+
#vectorizer = options.vectorizer
6161
program = cmd.split(" ")[0]
6262
programf = program.replace("/","__")
6363
timeout = 5

vdiscover/Cluster.py

Lines changed: 44 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -57,7 +57,7 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
5757

5858
#csvreader = load_csv(train_file)
5959
print "Reading and sampling data to train.."
60-
train_programs, train_features, train_classes = read_traces(train_file, nsamples, cut=None)
60+
train_programs, train_features, train_classes = read_traces(train_file, nsamples, cut=10, maxsize=window_size)
6161
train_size = len(train_features)
6262

6363
#y = train_programs
@@ -111,6 +111,34 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
111111
model = make_cluster_pipeline_subtraces(ftype)
112112
X_red = model.fit_transform(train_dict)
113113

114+
colors = 'rbgcmykbgrcmykbgrcmykbgrcmyk'
115+
ncolors = len(colors)
116+
117+
for prog,[x,y] in zip(labels, X_red):
118+
x = gauss(0,0.1) + x
119+
y = gauss(0,0.1) + y
120+
plt.scatter(x, y, c='r')
121+
#plt.text(x, y+0.02, prog.split("/")[-1])
122+
123+
124+
if valid_file is not None:
125+
valid_programs, valid_features, valid_classes = read_traces(valid_file, None, cut=10, maxsize=window_size) #None)
126+
valid_dict = dict()
127+
128+
X_valid, _, valid_labels = preprocessor.preprocess_traces(valid_features, y_data=None, labels=valid_programs)
129+
valid_dict[ftype] = new_model._predict(X_valid)
130+
X_red = model.transform(valid_dict)
131+
132+
for prog,[x,y] in zip(valid_labels, X_red):
133+
x = gauss(0,0.1) + x
134+
y = gauss(0,0.1) + y
135+
plt.scatter(x, y, c='b')
136+
plt.text(x, y+0.02, prog.split("/")[-1])
137+
138+
plt.savefig("plot.png")
139+
return None
140+
141+
114142
from sklearn.cluster import MeanShift, estimate_bandwidth
115143

116144
bandwidth = estimate_bandwidth(X_red, quantile=0.2)
@@ -124,8 +152,6 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
124152

125153
plt.figure()
126154
print len(X_red), len(labels)
127-
colors = 'rbgcmykbgrcmykbgrcmykbgrcmyk'
128-
ncolors = len(colors)
129155

130156
for ([x,y],label, cluster_label) in zip(X_red,labels, cluster_labels):
131157
x = gauss(0,0.1) + x
@@ -139,8 +165,8 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
139165

140166
plt.title('Estimated number of clusters: %d' % n_clusters)
141167

142-
#plb.savefig(outdir+"/plot.png")
143-
plt.show()
168+
plt.savefig("plot.png")
169+
#plt.show()
144170

145171
return zip(labels, cluster_labels)
146172
#csvwriter = open_csv(train_file+".clusters")
@@ -279,6 +305,19 @@ def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples):
279305
plt.scatter(x, y, c=colors[cl])
280306
plt.text(x, y+0.02, prog.split("/")[-1])
281307

308+
309+
if valid_file is not None:
310+
valid_programs, valid_features, valid_classes = read_traces(valid_file, None)
311+
valid_dict = dict()
312+
valid_dict[ftype] = valid_features
313+
314+
X_red = model.transform(valid_dict)
315+
for prog,[x,y],cl in zip(valid_programs, X_red, valid_classes):
316+
x = gauss(0,0.1) + x
317+
y = gauss(0,0.1) + y
318+
plt.scatter(x, y, c=colors[cl+1])
319+
plt.text(x, y+0.02, prog.split("/")[-1])
320+
282321
plt.show()
283322
#af = MeanShift().fit(X_red)
284323

0 commit comments

Comments
 (0)