Skip to content

Commit

Permalink
Added a validation option to clustering
Browse files (browse the repository at this point in the history)
  • Loading branch information
gaa-cifasis committed Nov 30, 2015
1 parent 2b20227 commit d17d282
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 6 deletions.
2 changes: 1 addition & 1 deletion vd
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ if __name__ == "__main__":
outfile = options.out
#fuzzer = options.fuzzer
cmd = options.cmd
vectorizer = options.vectorizer
#vectorizer = options.vectorizer
program = cmd.split(" ")[0]
programf = program.replace("/","__")
timeout = 5
Expand Down
49 changes: 44 additions & 5 deletions vdiscover/Cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):

#csvreader = load_csv(train_file)
print "Reading and sampling data to train.."
train_programs, train_features, train_classes = read_traces(train_file, nsamples, cut=None)
train_programs, train_features, train_classes = read_traces(train_file, nsamples, cut=10, maxsize=window_size)
train_size = len(train_features)

#y = train_programs
Expand Down Expand Up @@ -111,6 +111,34 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
model = make_cluster_pipeline_subtraces(ftype)
X_red = model.fit_transform(train_dict)

colors = 'rbgcmykbgrcmykbgrcmykbgrcmyk'
ncolors = len(colors)

for prog,[x,y] in zip(labels, X_red):
x = gauss(0,0.1) + x
y = gauss(0,0.1) + y
plt.scatter(x, y, c='r')
#plt.text(x, y+0.02, prog.split("/")[-1])


if valid_file is not None:
valid_programs, valid_features, valid_classes = read_traces(valid_file, None, cut=10, maxsize=window_size) #None)
valid_dict = dict()

X_valid, _, valid_labels = preprocessor.preprocess_traces(valid_features, y_data=None, labels=valid_programs)
valid_dict[ftype] = new_model._predict(X_valid)
X_red = model.transform(valid_dict)

for prog,[x,y] in zip(valid_labels, X_red):
x = gauss(0,0.1) + x
y = gauss(0,0.1) + y
plt.scatter(x, y, c='b')
plt.text(x, y+0.02, prog.split("/")[-1])

plt.savefig("plot.png")
return None


from sklearn.cluster import MeanShift, estimate_bandwidth

bandwidth = estimate_bandwidth(X_red, quantile=0.2)
Expand All @@ -124,8 +152,6 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):

plt.figure()
print len(X_red), len(labels)
colors = 'rbgcmykbgrcmykbgrcmykbgrcmyk'
ncolors = len(colors)

for ([x,y],label, cluster_label) in zip(X_red,labels, cluster_labels):
x = gauss(0,0.1) + x
Expand All @@ -139,8 +165,8 @@ def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):

plt.title('Estimated number of clusters: %d' % n_clusters)

#plb.savefig(outdir+"/plot.png")
plt.show()
plt.savefig("plot.png")
#plt.show()

return zip(labels, cluster_labels)
#csvwriter = open_csv(train_file+".clusters")
Expand Down Expand Up @@ -279,6 +305,19 @@ def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples):
plt.scatter(x, y, c=colors[cl])
plt.text(x, y+0.02, prog.split("/")[-1])


if valid_file is not None:
valid_programs, valid_features, valid_classes = read_traces(valid_file, None)
valid_dict = dict()
valid_dict[ftype] = valid_features

X_red = model.transform(valid_dict)
for prog,[x,y],cl in zip(valid_programs, X_red, valid_classes):
x = gauss(0,0.1) + x
y = gauss(0,0.1) + y
plt.scatter(x, y, c=colors[cl+1])
plt.text(x, y+0.02, prog.split("/")[-1])

plt.show()
#af = MeanShift().fit(X_red)

Expand Down

0 comments on commit d17d282

Please sign in to comment.