From 109b7c087ce4343e11897a218b51f167a096b0f5 Mon Sep 17 00:00:00 2001
From: gaa-cifasis <gg@cifasis-conicet.gov.ar>
Date: Fri, 20 May 2016 09:05:06 +0000
Subject: [PATCH] fixes for HITB workshop

---
 fextractor           |  2 ++
 vd                   |  2 +-
 vdiscover/Cluster.py | 50 +++++++++++++++++++++++++-------------------
 3 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/fextractor b/fextractor
index 6a09410..d5198ca 100755
--- a/fextractor
+++ b/fextractor
@@ -148,6 +148,8 @@ if __name__ == "__main__":
       original_inputs = RandomInputMutator(args + files, NullMutator)
       #expanded_input_generator = RandomInputMutator(args + files, RandomExpanderMutator)
       mutated_input_generator = RandomInputMutator(args + files, RandomByteMutator)
+      if included_mods == []:
+        included_mods = [program]
 
       app = Process(program, envs, timeout, included_mods, ignored_mods, no_stdout = not show_stdout )
       prt = TypePrinter(csvfile, testcase, mclass)
diff --git a/vd b/vd
index 1ffeb66..a465371 100755
--- a/vd
+++ b/vd
@@ -70,7 +70,7 @@ if __name__ == "__main__":
     traces_path = outfile#outdir+"/traces.raw"
 
     if os.path.exists(traces_path):
-      traces = traces_path
+      print traces_path, "exists. I will not overwrite it. Aborting"  # existing trace file is left untouched
     else:
 
       modules_to_trace = [main_module]
diff --git a/vdiscover/Cluster.py b/vdiscover/Cluster.py
index 9954fb6..eb19219 100644
--- a/vdiscover/Cluster.py
+++ b/vdiscover/Cluster.py
@@ -27,7 +27,7 @@
 import matplotlib as mpl
 
 # hack from https://stackoverflow.com/questions/2801882/generating-a-png-with-matplotlib-when-display-is-undefined to avoid using X
-mpl.use('Agg')
+#mpl.use('Agg')
 import matplotlib.pyplot as plt
 
 from Utils import *
@@ -284,10 +284,6 @@ def TrainCnn(model_file, train_file, valid_file, ftype, nsamples):
 
 def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples):
 
-  #import matplotlib.pyplot as plt
-  #import matplotlib as mpl
-
-  #csvreader = open_csv(train_file)
   train_programs, train_features, train_classes = read_traces(train_file, nsamples)
   train_size = len(train_programs)
 
@@ -298,8 +294,6 @@ def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples):
   #batch_size = 16
   #window_size = 20
 
-  #from sklearn.cluster import MeanShift
-
   print "Transforming data and fitting model.."
   model = make_cluster_pipeline_bow(ftype)
   X_red = model.fit_transform(train_dict)
@@ -307,6 +301,7 @@ def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples):
   #mpl.rcParams.update({'font.size': 10})
   plt.figure()
   colors = 'brgcmykbgrcmykbgrcmykbgrcmyk'
+  ncolors = len(colors)
 
   for prog,[x,y],cl in zip(train_programs, X_red, train_classes):
     x = gauss(0,0.1) + x
@@ -332,26 +327,37 @@ def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples):
       plt.text(x, y+0.02, prog.split("/")[-1])
 
   plt.show()
-  #af = MeanShift().fit(X_red)
+  from sklearn.cluster import MeanShift, estimate_bandwidth
+
+  bandwidth = estimate_bandwidth(X_red, quantile=0.2)
+  print "Clustering with bandwidth:", bandwidth
+
+  af = MeanShift(bandwidth=bandwidth/5).fit(X_red)
+
+  cluster_centers = af.cluster_centers_
+  labels = af.labels_
+  n_clusters_ = len(cluster_centers)
 
-  #cluster_centers = af.cluster_centers_
-  #labels = af.labels_
-  #n_clusters_ = len(cluster_centers)
+  plt.close('all')
+  plt.figure(1)
+  plt.clf()
 
-  #plt.close('all')
-  #plt.figure(1)
-  #plt.clf()
+  for ([x,y],label, cluster_label) in zip(X_red,train_programs, labels):
+    x = gauss(0,0.1) + x
+    y = gauss(0,0.1) + y
+    plt.scatter(x, y, c = colors[cluster_label % ncolors])
 
-  #for k, col in zip(range(n_clusters_), colors):
-  #  my_members = labels == k
-  #  cluster_center = cluster_centers[k]
-  #  plt.plot(X_red[my_members, 0], X_red[my_members, 1], col + '.')
-  #  plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
-  #           markeredgecolor='k', markersize=14)
+  for i,[x,y] in enumerate(cluster_centers):
+    plt.plot(x, y, 'o', markerfacecolor=colors[i % ncolors],
+             markeredgecolor='k', markersize=7)
 
+  plt.title('Estimated number of clusters: %d' % n_clusters_)
+  plt.show()
 
-  #plt.title('Estimated number of clusters: %d' % n_clusters_)
-  #plt.show()
+  clustered_traces = zip(train_programs, labels)
+  writer = write_csv(train_file.replace(".gz","")+".clusters")
+  for label, cluster in clustered_traces:
+     writer.writerow([label, cluster])
 
 def Cluster(train_file, valid_file, ftype, nsamples):