From 0650fdb7bd39f5fd78faf4ee400acd9f10b094eb Mon Sep 17 00:00:00 2001
From: gaa-cifasis <gg@cifasis-conicet.gov.ar>
Date: Thu, 29 Oct 2015 19:31:01 +0000
Subject: [PATCH] first steps towards a seed sampler

---
 vd                    | 24 ++++++++++++++----------
 vdiscover/Cluster.py  |  4 ++--
 vdiscover/Sampling.py | 32 ++++++++++++++++++++++++++++++++
 vpredictor            | 33 ++++++++++++++++++++++++---------
 4 files changed, 72 insertions(+), 21 deletions(-)
 create mode 100644 vdiscover/Sampling.py

diff --git a/vd b/vd
index 50a5f24..7df977d 100755
--- a/vd
+++ b/vd
@@ -32,8 +32,10 @@ sys.setrecursionlimit(1024*1024*1024)
 #from vdiscover.Detection import WriteTestcase
 from vdiscover.Process    import Process
 from vdiscover.Printer    import TypePrinter
-from vdiscover.Cluster  import PlotDeepRepr
+from vdiscover.Cluster  import ClusterScikit, ClusterConv
 from vdiscover.Utils import update_progress
+from vdiscover.Sampling import cluster_sampler 
+
 if __name__ == "__main__":
 
 
@@ -46,7 +48,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='')
     parser.add_argument("-i", help="", type=str, default=None, required=True, dest="seeds")
     parser.add_argument("-o", help="", type=str, default=None, required=True, dest="out")
-    #parser.add_argument("-v", help="", type=str, default=None, required=True, dest="vectorizer")
+    parser.add_argument("-v", help="", type=str, default=None, required=False, dest="vectorizer")
     #parser.add_argument("-m", help="", type=str, default="afl", dest="fuzzer")
     parser.add_argument("cmd", help="", type=str, default=None)
 
@@ -55,7 +57,7 @@ if __name__ == "__main__":
     outfile = options.out
     #fuzzer = options.fuzzer
     cmd = options.cmd
-    #vectorizer = options.vectorizer
+    vectorizer = options.vectorizer
     program = cmd.split(" ")[0]
     programf = program.replace("/","__")
     timeout = 5
@@ -66,7 +68,7 @@ if __name__ == "__main__":
       traces = traces_path
     else:
 
-      app = Process(program, envs, timeout, [], [], True)  
+      app = Process(program, envs, timeout, ["libcairo"], [], True)  
       prt = TypePrinter(traces_path, program, 0)
       traces = []
       all_files = []
@@ -93,10 +95,12 @@ if __name__ == "__main__":
         traces.append(prt.print_events(testcase,events))
         #print prepared_cmd
         #print traces[-1]
-    
-    #clustered_traces = PlotDeepRepr(vectorizer, traces, None, "dynamic", None, outdir)
-    #clusters = dict()
-    #for label, cluster in clustered_traces:
-    #  clusters[cluster] = clusters.get(cluster, []) + [label]
 
-    #print clusters 
+
+    if vectorizer is None:
+      clustered_traces = ClusterScikit(vectorizer, traces, None, "dynamic", None)
+    else:
+      clustered_traces = ClusterConv(vectorizer, traces, None, "dynamic", None, None)
+    
+    cluster_sampler(clustered_traces,1)
+    #print clusters
diff --git a/vdiscover/Cluster.py b/vdiscover/Cluster.py
index 38905cb..43dff09 100644
--- a/vdiscover/Cluster.py
+++ b/vdiscover/Cluster.py
@@ -31,7 +31,7 @@
 from Utils import *
 from Pipeline import *
 
-def PlotDeepRepr(model_file, train_file, valid_file, ftype, nsamples, outdir):
+def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir):
 
   f = open(model_file+".pre")
   preprocessor = pickle.load(f)
@@ -139,7 +139,7 @@ def PlotDeepRepr(model_file, train_file, valid_file, ftype, nsamples, outdir):
 
   plt.title('Estimated number of clusters: %d' % n_clusters)
 
-  plb.savefig(outdir+"/plot.png")
+  #plb.savefig(outdir+"/plot.png")
   plt.show()
   
   return zip(labels, cluster_labels)
diff --git a/vdiscover/Sampling.py b/vdiscover/Sampling.py
new file mode 100644
index 0000000..06e5f58
--- /dev/null
+++ b/vdiscover/Sampling.py
@@ -0,0 +1,32 @@
+"""
+This file is part of VDISCOVER.
+
+VDISCOVER is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+VDISCOVER is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with VDISCOVER. If not, see <http://www.gnu.org/licenses/>.
+
+Copyright 2014 by G.Grieco
+"""
+
+import random
+import copy
+
+def cluster_sampler(clustered_traces, n_per_cluster):
+  """Print, for each cluster, up to n_per_cluster randomly chosen seed names.
+  clustered_traces yields (label, cluster) pairs; the seed name is the text
+  after the last ":" in label. Clusters with fewer seeds print all of them."""
+  clusters = dict()
+  for label, cluster in clustered_traces:
+    clusters.setdefault(cluster, []).append(label.split(":")[-1])
+  print "Selecting", len(clusters), "seeds"
+  for (cluster, seeds) in clusters.items():
+    print ",".join(random.sample(seeds, min(n_per_cluster, len(seeds))))
diff --git a/vpredictor b/vpredictor
index a9674cf..406e784 100755
--- a/vpredictor
+++ b/vpredictor
@@ -30,7 +30,7 @@ sys.setrecursionlimit(1024*1024*1024)
 from vdiscover.Pipeline import *
 from vdiscover.Recall  import Recall
 from vdiscover.Train  import Train
-from vdiscover.Cluster  import Cluster
+from vdiscover.Cluster  import ClusterScikit, ClusterConv
 
 if __name__ == "__main__":
 
@@ -67,10 +67,13 @@ if __name__ == "__main__":
                         help="Valid a model using infile",
                         action="store", default=None)
 
-    parser.add_argument("--cluster",
-                        help="Cluster input traces",
+    parser.add_argument("--cluster-bow",
+                        help="Cluster input traces using BOW",
                         action="store_true", default=False)
 
+    parser.add_argument("--cluster-conv",
+                        help="Cluster input traces using a convolutional model",
+                        action="store_true", default=False)
 
     parser.add_argument("--train-rf",
                         help="Train a Random Forest using infile",
@@ -97,8 +100,10 @@ if __name__ == "__main__":
 
     training_mode_rf = options.train_rf
     training_mode_lstm = options.train_lstm
-    training_mode_cluster = options.cluster
-    training_mode = training_mode_rf or training_mode_lstm or training_mode_cluster
+    training_mode_cluster_bow = options.cluster_bow
+    training_mode_cluster_conv = options.cluster_conv
+
+    training_mode = training_mode_rf or training_mode_lstm or training_mode_cluster_bow or training_mode_cluster_conv
 
     probability_mode = options.prob
     nsamples = options.n_samples
@@ -120,10 +125,20 @@ if __name__ == "__main__":
     if training_mode:
       if training_mode_rf:
         Train(out_file, in_file, valid_file, "rf", ftype, nsamples)
-      elif training_mode_lstm:
-        Train(out_file, in_file, valid_file, "lstm", ftype, nsamples)
-      elif training_mode_cluster:
-        Cluster(in_file, valid_file, ftype, nsamples)
+      #elif training_mode_:
+      #  Train(out_file, in_file, valid_file, "lstm", ftype, nsamples)
+      elif training_mode_cluster_bow:
+        #Cluster(in_file, valid_file, ftype, nsamples)
+        ClusterScikit(None, in_file, valid_file, ftype, nsamples)
+      elif training_mode_cluster_conv:
+        #Cluster(in_file, valid_file, ftype, nsamples)
+        if (model_file is None):
+          print "Clustering using a convolutional model requires a pre-trained model"
+          exit(-1)
+        
+        ClusterConv(model_file, in_file, valid_file, ftype, nsamples, None)
+
+
     else:
       if model_file is None:
         print "VDiscover requires a pre-trained model to predict"