Add a max-events limit to tracing; fix clustering and sampling

CIFASIS · Jan 13, 2016 · 12d1339 · 12d1339
1 parent 163d57a
commit 12d1339
Show file tree

Hide file tree

Showing 7 changed files with 77 additions and 31 deletions.
diff --git a/tseeder b/tseeder
@@ -20,6 +20,7 @@ Copyright 2015 by G.Grieco
 """
 
 import os
+import shutil
 import argparse
 import sys
 import csv
@@ -36,7 +37,7 @@ if __name__ == "__main__":
     parser.add_argument("infile", help="A csv with the features to train or predict", type=str, default=None)
     parser.add_argument("outdir", help="A directory with the seeds", type=str, default=None)
     parser.add_argument("-n", help="Number of seeds to select per cluster", type=int, default=1)
-    parser.add_argument("--random", help="Sample randomly", action="store_true", default=None)
+    #parser.add_argument("--random", help="Sample randomly", action="store_true", default=None)
 
     options = parser.parse_args()
     in_file = options.infile
@@ -50,7 +51,10 @@ if __name__ == "__main__":
 
     selected = cluster_sampler(clusters, nseeds)
 
-    for seed in selected:
-      print "cp", seed, outdir
-
+    if not os.path.exists(outdir):
+      os.makedirs(outdir)
 
+    print "Copying seeds.."
+    for seed in selected:
+      print seed
+      shutil.copy(seed, outdir)
diff --git a/vd b/vd
@@ -32,7 +32,7 @@ sys.setrecursionlimit(1024*1024*1024)
 #from vdiscover.Detection import WriteTestcase
 from vdiscover.Process    import Process
 from vdiscover.Printer    import TypePrinter
-from vdiscover.Cluster  import ClusterScikit, ClusterConv
+#from vdiscover.Cluster  import ClusterScikit, ClusterConv
 from vdiscover.Utils import update_progress
 from vdiscover.Sampling import cluster_sampler 
 
@@ -60,15 +60,26 @@ if __name__ == "__main__":
     #vectorizer = options.vectorizer
     program = cmd.split(" ")[0]
     programf = program.replace("/","__")
-    timeout = 5
+    main_module = program.split("/")[-1]
+
+    timeout = 15
     envs = dict()
     traces_path = outfile#outdir+"/traces.raw"
 
     if os.path.exists(traces_path):
       traces = traces_path
     else:
 
-      app = Process(program, envs, timeout, [], [], True)  
+      #app = Process(program, envs, timeout, ["libpixman-1.so","libcairo.so.2","libpango"], [], True)  
+      modules_to_trace = [main_module]
+      if "LD_LIBRARY_PATH" in os.environ:
+        libs =  os.environ["LD_LIBRARY_PATH"]
+        for _,_,files in os.walk(libs):
+          for f in files:
+            modules_to_trace.append(f)
+
+      print "Tracing", modules_to_trace
+      app = Process(program, envs, timeout, modules_to_trace, [], True)  
       prt = TypePrinter(traces_path, program, 0)
       traces = []
       all_files = []
@@ -85,13 +96,15 @@ if __name__ == "__main__":
 
       for progress,testcase in enumerate(all_files):
         #print testcase
-        progress = round(float(progress)/nfiles, 2)
+        progress = round(float(progress)/nfiles,4)
         update_progress(progress)
         prepared_cmd = cmd.replace(program,"")
         prepared_cmd = prepared_cmd.split("@@")
         prepared_cmd = prepared_cmd[0].split(" ") + [testcase] + prepared_cmd[1].split(" ")
         prepared_cmd = filter(lambda x: x<>'', prepared_cmd)
+        #print "Getting data.."
         events = app.getData(prepared_cmd)
+        #print "Printing data.. ", len(events)
         traces.append(prt.print_events(testcase,events))
         #print prepared_cmd
         #print traces[-1]

diff --git a/vdiscover/Cluster.py b/vdiscover/Cluster.py
@@ -24,9 +24,12 @@
 import subprocess
 import pickle
 import numpy as np
-import matplotlib.pyplot as plt
 import matplotlib as mpl
 
+# Select the non-interactive 'Agg' backend before importing pyplot so figures can be saved without an X display (see https://stackoverflow.com/questions/2801882/generating-a-png-with-matplotlib-when-display-is-undefined)
+mpl.use('Agg')
+import matplotlib.pyplot as plt
+
 from Utils import *
 from Pipeline import *
 
@@ -111,13 +114,17 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
   colors = mpl.colors.cnames.keys()
   progs = list(set(labels))
   ncolors = len(colors)
-
+  size = len(labels)
+  print "Plotting.."
+
   for prog,[x,y] in zip(labels, X_red):
+  #for prog,[x,y] in sample(zip(labels, X_red), min(size, 1000)):
     x = gauss(0,0.05) + x
     y = gauss(0,0.05) + y
     color = 'r'
     plt.scatter(x, y, c=color )
 
+  """
   if valid_file is not None:
     valid_programs, valid_features, valid_classes = read_traces(valid_file, None, cut=None, maxsize=window_size) #None)
     valid_dict = dict()
@@ -134,13 +141,16 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
       y = gauss(0,0.05) + y
       plt.scatter(x, y, c='b')
       plt.text(x, y+0.02, prog.split("/")[-1])
-
+  
   plt.show()
-  #plt.savefig("plot.png")
-
+  """
+  plt.savefig(train_file.replace(".gz","")+".png")
+  print "Bandwidth estimation.."
   from sklearn.cluster import MeanShift, estimate_bandwidth
 
-  bandwidth = estimate_bandwidth(X_red, quantile=0.2)
+
+  X_red_sample = X_red[:min(size, 1000)]
+  bandwidth = estimate_bandwidth(X_red_sample, quantile=0.2)
   print "Clustering with bandwidth:", bandwidth
 
   #X_red = np.vstack((X_red,X_red_valid))
@@ -150,16 +160,17 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
   print X_red.shape, len(X_red), len(labels)
   #print valid_labels
 
-  af = MeanShift(bandwidth=bandwidth/5).fit(X_red)
+  af = MeanShift(bandwidth=bandwidth/1).fit(X_red)
 
   cluster_centers = af.cluster_centers_
   cluster_labels = af.labels_
   n_clusters = len(cluster_centers)
-
+  
   plt.figure()
   for ([x,y],label, cluster_label) in zip(X_red,labels, cluster_labels):
-    #x = gauss(0,0.1) + x
-    #y = gauss(0,0.1) + y
+  #for ([x,y],label, cluster_label) in sample(zip(X_red,labels, cluster_labels), min(size, 1000)):
+    x = gauss(0,0.1) + x
+    y = gauss(0,0.1) + y
     plt.scatter(x, y, c = colors[cluster_label % ncolors])
     #print label
     #if label in valid_labels:
@@ -169,6 +180,7 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
     plt.plot(x, y, 'o', markerfacecolor=colors[i % ncolors],
              markeredgecolor='k', markersize=7)
 
+  """
   #for prog,[x,y] in zip(valid_labels, X_red_valid):
     #x = gauss(0,0.1) + x
     #y = gauss(0,0.1) + y
@@ -180,6 +192,9 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
 
   #plt.savefig("clusters.png")
   plt.show()
+  """
+  plt.savefig(train_file.replace(".gz","")+".clusters.png")
+
   clustered_traces = zip(labels, cluster_labels)
   writer = open_csv(train_file.replace(".gz","")+".clusters")
   for label, cluster in clustered_traces:

diff --git a/vdiscover/Pipeline.py b/vdiscover/Pipeline.py
@@ -236,7 +236,7 @@ def preprocess_traces(self, X_data, y_data=None, labels=None):
     cut_X_data = []
     cut_label_data = []
     cut_y_data = []
-    rep = 5
+    #rep = 5
 
     X_size = len(X_data)
 
@@ -249,6 +249,7 @@ def preprocess_traces(self, X_data, y_data=None, labels=None):
 
       size = len(trace)
       rep = 1 + int(float(size) / float(self.max_len))
+      rep = min(rep, 10)
 
       for _ in range(rep):
 

diff --git a/vdiscover/Printer.py b/vdiscover/Printer.py
@@ -91,7 +91,7 @@ def print_events(self, label, events):
     trace = ""
 
     for x,y in events:
-      trace = trace+x+"="+y+" "
+      trace = trace + ("%s=%s " % (x,y))
 
     row = [self.pname+":"+label,trace]
 

diff --git a/vdiscover/Process.py b/vdiscover/Process.py
@@ -45,7 +45,7 @@
 from Alarm import alarm_handler, TimeoutEx
 
 class Process(Application):
-    def __init__(self, program, envs, timeout, included_mods = [], ignored_mods = [], no_stdout = True):
+    def __init__(self, program, envs, timeout, included_mods = [], ignored_mods = [], no_stdout = True, max_events = 10000):
 
         Application.__init__(self)  # no effect
 
@@ -63,7 +63,7 @@ def __init__(self, program, envs, timeout, included_mods = [], ignored_mods = []
         self.pid = None
         self.mm = None
         self.timeouts = 0
-        self.max_timeouts = 10
+        self.max_events = max_events
 
         # Parse ELF
         self.elf = ELF(self.program, plt = False)
@@ -77,6 +77,7 @@ def __init__(self, program, envs, timeout, included_mods = [], ignored_mods = []
         self.last_signal = {}
         self.last_call = None
         self.crashed = False
+        self.nevents = 0
         self.events = []
 
         self.binfo = dict()
@@ -146,7 +147,7 @@ def createEvents(self, signal):
 
                 else:
                   call = Call(name, module)
-                  self.mm.update()
+                  #self.mm.update()
                   #print "updated mm"
                   call.detect_parameters(self.process, self.mm)
                   breakpoint.desinstall(set_ip=True)
@@ -259,9 +260,11 @@ def cont(self, signum=None):
         signal = self.debugger.waitSignals()
         process = signal.process
         events = self.createEvents(signal)
-        vulns = self.DetectVulnerabilities(self.events, events)
+
+        #vulns = self.DetectVulnerabilities(self.events, events)
         #print "vulns detected"
-        self.events = self.events + events + vulns
+        self.events = self.events + events #+ vulns
+        self.nevents = self.nevents + len(events)
 
 
     def readInstrSize(self, address, default_size=None):
@@ -328,11 +331,18 @@ def runProcess(self, cmd):
 
         # Set the breakpoints
         self.breakpoint(self.elf.GetEntrypoint())
+        #print hex(self.elf.GetEntrypoint())
 
         try:
           while True:
+
             #self.cont() 
-            if not self.debugger or self.crashed:
+            if self.nevents > self.max_events:
+
+                self.events.append(Timeout(timeout))
+                alarm(0)
+                return
+            elif not self.debugger or self.crashed:
                 # There is no more process: quit
                 alarm(0)
                 return
@@ -371,6 +381,7 @@ def runProcess(self, cmd):
 
     def getData(self, inputs):
         self.events = []
+        self.nevents = 0
         self.debugger = PtraceDebugger()
 
         self.runProcess([self.program]+inputs)
@@ -389,7 +400,3 @@ def getData(self, inputs):
 
         self.process = None
         return self.events
-
-
-    def timeouted(self):
-        return self.timeouts >= self.max_timeouts
diff --git a/vdiscover/Sampling.py b/vdiscover/Sampling.py
@@ -28,8 +28,14 @@ def cluster_sampler(clustered_traces, n_per_cluster):
     clusters[cluster] = clusters.get(cluster, []) + [label.split(":")[-1]]
 
   selected = set()
+  tmp = set()
+
   for (cluster, seeds) in clusters.items():
     n_sample = min(len(seeds), n_per_cluster)
-    selected.update(set(random.sample(seeds, n_sample)))
+    tmp = set(seeds).intersection(selected)
+    if len(tmp) >= n_sample: 
+      selected.update(set(random.sample(tmp, n_sample)))
+    else:
+      selected.update(set(random.sample(seeds, n_sample)))
 
   return selected