diff --git a/tseeder b/tseeder index 2866188..1f7d8fa 100755 --- a/tseeder +++ b/tseeder @@ -20,6 +20,7 @@ Copyright 2015 by G.Grieco """ import os +import shutil import argparse import sys import csv @@ -36,7 +37,7 @@ if __name__ == "__main__": parser.add_argument("infile", help="A csv with the features to train or predict", type=str, default=None) parser.add_argument("outdir", help="A directory with the seeds", type=str, default=None) parser.add_argument("-n", help="Number of seeds to select per cluster", type=int, default=1) - parser.add_argument("--random", help="Sample randomly", action="store_true", default=None) + #parser.add_argument("--random", help="Sample randomly", action="store_true", default=None) options = parser.parse_args() in_file = options.infile @@ -50,7 +51,10 @@ if __name__ == "__main__": selected = cluster_sampler(clusters, nseeds) - for seed in selected: - print "cp", seed, outdir - + if not os.path.exists(outdir): + os.makedirs(outdir) + print "Copying seeds.." + for seed in selected: + print seed + shutil.copy(seed, outdir) diff --git a/vd b/vd index 43f4b56..f97c871 100755 --- a/vd +++ b/vd @@ -32,7 +32,7 @@ sys.setrecursionlimit(1024*1024*1024) #from vdiscover.Detection import WriteTestcase from vdiscover.Process import Process from vdiscover.Printer import TypePrinter -from vdiscover.Cluster import ClusterScikit, ClusterConv +#from vdiscover.Cluster import ClusterScikit, ClusterConv from vdiscover.Utils import update_progress from vdiscover.Sampling import cluster_sampler @@ -60,7 +60,9 @@ if __name__ == "__main__": #vectorizer = options.vectorizer program = cmd.split(" ")[0] programf = program.replace("/","__") - timeout = 5 + main_module = program.split("/")[-1] + + timeout = 15 envs = dict() traces_path = outfile#outdir+"/traces.raw" @@ -68,7 +70,16 @@ if __name__ == "__main__": traces = traces_path else: - app = Process(program, envs, timeout, [], [], True) + #app = Process(program, envs, timeout, ["libpixman-1.so","libcairo.so.2","libpango"], [], True) + modules_to_trace = [main_module] + if "LD_LIBRARY_PATH" in os.environ: + libs = os.environ["LD_LIBRARY_PATH"] + for _,_,files in os.walk(libs): + for f in files: + modules_to_trace.append(f) + + print "Tracing", modules_to_trace + app = Process(program, envs, timeout, modules_to_trace, [], True) prt = TypePrinter(traces_path, program, 0) traces = [] all_files = [] @@ -85,13 +96,15 @@ if __name__ == "__main__": for progress,testcase in enumerate(all_files): #print testcase - progress = round(float(progress)/nfiles, 2) + progress = round(float(progress)/nfiles,4) update_progress(progress) prepared_cmd = cmd.replace(program,"") prepared_cmd = prepared_cmd.split("@@") prepared_cmd = prepared_cmd[0].split(" ") + [testcase] + prepared_cmd[1].split(" ") prepared_cmd = filter(lambda x: x<>'', prepared_cmd) + #print "Getting data.." events = app.getData(prepared_cmd) + #print "Printing data.. ", len(events) traces.append(prt.print_events(testcase,events)) #print prepared_cmd #print traces[-1] diff --git a/vdiscover/Cluster.py b/vdiscover/Cluster.py index ec91943..d082789 100644 --- a/vdiscover/Cluster.py +++ b/vdiscover/Cluster.py @@ -24,9 +24,12 @@ import subprocess import pickle import numpy as np -import matplotlib.pyplot as plt import matplotlib as mpl +# hack from https://stackoverflow.com/questions/2801882/generating-a-png-with-matplotlib-when-display-is-undefined to avoid using X +mpl.use('Agg') +import matplotlib.pyplot as plt + from Utils import * from Pipeline import * @@ -111,13 +114,17 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir): colors = mpl.colors.cnames.keys() progs = list(set(labels)) ncolors = len(colors) - + size = len(labels) + print "Plotting.." + for prog,[x,y] in zip(labels, X_red): + #for prog,[x,y] in sample(zip(labels, X_red), min(size, 1000)): x = gauss(0,0.05) + x y = gauss(0,0.05) + y color = 'r' plt.scatter(x, y, c=color ) + """ if valid_file is not None: valid_programs, valid_features, valid_classes = read_traces(valid_file, None, cut=None, maxsize=window_size) #None) valid_dict = dict() @@ -134,13 +141,16 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir): y = gauss(0,0.05) + y plt.scatter(x, y, c='b') plt.text(x, y+0.02, prog.split("/")[-1]) - + plt.show() - #plt.savefig("plot.png") - + """ + plt.savefig(train_file.replace(".gz","")+".png") + print "Bandwidth estimation.." from sklearn.cluster import MeanShift, estimate_bandwidth - bandwidth = estimate_bandwidth(X_red, quantile=0.2) + + X_red_sample = X_red[:min(size, 1000)] + bandwidth = estimate_bandwidth(X_red_sample, quantile=0.2) print "Clustering with bandwidth:", bandwidth #X_red = np.vstack((X_red,X_red_valid)) @@ -150,16 +160,17 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir): print X_red.shape, len(X_red), len(labels) #print valid_labels - af = MeanShift(bandwidth=bandwidth/5).fit(X_red) + af = MeanShift(bandwidth=bandwidth/1).fit(X_red) cluster_centers = af.cluster_centers_ cluster_labels = af.labels_ n_clusters = len(cluster_centers) - + plt.figure() for ([x,y],label, cluster_label) in zip(X_red,labels, cluster_labels): - #x = gauss(0,0.1) + x - #y = gauss(0,0.1) + y + #for ([x,y],label, cluster_label) in sample(zip(X_red,labels, cluster_labels), min(size, 1000)): + x = gauss(0,0.1) + x + y = gauss(0,0.1) + y plt.scatter(x, y, c = colors[cluster_label % ncolors]) #print label #if label in valid_labels: @@ -169,6 +180,7 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir): plt.plot(x, y, 'o', markerfacecolor=colors[i % ncolors], markeredgecolor='k', markersize=7) + """ #for prog,[x,y] in zip(valid_labels, X_red_valid): #x = gauss(0,0.1) + x #y = gauss(0,0.1) + y @@ -180,6 +192,9 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir): #plt.savefig("clusters.png") plt.show() + """ + plt.savefig(train_file.replace(".gz","")+".clusters.png") + clustered_traces = zip(labels, cluster_labels) writer = open_csv(train_file.replace(".gz","")+".clusters") for label, cluster in clustered_traces: diff --git a/vdiscover/Pipeline.py b/vdiscover/Pipeline.py index 3730609..28b0540 100644 --- a/vdiscover/Pipeline.py +++ b/vdiscover/Pipeline.py @@ -236,7 +236,7 @@ def preprocess_traces(self, X_data, y_data=None, labels=None): cut_X_data = [] cut_label_data = [] cut_y_data = [] - rep = 5 + #rep = 5 X_size = len(X_data) @@ -249,6 +249,7 @@ def preprocess_traces(self, X_data, y_data=None, labels=None): size = len(trace) rep = 1 + int(float(size) / float(self.max_len)) + rep = min(rep, 10) for _ in range(rep): diff --git a/vdiscover/Printer.py b/vdiscover/Printer.py index fd25718..83b32da 100644 --- a/vdiscover/Printer.py +++ b/vdiscover/Printer.py @@ -91,7 +91,7 @@ def print_events(self, label, events): trace = "" for x,y in events: - trace = trace+x+"="+y+" " + trace = trace + ("%s=%s " % (x,y)) row = [self.pname+":"+label,trace] diff --git a/vdiscover/Process.py b/vdiscover/Process.py index 2ffb779..a7e9bbc 100644 --- a/vdiscover/Process.py +++ b/vdiscover/Process.py @@ -45,7 +45,7 @@ from Alarm import alarm_handler, TimeoutEx class Process(Application): - def __init__(self, program, envs, timeout, included_mods = [], ignored_mods = [], no_stdout = True): + def __init__(self, program, envs, timeout, included_mods = [], ignored_mods = [], no_stdout = True, max_events = 10000): Application.__init__(self) # no effect @@ -63,7 +63,7 @@ def __init__(self, program, envs, timeout, included_mods = [], ignored_mods = [] self.pid = None self.mm = None self.timeouts = 0 - self.max_timeouts = 10 + self.max_events = max_events # Parse ELF self.elf = ELF(self.program, plt = False) @@ -77,6 +77,7 @@ def __init__(self, program, envs, timeout, included_mods = [], ignored_mods = [] self.last_signal = {} self.last_call = None self.crashed = False + self.nevents = 0 self.events = [] self.binfo = dict() @@ -146,7 +147,7 @@ def createEvents(self, signal): else: call = Call(name, module) - self.mm.update() + #self.mm.update() #print "updated mm" call.detect_parameters(self.process, self.mm) breakpoint.desinstall(set_ip=True) @@ -259,9 +260,11 @@ def cont(self, signum=None): signal = self.debugger.waitSignals() process = signal.process events = self.createEvents(signal) - vulns = self.DetectVulnerabilities(self.events, events) + + #vulns = self.DetectVulnerabilities(self.events, events) #print "vulns detected" - self.events = self.events + events + vulns + self.events = self.events + events #+ vulns + self.nevents = self.nevents + len(events) def readInstrSize(self, address, default_size=None): @@ -328,11 +331,18 @@ def runProcess(self, cmd): # Set the breakpoints self.breakpoint(self.elf.GetEntrypoint()) + #print hex(self.elf.GetEntrypoint()) try: while True: + #self.cont() - if not self.debugger or self.crashed: + if self.nevents > self.max_events: + + self.events.append(Timeout(timeout)) + alarm(0) + return + elif not self.debugger or self.crashed: # There is no more process: quit alarm(0) return @@ -371,6 +381,7 @@ def runProcess(self, cmd): def getData(self, inputs): self.events = [] + self.nevents = 0 self.debugger = PtraceDebugger() self.runProcess([self.program]+inputs) @@ -389,7 +400,3 @@ def getData(self, inputs): self.process = None return self.events - - - def timeouted(self): - return self.timeouts >= self.max_timeouts diff --git a/vdiscover/Sampling.py b/vdiscover/Sampling.py index 6d26ac2..23d5ac4 100644 --- a/vdiscover/Sampling.py +++ b/vdiscover/Sampling.py @@ -28,8 +28,14 @@ def cluster_sampler(clustered_traces, n_per_cluster): clusters[cluster] = clusters.get(cluster, []) + [label.split(":")[-1]] selected = set() + tmp = set() + for (cluster, seeds) in clusters.items(): n_sample = min(len(seeds), n_per_cluster) - selected.update(set(random.sample(seeds, n_sample))) + tmp = set(seeds).intersection(selected) + if len(tmp) >= n_sample: + selected.update(set(random.sample(tmp, n_sample))) + else: + selected.update(set(random.sample(seeds, n_sample))) return selected