From 0650fdb7bd39f5fd78faf4ee400acd9f10b094eb Mon Sep 17 00:00:00 2001 From: gaa-cifasis Date: Thu, 29 Oct 2015 19:31:01 +0000 Subject: [PATCH] first steps to have seed sampler --- vd | 24 ++++++++++++++---------- vdiscover/Cluster.py | 4 ++-- vdiscover/Sampling.py | 32 ++++++++++++++++++++++++++++++++ vpredictor | 33 ++++++++++++++++++++++++--------- 4 files changed, 72 insertions(+), 21 deletions(-) create mode 100644 vdiscover/Sampling.py diff --git a/vd b/vd index 50a5f24..7df977d 100755 --- a/vd +++ b/vd @@ -32,8 +32,10 @@ sys.setrecursionlimit(1024*1024*1024) #from vdiscover.Detection import WriteTestcase from vdiscover.Process import Process from vdiscover.Printer import TypePrinter -from vdiscover.Cluster import PlotDeepRepr +from vdiscover.Cluster import ClusterScikit, ClusterConv from vdiscover.Utils import update_progress +from vdiscover.Sampling import cluster_sampler + if __name__ == "__main__": @@ -46,7 +48,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description='') parser.add_argument("-i", help="", type=str, default=None, required=True, dest="seeds") parser.add_argument("-o", help="", type=str, default=None, required=True, dest="out") - #parser.add_argument("-v", help="", type=str, default=None, required=True, dest="vectorizer") + parser.add_argument("-v", help="", type=str, default=None, required=False, dest="vectorizer") #parser.add_argument("-m", help="", type=str, default="afl", dest="fuzzer") parser.add_argument("cmd", help="", type=str, default=None) @@ -55,7 +57,7 @@ if __name__ == "__main__": outfile = options.out #fuzzer = options.fuzzer cmd = options.cmd - #vectorizer = options.vectorizer + vectorizer = options.vectorizer program = cmd.split(" ")[0] programf = program.replace("/","__") timeout = 5 @@ -66,7 +68,7 @@ if __name__ == "__main__": traces = traces_path else: - app = Process(program, envs, timeout, [], [], True) + app = Process(program, envs, timeout, ["libcairo"], [], True) prt = TypePrinter(traces_path, 
program, 0) traces = [] all_files = [] @@ -93,10 +95,12 @@ if __name__ == "__main__": traces.append(prt.print_events(testcase,events)) #print prepared_cmd #print traces[-1] - - #clustered_traces = PlotDeepRepr(vectorizer, traces, None, "dynamic", None, outdir) - #clusters = dict() - #for label, cluster in clustered_traces: - # clusters[cluster] = clusters.get(cluster, []) + [label] - #print clusters + + if vectorizer is None: + clustered_traces = ClusterScikit(vectorizer, traces, None, "dynamic", None) + else: + clustered_traces = ClusterConv(vectorizer, traces, None, "dynamic", None, None) + + cluster_sampler(clustered_traces,1) + #print clusters diff --git a/vdiscover/Cluster.py b/vdiscover/Cluster.py index 38905cb..43dff09 100644 --- a/vdiscover/Cluster.py +++ b/vdiscover/Cluster.py @@ -31,7 +31,7 @@ from Utils import * from Pipeline import * -def PlotDeepRepr(model_file, train_file, valid_file, ftype, nsamples, outdir): +def ClusterConv(model_file, train_file, valid_file, ftype, nsamples, outdir): f = open(model_file+".pre") preprocessor = pickle.load(f) @@ -139,7 +139,7 @@ def PlotDeepRepr(model_file, train_file, valid_file, ftype, nsamples, outdir): plt.title('Estimated number of clusters: %d' % n_clusters) - plb.savefig(outdir+"/plot.png") + #plb.savefig(outdir+"/plot.png") plt.show() return zip(labels, cluster_labels) diff --git a/vdiscover/Sampling.py b/vdiscover/Sampling.py new file mode 100644 index 0000000..06e5f58 --- /dev/null +++ b/vdiscover/Sampling.py @@ -0,0 +1,32 @@ +""" +This file is part of VDISCOVER. + +VDISCOVER is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +VDISCOVER is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with VDISCOVER. If not, see <http://www.gnu.org/licenses/>. + +Copyright 2014 by G.Grieco +""" + +import random +import copy + +def cluster_sampler(clustered_traces, n_per_cluster): + #cc = copy.copy(clusters) + #n_per_cluster = 1#n / len(cc) + clusters = dict() + for label, cluster in clustered_traces: + clusters[cluster] = clusters.get(cluster, []) + [label.split(":")[-1]] + + print "Selecting", len(clusters), "seeds" + for (cluster, seeds) in clusters.items(): + print ",".join(random.sample(seeds, n_per_cluster)) diff --git a/vpredictor b/vpredictor index a9674cf..406e784 100755 --- a/vpredictor +++ b/vpredictor @@ -30,7 +30,7 @@ sys.setrecursionlimit(1024*1024*1024) from vdiscover.Pipeline import * from vdiscover.Recall import Recall from vdiscover.Train import Train -from vdiscover.Cluster import Cluster +from vdiscover.Cluster import ClusterScikit, ClusterConv if __name__ == "__main__": @@ -67,10 +67,13 @@ if __name__ == "__main__": help="Valid a model using infile", action="store", default=None) - parser.add_argument("--cluster", - help="Cluster input traces", + parser.add_argument("--cluster-bow", + help="Cluster input traces using BOW", action="store_true", default=False) + parser.add_argument("--cluster-conv", + help="Cluster input traces using a convolutional model", + action="store_true", default=False) parser.add_argument("--train-rf", help="Train a Random Forest using infile", @@ -97,8 +100,10 @@ if __name__ == "__main__": training_mode_rf = options.train_rf training_mode_lstm = options.train_lstm - training_mode_cluster = options.cluster - training_mode = training_mode_rf or training_mode_lstm or training_mode_cluster + training_mode_cluster_bow = options.cluster_bow + training_mode_cluster_conv = options.cluster_conv + + training_mode = training_mode_rf or training_mode_lstm or training_mode_cluster_bow or training_mode_cluster_conv probability_mode = 
options.prob nsamples = options.n_samples @@ -120,10 +125,20 @@ if __name__ == "__main__": if training_mode: if training_mode_rf: Train(out_file, in_file, valid_file, "rf", ftype, nsamples) - elif training_mode_lstm: - Train(out_file, in_file, valid_file, "lstm", ftype, nsamples) - elif training_mode_cluster: - Cluster(in_file, valid_file, ftype, nsamples) + #elif training_mode_: + # Train(out_file, in_file, valid_file, "lstm", ftype, nsamples) + elif training_mode_cluster_bow: + #Cluster(in_file, valid_file, ftype, nsamples) + ClusterScikit(None, in_file, valid_file, ftype, nsamples) + elif training_mode_cluster_conv: + #Cluster(in_file, valid_file, ftype, nsamples) + if (model_file is None): + print "Clustering using a convolutional model requires a pre-trained model" + exit(-1) + + ClusterConv(model_file, in_file, valid_file, ftype, nsamples, None) + + else: if model_file is None: print "VDiscover requires a pre-trained model to predict"