From 2f1cfb31ae6120f00976d861e97b3f3e2a0ebde4 Mon Sep 17 00:00:00 2001 From: gustavo Date: Thu, 19 Jan 2017 16:48:32 -0300 Subject: [PATCH] autopep8, starting refactoring --- fextractor | 168 ++++---- setup.py | 11 +- tcreator | 84 ++-- tseeder | 26 +- vd | 112 ++--- vdiscover/Alarm.py | 8 +- vdiscover/Analysis.py | 79 ++-- vdiscover/Backtrace.py | 19 +- vdiscover/Cluster.py | 457 +++++++++++---------- vdiscover/Detection.py | 198 ++++----- vdiscover/ELF.py | 373 +++++++++-------- vdiscover/Event.py | 332 ++++++++------- vdiscover/Input.py | 155 +++---- vdiscover/MemoryMap.py | 180 ++++---- vdiscover/Misc.py | 35 +- vdiscover/Mutation.py | 317 +++++++------- vdiscover/Pipeline.py | 772 ++++++++++++++++++----------------- vdiscover/Printer.py | 110 ++--- vdiscover/Process.py | 354 ++++++++-------- vdiscover/RandomWalk.py | 289 ++++++------- vdiscover/Recall.py | 87 ++-- vdiscover/Run.py | 89 ++-- vdiscover/Sampling.py | 37 +- vdiscover/Spec.py | 8 +- vdiscover/Train.py | 244 +++++------ vdiscover/Types.py | 95 +++-- vdiscover/Utils.py | 250 ++++++------ vdiscover/Vulnerabilities.py | 56 +-- vdp | 94 +++-- vpredictor | 150 ++++--- 30 files changed, 2740 insertions(+), 2449 deletions(-) diff --git a/fextractor b/fextractor index d5198ca..9ad6ca4 100755 --- a/fextractor +++ b/fextractor @@ -25,7 +25,7 @@ import csv import sys import random -from vdiscover.Detection import GetArgs, GetFiles, GetCmd +from vdiscover.Detection import GetArgs, GetFiles, GetCmd # static feature extraction @@ -33,13 +33,11 @@ from vdiscover.RandomWalk import RandomWalkElf # dynamic feature extraction -from vdiscover.Process import Process -from vdiscover.Mutation import NullMutator, RandomByteMutator, RandomExpanderMutator, RandomInputMutator -from vdiscover.Printer import TypePrinter -from vdiscover.Misc import readmodfile -from vdiscover.Input import prepare_inputs - - +from vdiscover.Process import Process +from vdiscover.Mutation import NullMutator, RandomByteMutator, RandomExpanderMutator, RandomInputMutator +from vdiscover.Printer import TypePrinter +from vdiscover.Misc import readmodfile +from vdiscover.Input import prepare_inputs if __name__ == "__main__": @@ -48,7 +46,8 @@ if __name__ == "__main__": random.seed() # To help argparse to detect the number of columns correctly - #os.environ['COLUMNS'] = str(os.popen('stty size', 'r').read().split()[1]) #str(shutil.get_terminal_size().columns) + # os.environ['COLUMNS'] = str(os.popen('stty size', + # 'r').read().split()[1]) #str(shutil.get_terminal_size().columns) if open("/proc/sys/kernel/randomize_va_space").read().strip() != "0": print("Address space layout randomization (ASLR) is enabled, disable it before continue to use the cache") @@ -56,8 +55,10 @@ if __name__ == "__main__": sys.exit(-1) # Arguments - parser = argparse.ArgumentParser(description='Feature extraction of VDiscover') - parser.add_argument("testcase", help="Testcase to analyze", type=str, default=None) + parser = argparse.ArgumentParser( + description='Feature extraction of VDiscover') + parser.add_argument( + "testcase", help="Testcase to analyze", type=str, default=None) parser.add_argument("--static", help="Extract only static features from an executable", @@ -67,40 +68,65 @@ if __name__ == "__main__": help="Extract only dynamic features from a testcase", action="store_true", default=False) - parser.add_argument("--mclass", type=str, - help="Include class column, to use later in training mode", - action="store", default=None) + parser.add_argument( + "--mclass", + type=str, + help="Include 
class column, to use later in training mode", + action="store", + default=None) parser.add_argument("--out-file", help="File to output the extracted features", type=str, default="/dev/stdout") - parser.add_argument("--max-subtraces-collected", type=int, - help="Maximum number of subtraces collected (static features only)", default=100) - - parser.add_argument("--max-subtraces-explored", type=int, - help="Maximum number of subtraces explored (static features only)", default=10000) - - parser.add_argument("--min-subtrace-size", type=int, - help="Minumum number of events in each subtrace collected (static features only)", default=3) - - parser.add_argument("--show-stdout", - help="Don't use /dev/null as stdout/stderr (dynamic features only)", - action="store_true", default=False) - - parser.add_argument("--inc-mods", - help="Only extract features from the libraries matching the strings inside this file (dynamic features only)", - type=str, default=None) - - parser.add_argument("--ign-mods", - help="Ignore extracted features from the libraries matching the string inside this file (dynamic features only)", - type=str, default=None) - - parser.add_argument("--timeout", dest="timeout", type=int, - help="Timeout in seconds (dynamic features only)", default=3) - - parser.add_argument("--max-mutations", type=int, - help="Maximum number of mutations to the original testcase (dynamic features only)", default=0) + parser.add_argument( + "--max-subtraces-collected", + type=int, + help="Maximum number of subtraces collected (static features only)", + default=100) + + parser.add_argument( + "--max-subtraces-explored", + type=int, + help="Maximum number of subtraces explored (static features only)", + default=10000) + + parser.add_argument( + "--min-subtrace-size", + type=int, + help="Minumum number of events in each subtrace collected (static features only)", + default=3) + + parser.add_argument( + "--show-stdout", + help="Don't use /dev/null as stdout/stderr (dynamic features only)", + action="store_true", + default=False) + + parser.add_argument( + "--inc-mods", + help="Only extract features from the libraries matching the strings inside this file (dynamic features only)", + type=str, + default=None) + + parser.add_argument( + "--ign-mods", + help="Ignore extracted features from the libraries matching the string inside this file (dynamic features only)", + type=str, + default=None) + + parser.add_argument( + "--timeout", + dest="timeout", + type=int, + help="Timeout in seconds (dynamic features only)", + default=3) + + parser.add_argument( + "--max-mutations", + type=int, + help="Maximum number of mutations to the original testcase (dynamic features only)", + default=0) options = parser.parse_args() testcase = options.testcase @@ -108,13 +134,14 @@ if __name__ == "__main__": static_only = options.static dynamic_only = options.dynamic - if (not static_only and not dynamic_only) or (static_only and dynamic_only): - print "The feature extraction requires to select either static of dynamic features exclusively" - exit(-1) + if (not static_only and not dynamic_only) or ( + static_only and dynamic_only): + print "The feature extraction requires to select either static of dynamic features exclusively" + exit(-1) max_subtraces_collected = options.max_subtraces_collected - max_subtraces_explored = options.max_subtraces_explored - min_subtrace_size = options.min_subtrace_size + max_subtraces_explored = options.max_subtraces_explored + min_subtrace_size = options.min_subtrace_size incmodfile = options.inc_mods 
ignmodfile = options.ign_mods @@ -135,39 +162,42 @@ if __name__ == "__main__": if static_only: - RandomWalkElf(program, csvfile, mclass, max_subtraces_collected, max_subtraces_explored, min_subtrace_size) + RandomWalkElf(program, csvfile, mclass, max_subtraces_collected, + max_subtraces_explored, min_subtrace_size) elif dynamic_only: - os.chdir("inputs") + os.chdir("inputs") - envs = dict() - args = GetArgs() - files = GetFiles() + envs = dict() + args = GetArgs() + files = GetFiles() - original_inputs = RandomInputMutator(args + files, NullMutator) - #expanded_input_generator = RandomInputMutator(args + files, RandomExpanderMutator) - mutated_input_generator = RandomInputMutator(args + files, RandomByteMutator) - if included_mods == []: - included_mods = [program] + original_inputs = RandomInputMutator(args + files, NullMutator) + #expanded_input_generator = RandomInputMutator(args + files, RandomExpanderMutator) + mutated_input_generator = RandomInputMutator( + args + files, RandomByteMutator) + if included_mods == []: + included_mods = [program] - app = Process(program, envs, timeout, included_mods, ignored_mods, no_stdout = not show_stdout ) - prt = TypePrinter(csvfile, testcase, mclass) + app = Process(program, envs, timeout, included_mods, + ignored_mods, no_stdout=not show_stdout) + prt = TypePrinter(csvfile, testcase, mclass) - # unchanged input - null_mutt, original_input = original_inputs.next() - original_events = app.getData(prepare_inputs(original_input)) + # unchanged input + null_mutt, original_input = original_inputs.next() + original_events = app.getData(prepare_inputs(original_input)) - if original_events is None: - print "Execution of",program,"failed!" - exit(-1) + if original_events is None: + print "Execution of", program, "failed!" 
+ exit(-1) - prt.print_events(program,original_events) + prt.print_events(program, original_events) - for (i, (d, mutated)) in enumerate(mutated_input_generator): + for (i, (d, mutated)) in enumerate(mutated_input_generator): - if i >= max_mut: - break + if i >= max_mut: + break - events = app.getData(prepare_inputs(mutated)) - prt.print_events(program,events) + events = app.getData(prepare_inputs(mutated)) + prt.print_events(program, events) diff --git a/setup.py b/setup.py index 5690720..9f798a2 100755 --- a/setup.py +++ b/setup.py @@ -12,10 +12,13 @@ url='http://vdiscover.org/', author='G.Grieco', author_email='gg@cifasis-conicet.gov.ar', - scripts=['fextractor', 'vpredictor', 'tcreator', 'tseeder', 'vd'], + scripts=[ + 'fextractor', + 'vpredictor', + 'tcreator', + 'tseeder', + 'vd'], install_requires=[ "python-ptrace", - "scikit-learn" - ], + "scikit-learn"], ) - diff --git a/tcreator b/tcreator index f3edd73..c65f05a 100755 --- a/tcreator +++ b/tcreator @@ -25,61 +25,73 @@ import sys import csv from vdiscover.Detection import WriteTestcase -concatenate = lambda *lists: reduce((lambda a,b: a.extend(b) or a),lists,[]) +from functools import reduce +concatenate = lambda *lists: reduce((lambda a, b: a.extend(b) or a), lists, []) if __name__ == "__main__": # Arguments - parser = argparse.ArgumentParser(description='A small utility to create new test cases using a name and a command line') - parser.add_argument("--name", help="The name of the ", type=str, default=None) - parser.add_argument("--cmd", help="Command-line to execute", type=str, default=None) - parser.add_argument("--batch", help="A csv with the command lines", type=str, default=None) - - parser.add_argument("--copy", help="Force the copy of the files in command lines instead of symbolic linking", action='store_true', default=False) - - parser.add_argument("outdir", help="Output directory to write testcases", type=str, default=None) + parser = argparse.ArgumentParser( + description='A small utility to create new test cases using a name and a command line') + parser.add_argument("--name", help="The name of the ", + type=str, default=None) + parser.add_argument( + "--cmd", help="Command-line to execute", type=str, default=None) + parser.add_argument( + "--batch", help="A csv with the command lines", type=str, default=None) + + parser.add_argument( + "--copy", + help="Force the copy of the files in command lines instead of symbolic linking", + action='store_true', + default=False) + + parser.add_argument( + "outdir", + help="Output directory to write testcases", + type=str, + default=None) options = parser.parse_args() name = options.name cmd = options.cmd in_file = options.batch copy = options.copy - out_dir= options.outdir + out_dir = options.outdir if (name is not None and cmd is not None) ^ (in_file is not None): - pass + pass else: - #or (name not is None and cmd is not None) and in_file is None: - print "Either name and command should be used or an input file" - exit(-1) + # or (name not is None and cmd is not None) and in_file is None: + print "Either name and command should be used or an input file" + exit(-1) try: - os.makedirs(out_dir) + os.makedirs(out_dir) except: - pass + pass if in_file is not None: - infile = open(in_file,"r") - csvreader = csv.reader(infile, delimiter='\t') - os.chdir(out_dir) + infile = open(in_file, "r") + csvreader = csv.reader(infile, delimiter='\t') + os.chdir(out_dir) - for i,row in enumerate(csvreader): - args = filter(lambda x: x is not '', row[0].split(" ")) - name = 
args[0].replace("/","_")+":"+str(i) - WriteTestcase(name,args[0],args[1:], copy) + for i, row in enumerate(csvreader): + args = filter(lambda x: x is not '', row[0].split(" ")) + name = args[0].replace("/", "_") + ":" + str(i) + WriteTestcase(name, args[0], args[1:], copy) else: - os.chdir(out_dir) - args = cmd.split("'") - args = map(lambda x: x.split(" "), args) - pargs = [] - - for arg in args: - if arg <> '': - pargs = pargs + arg - #args = concatenate(args) - print "Procesing '" + " ".join(pargs) + "'" - #args = filter(lambda x: x is not '', cmd.split(" ")) - WriteTestcase(name,pargs[0],pargs[1:], copy) - + os.chdir(out_dir) + args = cmd.split("'") + args = map(lambda x: x.split(" "), args) + pargs = [] + + for arg in args: + if arg != '': + pargs = pargs + arg + #args = concatenate(args) + print "Procesing '" + " ".join(pargs) + "'" + #args = filter(lambda x: x is not '', cmd.split(" ")) + WriteTestcase(name, pargs[0], pargs[1:], copy) diff --git a/tseeder b/tseeder index 1f7d8fa..6a13991 100755 --- a/tseeder +++ b/tseeder @@ -33,10 +33,20 @@ csv.field_size_limit(sys.maxsize) if __name__ == "__main__": # Arguments - parser = argparse.ArgumentParser(description='A small utility to perform seed selection for fuzzig') - parser.add_argument("infile", help="A csv with the features to train or predict", type=str, default=None) - parser.add_argument("outdir", help="A directory with the seeds", type=str, default=None) - parser.add_argument("-n", help="Number of seeds to select per cluster", type=int, default=1) + parser = argparse.ArgumentParser( + description='A small utility to perform seed selection for fuzzig') + parser.add_argument( + "infile", + help="A csv with the features to train or predict", + type=str, + default=None) + parser.add_argument( + "outdir", help="A directory with the seeds", type=str, default=None) + parser.add_argument( + "-n", + help="Number of seeds to select per cluster", + type=int, + default=1) #parser.add_argument("--random", help="Sample randomly", action="store_true", default=None) options = parser.parse_args() @@ -47,14 +57,14 @@ if __name__ == "__main__": reader = load_csv(in_file) clusters = [] for [label, cluster] in reader: - clusters.append((label.split(":")[-1], cluster)) + clusters.append((label.split(":")[-1], cluster)) selected = cluster_sampler(clusters, nseeds) if not os.path.exists(outdir): - os.makedirs(outdir) + os.makedirs(outdir) print "Copying seeds.." 
for seed in selected: - print seed - shutil.copy(seed, outdir) + print seed + shutil.copy(seed, outdir) diff --git a/vd b/vd index 26d7cae..b8304b0 100755 --- a/vd +++ b/vd @@ -24,21 +24,20 @@ import os.path import argparse import sys import csv -import random +import random csv.field_size_limit(sys.maxsize) -sys.setrecursionlimit(1024*1024*1024) +sys.setrecursionlimit(1024 * 1024 * 1024) #from vdiscover.Detection import WriteTestcase -from vdiscover.Process import Process -from vdiscover.Printer import TypePrinter +from vdiscover.Process import Process +from vdiscover.Printer import TypePrinter #from vdiscover.Cluster import ClusterScikit, ClusterConv from vdiscover.Utils import update_progress -from vdiscover.Sampling import cluster_sampler +from vdiscover.Sampling import cluster_sampler if __name__ == "__main__": - if open("/proc/sys/kernel/randomize_va_space").read().strip() != "0": print("Address space layout randomization (ASLR) is enabled, disable it before continue to use the cache") print("Hint: # echo 0 > /proc/sys/kernel/randomize_va_space") @@ -46,8 +45,10 @@ if __name__ == "__main__": # Arguments parser = argparse.ArgumentParser(description='') - parser.add_argument("-i", help="", type=str, default=None, required=True, dest="seeds") - parser.add_argument("-o", help="", type=str, default=None, required=True, dest="out") + parser.add_argument("-i", help="", type=str, + default=None, required=True, dest="seeds") + parser.add_argument("-o", help="", type=str, + default=None, required=True, dest="out") parser.add_argument("-m", help="", type=str, nargs='+', dest="mods") #parser.add_argument("-v", help="", type=str, default=None, required=False, dest="vectorizer") @@ -62,62 +63,63 @@ if __name__ == "__main__": cmd = options.cmd #vectorizer = options.vectorizer program = cmd.split(" ")[0] - programf = program.replace("/","__") + programf = program.replace("/", "__") main_module = program.split("/")[-1] timeout = 15 envs = dict() - traces_path = outfile#outdir+"/traces.raw" + traces_path = outfile # outdir+"/traces.raw" if os.path.exists(traces_path): - print traces_path, "exists. I will not overwritte it. Aborting" + print traces_path, "exists. I will not overwritte it. Aborting" else: - modules_to_trace = [main_module] - if mods is not None: - modules_to_trace = modules_to_trace + mods - - if "LD_LIBRARY_PATH" in os.environ: - libs = os.environ["LD_LIBRARY_PATH"] - for _,_,files in os.walk(libs): - for f in files: - modules_to_trace.append(f) - - print "Tracing", modules_to_trace - app = Process(program, envs, timeout, modules_to_trace, [], True) - prt = TypePrinter(traces_path, program, 0) - traces = [] - all_files = [] - - print "Extracting traces.." - for x,y,files in os.walk(seeds): - nfiles = len(files) - #print "Processing directory ","./"++("/".join(y)), "with", nfiles, "seeds" - for f in files: - all_files.append(x+"/".join(y)+f) - - random.shuffle(all_files) - nfiles = len(all_files) - - for progress,testcase in enumerate(all_files): - #print testcase - progress = round(float(progress)/nfiles,4) - update_progress(progress) - prepared_cmd = cmd.replace(program,"") - prepared_cmd = prepared_cmd.split("@@") - prepared_cmd = prepared_cmd[0].split(" ") + [testcase] + prepared_cmd[1].split(" ") - prepared_cmd = filter(lambda x: x<>'', prepared_cmd) - #print "Getting data.." - events = app.getData(prepared_cmd) - #print "Printing data.. 
", len(events) - traces.append(prt.print_events(testcase,events)) - #print prepared_cmd - #print traces[-1] - - - #if vectorizer is None: + modules_to_trace = [main_module] + if mods is not None: + modules_to_trace = modules_to_trace + mods + + if "LD_LIBRARY_PATH" in os.environ: + libs = os.environ["LD_LIBRARY_PATH"] + for _, _, files in os.walk(libs): + for f in files: + modules_to_trace.append(f) + + print "Tracing", modules_to_trace + app = Process(program, envs, timeout, modules_to_trace, [], True) + prt = TypePrinter(traces_path, program, 0) + traces = [] + all_files = [] + + print "Extracting traces.." + for x, y, files in os.walk(seeds): + nfiles = len(files) + # print "Processing directory ","./"++("/".join(y)), "with", + # nfiles, "seeds" + for f in files: + all_files.append(x + "/".join(y) + f) + + random.shuffle(all_files) + nfiles = len(all_files) + + for progress, testcase in enumerate(all_files): + # print testcase + progress = round(float(progress) / nfiles, 4) + update_progress(progress) + prepared_cmd = cmd.replace(program, "") + prepared_cmd = prepared_cmd.split("@@") + prepared_cmd = prepared_cmd[0].split( + " ") + [testcase] + prepared_cmd[1].split(" ") + prepared_cmd = filter(lambda x: x != '', prepared_cmd) + # print "Getting data.." + events = app.getData(prepared_cmd) + # print "Printing data.. ", len(events) + traces.append(prt.print_events(testcase, events)) + # print prepared_cmd + # print traces[-1] + + # if vectorizer is None: # clustered_traces = ClusterScikit(vectorizer, traces, None, "dynamic", None) - #else: + # else: # clustered_traces = ClusterConv(vectorizer, traces, None, "dynamic", None, None) # cluster_sampler(clustered_traces,1) # #print clusters diff --git a/vdiscover/Alarm.py b/vdiscover/Alarm.py index 2cf7373..6390cf6 100644 --- a/vdiscover/Alarm.py +++ b/vdiscover/Alarm.py @@ -19,10 +19,10 @@ import signal -class TimeoutEx(Exception): - pass -def alarm_handler(signum, frame): - raise TimeoutEx +class TimeoutEx(Exception): + pass +def alarm_handler(signum, frame): + raise TimeoutEx diff --git a/vdiscover/Analysis.py b/vdiscover/Analysis.py index b835b5a..c4e05ee 100644 --- a/vdiscover/Analysis.py +++ b/vdiscover/Analysis.py @@ -22,47 +22,48 @@ from Types import Type from ptrace.error import PtraceError + def FindModule(value, mm): - return mm.findModule(value) + return mm.findModule(value) def RefinePType(ptype, value, process, mm): - if value is None: - return (Type("Top32",4), value) - - if str(ptype) == "Ptr32": - ptr = value - if ptr == 0x0: - return (Type("NPtr32",4), ptr) - else: - - try: - _ = process.readBytes(ptr, 4) - except PtraceError: - return (Type("DPtr32",4), ptr) - - mm.checkPtr(ptr) - if mm.isStackPtr(ptr): - return (Type("SPtr32",4), ptr) - elif mm.isHeapPtr(ptr): - return (Type("HPtr32",4), ptr) - elif mm.isCodePtr(ptr): - return (Type("GxPtr32",4), ptr) - elif mm.isFilePtr(ptr): - return (Type("FPtr32",4), ptr) - elif mm.isGlobalPtr(ptr): - return (Type("GPtr32",4), ptr) - else: - return (Type("Ptr32",4), ptr) - - elif str(ptype) == "Num32": - num = value - if num == 0x0: - return (Type("Num32B0",4), num) - else: - binlen = len(bin(num))-2 - binlen = int(ceil(binlen / 8.0))*8 - return (Type("Num32B"+str(binlen),4), num) - - return (Type("Top32",4), value) + if value is None: + return (Type("Top32", 4), value) + + if str(ptype) == "Ptr32": + ptr = value + if ptr == 0x0: + return (Type("NPtr32", 4), ptr) + else: + + try: + _ = process.readBytes(ptr, 4) + except PtraceError: + return (Type("DPtr32", 4), ptr) + + mm.checkPtr(ptr) 
+ if mm.isStackPtr(ptr): + return (Type("SPtr32", 4), ptr) + elif mm.isHeapPtr(ptr): + return (Type("HPtr32", 4), ptr) + elif mm.isCodePtr(ptr): + return (Type("GxPtr32", 4), ptr) + elif mm.isFilePtr(ptr): + return (Type("FPtr32", 4), ptr) + elif mm.isGlobalPtr(ptr): + return (Type("GPtr32", 4), ptr) + else: + return (Type("Ptr32", 4), ptr) + + elif str(ptype) == "Num32": + num = value + if num == 0x0: + return (Type("Num32B0", 4), num) + else: + binlen = len(bin(num)) - 2 + binlen = int(ceil(binlen / 8.0)) * 8 + return (Type("Num32B" + str(binlen), 4), num) + + return (Type("Top32", 4), value) diff --git a/vdiscover/Backtrace.py b/vdiscover/Backtrace.py index edbe8f0..3dc0566 100644 --- a/vdiscover/Backtrace.py +++ b/vdiscover/Backtrace.py @@ -4,6 +4,7 @@ from ptrace import PtraceError #from ptrace.six.moves import xrange + class BacktraceFrame(object): """ Backtrace frame. @@ -13,6 +14,7 @@ class BacktraceFrame(object): - name: name of the function - arguments: value of the arguments """ + def __init__(self, ip): self.ip = ip self.name = u"???" @@ -20,12 +22,15 @@ def __init__(self, ip): def __str__(self): arguments = (formatWordHex(arg) for arg in self.arguments) - return u"IP=%s: %s (%s)" % (formatAddress(self.ip), self.name, ", ".join(arguments)) + return u"IP=%s: %s (%s)" % (formatAddress(self.ip), + self.name, ", ".join(arguments)) + class Backtrace(object): """ Backtrace: all process frames since the start function. """ + def __init__(self): self.frames = [] self.truncated = False @@ -39,6 +44,7 @@ def __iter__(self): def __len__(self): return len(self.frames) + def getBacktrace(process, max_args=6, max_depth=20): """ Get the current backtrace of the specified process: @@ -74,9 +80,9 @@ def getBacktrace(process, max_args=6, max_depth=20): # Create frame frame = getBacktraceFrame(process, ip, fp, nargs) - - #print frame - #print hex(fp),hex(nextfp), hex(nargs) + + # print frame + # print hex(fp),hex(nextfp), hex(nargs) backtrace.append(frame) # End of the stack? 
@@ -84,7 +90,7 @@ def getBacktrace(process, max_args=6, max_depth=20): break # Move to next instruction/frame pointer - ip = process.readWord(fp+CPU_WORD_SIZE) + ip = process.readWord(fp + CPU_WORD_SIZE) if ip == CPU_MAX_UINT: # Linux hack to detect end of the stack break @@ -92,6 +98,7 @@ def getBacktrace(process, max_args=6, max_depth=20): depth += 1 return backtrace + def getBacktraceFrame(process, ip, fp, nargs): """ Get a backtrace frame: @@ -112,5 +119,3 @@ def getBacktraceFrame(process, ip, fp, nargs): # Ignore argument read error pass return frame - - diff --git a/vdiscover/Cluster.py b/vdiscover/Cluster.py index a1013b0..9993a3d 100644 --- a/vdiscover/Cluster.py +++ b/vdiscover/Cluster.py @@ -27,14 +27,14 @@ import matplotlib as mpl # hack from https://stackoverflow.com/questions/2801882/generating-a-png-with-matplotlib-when-display-is-undefined to avoid using X -#mpl.use('Agg') +# mpl.use('Agg') import matplotlib.pyplot as plt from Utils import * from Pipeline import * -#def Cluster(X, labels) +# def Cluster(X, labels) """ assert(len(X_red) == len(labels)) @@ -63,8 +63,7 @@ plt.title('Estimated number of clusters: %d' % n_clusters) """ -#return zip(labels, cluster_labels) - +# return zip(labels, cluster_labels) batch_size = 25 @@ -77,54 +76,67 @@ hidden_dims = 50 nb_epoch = 3 -def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir): - - f = open(model_file+".pre") - preprocessor = pickle.load(f) - - import h5py - f = h5py.File(model_file+".wei") - - layers = [] - for k in range(f.attrs['nb_layers']): - g = f['layer_{}'.format(k)] - layers.append([g['param_{}'.format(p)] for p in range(g.attrs['nb_params'])]) - max_features = len(preprocessor.tokenizer.word_counts) +def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir): - print "Reading and sampling data to train.." - train_programs, train_features, train_classes = read_traces(train_file, nsamples, cut=None) - train_size = len(train_features) + f = open(model_file + ".pre") + preprocessor = pickle.load(f) + + import h5py + f = h5py.File(model_file + ".wei") + + layers = [] + for k in range(f.attrs['nb_layers']): + g = f['layer_{}'.format(k)] + layers.append([g['param_{}'.format(p)] + for p in range(g.attrs['nb_params'])]) + + max_features = len(preprocessor.tokenizer.word_counts) + + print "Reading and sampling data to train.." 
+ train_programs, train_features, train_classes = read_traces( + train_file, nsamples, cut=None) + train_size = len(train_features) + + #y = train_programs + X_train, y_train, labels = preprocessor.preprocess_traces( + train_features, y_data=train_classes, labels=train_programs) + new_model = make_cluster_cnn( + "test", + max_features, + maxlen, + embedding_dims, + nb_filters, + filter_length, + hidden_dims, + None, + weights=layers) - #y = train_programs - X_train, y_train, labels = preprocessor.preprocess_traces(train_features, y_data=train_classes, labels=train_programs) - new_model = make_cluster_cnn("test", max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, None, weights=layers) + train_dict = dict() + train_dict[ftype] = new_model.predict(X_train) - train_dict = dict() - train_dict[ftype] = new_model.predict(X_train) + model = make_cluster_pipeline_subtraces(ftype) + X_red_comp = model.fit_transform(train_dict) + explained_var = np.var(X_red_comp, axis=0) + print explained_var - model = make_cluster_pipeline_subtraces(ftype) - X_red_comp = model.fit_transform(train_dict) - explained_var = np.var(X_red_comp, axis=0) - print explained_var + X_red = X_red_comp[:, 0:2] + X_red_next = X_red_comp[:, 2:4] - X_red = X_red_comp[:,0:2] - X_red_next = X_red_comp[:,2:4] + colors = mpl.colors.cnames.keys() + progs = list(set(labels)) + ncolors = len(colors) + size = len(labels) + print "Plotting.." - colors = mpl.colors.cnames.keys() - progs = list(set(labels)) - ncolors = len(colors) - size = len(labels) - print "Plotting.." - - for prog,[x,y] in zip(labels, X_red): - #for prog,[x,y] in sample(zip(labels, X_red), min(size, 1000)): - x = gauss(0,0.05) + x - y = gauss(0,0.05) + y - color = 'r' - plt.scatter(x, y, c=color ) + for prog, [x, y] in zip(labels, X_red): + # for prog,[x,y] in sample(zip(labels, X_red), min(size, 1000)): + x = gauss(0, 0.05) + x + y = gauss(0, 0.05) + y + color = 'r' + plt.scatter(x, y, c=color) - """ + """ if valid_file is not None: valid_programs, valid_features, valid_classes = read_traces(valid_file, None, cut=None, maxsize=window_size) #None) valid_dict = dict() @@ -141,46 +153,46 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir): y = gauss(0,0.05) + y plt.scatter(x, y, c='b') plt.text(x, y+0.02, prog.split("/")[-1]) - + plt.show() """ - plt.savefig(train_file.replace(".gz","")+".png") - print "Bandwidth estimation.." - from sklearn.cluster import MeanShift, estimate_bandwidth + plt.savefig(train_file.replace(".gz", "") + ".png") + print "Bandwidth estimation.." 
+ from sklearn.cluster import MeanShift, estimate_bandwidth + X_red_sample = X_red[:min(size, 1000)] + bandwidth = estimate_bandwidth(X_red_sample, quantile=0.2) + print "Clustering with bandwidth:", bandwidth - X_red_sample = X_red[:min(size, 1000)] - bandwidth = estimate_bandwidth(X_red_sample, quantile=0.2) - print "Clustering with bandwidth:", bandwidth - - #X_red = np.vstack((X_red,X_red_valid)) - #X_red_next = np.vstack((X_red_next,X_red_valid_next)) - #labels = labels + valid_labels + #X_red = np.vstack((X_red,X_red_valid)) + #X_red_next = np.vstack((X_red_next,X_red_valid_next)) + #labels = labels + valid_labels - print X_red.shape, len(X_red), len(labels) - #print valid_labels - - af = MeanShift(bandwidth=bandwidth/1).fit(X_red) + print X_red.shape, len(X_red), len(labels) + # print valid_labels - cluster_centers = af.cluster_centers_ - cluster_labels = af.labels_ - n_clusters = len(cluster_centers) - - plt.figure() - for ([x,y],label, cluster_label) in zip(X_red,labels, cluster_labels): - #for ([x,y],label, cluster_label) in sample(zip(X_red,labels, cluster_labels), min(size, 1000)): - x = gauss(0,0.1) + x - y = gauss(0,0.1) + y - plt.scatter(x, y, c = colors[cluster_label % ncolors]) - #print label - #if label in valid_labels: - # plt.text(x-0.05, y+0.01, label.split("/")[-1]) + af = MeanShift(bandwidth=bandwidth / 1).fit(X_red) - for i,[x,y] in enumerate(cluster_centers): - plt.plot(x, y, 'o', markerfacecolor=colors[i % ncolors], - markeredgecolor='k', markersize=7) + cluster_centers = af.cluster_centers_ + cluster_labels = af.labels_ + n_clusters = len(cluster_centers) - """ + plt.figure() + for ([x, y], label, cluster_label) in zip(X_red, labels, cluster_labels): + # for ([x,y],label, cluster_label) in sample(zip(X_red,labels, + # cluster_labels), min(size, 1000)): + x = gauss(0, 0.1) + x + y = gauss(0, 0.1) + y + plt.scatter(x, y, c=colors[cluster_label % ncolors]) + # print label + # if label in valid_labels: + # plt.text(x-0.05, y+0.01, label.split("/")[-1]) + + for i, [x, y] in enumerate(cluster_centers): + plt.plot(x, y, 'o', markerfacecolor=colors[i % ncolors], + markeredgecolor='k', markersize=7) + + """ #for prog,[x,y] in zip(valid_labels, X_red_valid): #x = gauss(0,0.1) + x #y = gauss(0,0.1) + y @@ -193,14 +205,14 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir): #plt.savefig("clusters.png") plt.show() """ - plt.savefig(train_file.replace(".gz","")+".clusters.png") + plt.savefig(train_file.replace(".gz", "") + ".clusters.png") - clustered_traces = zip(labels, cluster_labels) - writer = open_csv(train_file.replace(".gz","")+".clusters") - for label, cluster in clustered_traces: - writer.writerow([label, cluster]) + clustered_traces = zip(labels, cluster_labels) + writer = open_csv(train_file.replace(".gz", "") + ".clusters") + for label, cluster in clustered_traces: + writer.writerow([label, cluster]) - """ + """ clusters = dict() for label, cluster in clustered_traces: @@ -240,46 +252,56 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir): #plt.savefig('cluster-%d.png' % cluster) """ - #return clustered_traces + # return clustered_traces def TrainCnn(model_file, train_file, valid_file, ftype, nsamples): - csvreader = open_csv(train_file) - - train_features = [] - train_programs = [] - train_classes = [] - - train_programs, train_features, train_classes = read_traces(train_file, nsamples, cut=None) - train_size = len(train_features) - - from keras.preprocessing.text import Tokenizer - - tokenizer = 
Tokenizer(nb_words=None, filters="", lower=False, split=" ") - #print type(train_features[0]) - tokenizer.fit_on_texts(train_features) - max_features = len(tokenizer.word_counts) - - preprocessor = DeepReprPreprocessor(tokenizer, window_size, batch_size) - X_train,y_train = preprocessor.preprocess(train_features, 10000) - nb_classes = len(preprocessor.classes) - print preprocessor.classes - - model = make_cluster_cnn("train", max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes) - model.fit(X_train, y_train, validation_split=0.1, batch_size=batch_size, nb_epoch=nb_epoch, show_accuracy=True) - - model.mypreprocessor = preprocessor - #model_file = model_file + ".wei" - #modelfile = open_model(model_file) - print "Saving model to",model_file + ".wei" - model.save_weights(model_file + ".wei") - - #model_file = model_file + ".pre" - modelfile = open_model(model_file + ".pre") - print "Saving preprocessor to",model_file + ".pre" - #model.save_weights(model_file) - modelfile.write(pickle.dumps(preprocessor, protocol=2)) + csvreader = open_csv(train_file) + + train_features = [] + train_programs = [] + train_classes = [] + + train_programs, train_features, train_classes = read_traces( + train_file, nsamples, cut=None) + train_size = len(train_features) + + from keras.preprocessing.text import Tokenizer + + tokenizer = Tokenizer(nb_words=None, filters="", lower=False, split=" ") + # print type(train_features[0]) + tokenizer.fit_on_texts(train_features) + max_features = len(tokenizer.word_counts) + + preprocessor = DeepReprPreprocessor(tokenizer, window_size, batch_size) + X_train, y_train = preprocessor.preprocess(train_features, 10000) + nb_classes = len(preprocessor.classes) + print preprocessor.classes + + model = make_cluster_cnn( + "train", + max_features, + maxlen, + embedding_dims, + nb_filters, + filter_length, + hidden_dims, + nb_classes) + model.fit(X_train, y_train, validation_split=0.1, + batch_size=batch_size, nb_epoch=nb_epoch, show_accuracy=True) + + model.mypreprocessor = preprocessor + #model_file = model_file + ".wei" + #modelfile = open_model(model_file) + print "Saving model to", model_file + ".wei" + model.save_weights(model_file + ".wei") + + #model_file = model_file + ".pre" + modelfile = open_model(model_file + ".pre") + print "Saving preprocessor to", model_file + ".pre" + # model.save_weights(model_file) + modelfile.write(pickle.dumps(preprocessor, protocol=2)) """ def ClusterDoc2Vec(model_file, train_file, valid_file, ftype, nsamples, param): @@ -294,7 +316,7 @@ def ClusterDoc2Vec(model_file, train_file, valid_file, ftype, nsamples, param): print "Vectorizing traces.." sentences = [] - + for (prog,trace) in zip(train_programs,train_features): sentences.append(TaggedDocument(trace.split(" "), [prog])) @@ -333,7 +355,7 @@ def ClusterDoc2Vec(model_file, train_file, valid_file, ftype, nsamples, param): except ValueError: plt.text(x, y+0.02, cl) - #plt.show() + #plt.show() plt.savefig(train_file.replace(".gz","")+".png") from sklearn.cluster import MeanShift, estimate_bandwidth @@ -372,126 +394,135 @@ def ClusterDoc2Vec(model_file, train_file, valid_file, ftype, nsamples, param): """ -def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples, vectorizer, reducer, param): - train_programs, train_features, train_classes = read_traces(train_file, nsamples) - train_size = len(train_programs) - print "using", train_size,"examples to train." 
+def ClusterScikit( + model_file, + train_file, + valid_file, + ftype, + nsamples, + vectorizer, + reducer, + param): - if vectorizer == "bow": - - train_dict = dict() - train_dict[ftype] = train_features - #batch_size = 16 - #window_size = 20 + train_programs, train_features, train_classes = read_traces( + train_file, nsamples) + train_size = len(train_programs) + print "using", train_size, "examples to train." - print "Transforming data and fitting model.." - model = make_cluster_pipeline_bow(ftype, reducer) - X_red = model.fit_transform(train_dict) + if vectorizer == "bow": - elif vectorizer == "doc2vec": + train_dict = dict() + train_dict[ftype] = train_features + #batch_size = 16 + #window_size = 20 - from gensim.models.doc2vec import TaggedDocument - from gensim.models import Doc2Vec + print "Transforming data and fitting model.." + model = make_cluster_pipeline_bow(ftype, reducer) + X_red = model.fit_transform(train_dict) - print "Vectorizing traces.." - sentences = [] - - for (prog,trace) in zip(train_programs,train_features): - sentences.append(TaggedDocument(trace.split(" "), [prog])) + elif vectorizer == "doc2vec": - model = Doc2Vec(dm=2, min_count=1, window=5, size=100, sample=1e-4, negative=5, workers=8, iter=1) - model.build_vocab(sentences) + from gensim.models.doc2vec import TaggedDocument + from gensim.models import Doc2Vec - for epoch in range(20): - #print model - model.train(sentences) - shuffle(sentences) + print "Vectorizing traces.." + sentences = [] - train_dict = dict() + for (prog, trace) in zip(train_programs, train_features): + sentences.append(TaggedDocument(trace.split(" "), [prog])) - vec_train_features = [] - for prog in train_programs: - #print prog, model.docvecs[prog] - vec_train_features.append(model.docvecs[prog]) + model = Doc2Vec(dm=2, min_count=1, window=5, size=100, + sample=1e-4, negative=5, workers=8, iter=1) + model.build_vocab(sentences) - train_dict[ftype] = vec_train_features + for epoch in range(20): + # print model + model.train(sentences) + shuffle(sentences) - print "Transforming data and fitting model.." - model = make_cluster_pipeline_doc2vec(ftype, reducer) - X_red = model.fit_transform(train_dict) + train_dict = dict() + vec_train_features = [] + for prog in train_programs: + # print prog, model.docvecs[prog] + vec_train_features.append(model.docvecs[prog]) - #pl.rcParams.update({'font.size': 10}) - if type(X_red) == list: - X_red = np.vstack(X_red) - print X_red.shape + train_dict[ftype] = vec_train_features - if X_red.shape[1] == 2: + print "Transforming data and fitting model.." 
+ model = make_cluster_pipeline_doc2vec(ftype, reducer) + X_red = model.fit_transform(train_dict) - plt.figure() - colors = 'brgcmykbgrcmykbgrcmykbgrcmyk' - ncolors = len(colors) + #pl.rcParams.update({'font.size': 10}) + if isinstance(X_red, list): + X_red = np.vstack(X_red) + print X_red.shape - for prog,[x,y],cl in zip(train_programs, X_red, train_classes): - x = gauss(0,0.1) + x - y = gauss(0,0.1) + y - try: - plt.scatter(x, y, c=colors[int(cl)]) - plt.text(x, y+0.02, prog.split("/")[-1]) - except ValueError: - plt.text(x, y+0.02, cl) - - - - if valid_file is not None: - valid_programs, valid_features, valid_classes = read_traces(valid_file, None) - valid_dict = dict() - valid_dict[ftype] = valid_features - - X_red = model.transform(valid_dict) - for prog,[x,y],cl in zip(valid_programs, X_red, valid_classes): - x = gauss(0,0.1) + x - y = gauss(0,0.1) + y - plt.scatter(x, y, c=colors[cl+1]) - plt.text(x, y+0.02, prog.split("/")[-1]) + if X_red.shape[1] == 2: - #plt.show() - plt.savefig(train_file.replace(".gz","")+".png") + plt.figure() + colors = 'brgcmykbgrcmykbgrcmykbgrcmyk' + ncolors = len(colors) + for prog, [x, y], cl in zip(train_programs, X_red, train_classes): + x = gauss(0, 0.1) + x + y = gauss(0, 0.1) + y + try: + plt.scatter(x, y, c=colors[int(cl)]) + plt.text(x, y + 0.02, prog.split("/")[-1]) + except ValueError: + plt.text(x, y + 0.02, cl) - from sklearn.cluster import MeanShift, estimate_bandwidth + if valid_file is not None: + valid_programs, valid_features, valid_classes = read_traces( + valid_file, None) + valid_dict = dict() + valid_dict[ftype] = valid_features - bandwidth = estimate_bandwidth(X_red, quantile=0.2) - print "Clustering with bandwidth:", bandwidth + X_red = model.transform(valid_dict) + for prog, [x, y], cl in zip(valid_programs, X_red, valid_classes): + x = gauss(0, 0.1) + x + y = gauss(0, 0.1) + y + plt.scatter(x, y, c=colors[cl + 1]) + plt.text(x, y + 0.02, prog.split("/")[-1]) - af = MeanShift(bandwidth=bandwidth*param).fit(X_red) + # plt.show() + plt.savefig(train_file.replace(".gz", "") + ".png") - cluster_centers = af.cluster_centers_ - labels = af.labels_ - n_clusters_ = len(cluster_centers) + from sklearn.cluster import MeanShift, estimate_bandwidth - if X_red.shape[1] == 2: + bandwidth = estimate_bandwidth(X_red, quantile=0.2) + print "Clustering with bandwidth:", bandwidth - plt.close('all') - plt.figure(1) - plt.clf() + af = MeanShift(bandwidth=bandwidth * param).fit(X_red) - for ([x,y],label, cluster_label) in zip(X_red,train_programs, labels): - x = gauss(0,0.1) + x - y = gauss(0,0.1) + y - plt.scatter(x, y, c = colors[cluster_label % ncolors]) + cluster_centers = af.cluster_centers_ + labels = af.labels_ + n_clusters_ = len(cluster_centers) - for i,[x,y] in enumerate(cluster_centers): - plt.plot(x, y, 'o', markerfacecolor=colors[i % ncolors], - markeredgecolor='k', markersize=7) + if X_red.shape[1] == 2: - plt.title('Estimated number of clusters: %d' % n_clusters_) - plt.savefig(train_file.replace(".gz","")+".clusters.png") + plt.close('all') + plt.figure(1) + plt.clf() - #plt.show() + for ([x, y], label, cluster_label) in zip( + X_red, train_programs, labels): + x = gauss(0, 0.1) + x + y = gauss(0, 0.1) + y + plt.scatter(x, y, c=colors[cluster_label % ncolors]) - clustered_traces = zip(train_programs, labels) - writer = write_csv(train_file.replace(".gz","")+".clusters") - for label, cluster in clustered_traces: - writer.writerow([label.split("/")[-1], cluster]) + for i, [x, y] in enumerate(cluster_centers): + plt.plot(x, y, 'o', 
markerfacecolor=colors[i % ncolors], + markeredgecolor='k', markersize=7) + + plt.title('Estimated number of clusters: %d' % n_clusters_) + plt.savefig(train_file.replace(".gz", "") + ".clusters.png") + + # plt.show() + + clustered_traces = zip(train_programs, labels) + writer = write_csv(train_file.replace(".gz", "") + ".clusters") + for label, cluster in clustered_traces: + writer.writerow([label.split("/")[-1], cluster]) diff --git a/vdiscover/Detection.py b/vdiscover/Detection.py index 218f2c7..e6f7a5a 100644 --- a/vdiscover/Detection.py +++ b/vdiscover/Detection.py @@ -23,122 +23,126 @@ from Input import Arg, File + def GetCmd(s): - if os.path.exists("path.txt"): - f = open("path.txt") - x = f.readline() - return x.replace("\n","").strip(" ") - else: - return s + if os.path.exists("path.txt"): + f = open("path.txt") + x = f.readline() + return x.replace("\n", "").strip(" ") + else: + return s + def GetArg(n, conc): - if conc: - filename = "cargv_"+str(n)+".symb" - data = open(filename).read() - x = Arg(n, data) - x.SetConcrete() - else: - filename = "argv_"+str(n)+".symb" - data = open(filename).read() - x = Arg(n, data) - x.SetSymbolic() + if conc: + filename = "cargv_" + str(n) + ".symb" + data = open(filename).read() + x = Arg(n, data) + x.SetConcrete() + else: + filename = "argv_" + str(n) + ".symb" + data = open(filename).read() + x = Arg(n, data) + x.SetSymbolic() + + return x - return x def WriteTestcase(name, program, args, copy=False): - try: - os.mkdir(name) - except: - pass - - os.chdir(name) - filename = "path.txt" - open(filename,"w").write(program) - - try: - os.mkdir("inputs") - except: - pass - - os.chdir("inputs") - for i,arg in enumerate(args): - if "file:" in arg: - #print arg - arg = arg.replace("file:","") - assert(arg[0] == '/') - filename = os.path.split(arg)[-1] - #print filename - if copy: - shutil.copyfile(os.path.realpath(arg), "file_"+filename) - else: - os.symlink(os.path.realpath(arg), "file_"+filename) - arg = filename - - filename = "argv_"+str(i+1)+".symb" - open(filename,"w").write(arg) - - os.chdir("../..") - + try: + os.mkdir(name) + except: + pass + + os.chdir(name) + filename = "path.txt" + open(filename, "w").write(program) + + try: + os.mkdir("inputs") + except: + pass + + os.chdir("inputs") + for i, arg in enumerate(args): + if "file:" in arg: + # print arg + arg = arg.replace("file:", "") + assert(arg[0] == '/') + filename = os.path.split(arg)[-1] + # print filename + if copy: + shutil.copyfile(os.path.realpath(arg), "file_" + filename) + else: + os.symlink(os.path.realpath(arg), "file_" + filename) + arg = filename + + filename = "argv_" + str(i + 1) + ".symb" + open(filename, "w").write(arg) + + os.chdir("../..") + def GetArgs(): - #i = 1 - r = [] + #i = 1 + r = [] - for _,_,files in os.walk('.'): - for f in files: - #print f - for i in range(10): - #print str(i), f + for _, _, files in os.walk('.'): + for f in files: + # print f + for i in range(10): + # print str(i), f - if ("cargv_"+str(i)) in f: - x = GetArg(i, True) - if x.IsValid(): - r.append(x) + if ("cargv_" + str(i)) in f: + x = GetArg(i, True) + if x.IsValid(): + r.append(x) - break + break - elif ("argv_"+str(i)) in f: - x = GetArg(i, False) - if x.IsValid(): - r.append(x) + elif ("argv_" + str(i)) in f: + x = GetArg(i, False) + if x.IsValid(): + r.append(x) - break + break - r.sort() - #print r - for i in range(len(r)): - if r[i].i <> i+1: - r = r[0:i] - break + r.sort() + # print r + for i in range(len(r)): + if r[i].i != i + 1: + r = r[0:i] + break + + # print r + return r - 
#print r - return r def GetFile(filename, source): - #size = int(os.path.getsize(source)) - data = open(source).read() - return File(filename, data) + #size = int(os.path.getsize(source)) + data = open(source).read() + return File(filename, data) -def GetFiles(): - r = [] - stdinf = "file___dev__stdin.symb" - - for dir,_,files in os.walk('.'): - if dir == '.': - for f in files: - if (stdinf == f): - r.append(GetFile("/dev/stdin",stdinf)) - elif ("file_" in f): - filename = f.split(".symb")[0] - #filename = f.replace(".symb","") - filename = filename.split("file_")[1] - filename = filename.replace(".__", "") - x = GetFile(filename,f) - if x.IsValid(): - r.append(x) - - return r +def GetFiles(): + r = [] + stdinf = "file___dev__stdin.symb" + + for dir, _, files in os.walk('.'): + if dir == '.': + for f in files: + if (stdinf == f): + r.append(GetFile("/dev/stdin", stdinf)) + elif ("file_" in f): + filename = f.split(".symb")[0] + #filename = f.replace(".symb","") + filename = filename.split("file_")[1] + filename = filename.replace(".__", "") + x = GetFile(filename, f) + if x.IsValid(): + r.append(x) + + return r diff --git a/vdiscover/ELF.py b/vdiscover/ELF.py index 8530eaa..8833e21 100644 --- a/vdiscover/ELF.py +++ b/vdiscover/ELF.py @@ -19,7 +19,8 @@ import re import csv -import os, os.path +import os +import os.path import subprocess from Misc import parse_ldd_output, sh_string @@ -28,14 +29,16 @@ _FILE = '/usr/bin/file' _OBJDUMP = '/usr/bin/objdump' + def die(s): - print s - exit(-1) + print s + exit(-1) + def check(f): - import os - if not (os.access(f, os.X_OK) and os.path.isfile(f)): - die('Executable %s needed for readelf.py, please install binutils' % f) + import os + if not (os.access(f, os.X_OK) and os.path.isfile(f)): + die('Executable %s needed for readelf.py, please install binutils' % f) check(_READELF) check(_OBJDUMP) @@ -43,190 +46,199 @@ def check(f): realpath = os.path.dirname(os.path.realpath(__file__)) datadir = "../cache/" + def _save_cached_data(path, plt, got, base): - filename = realpath+"/"+datadir+"/"+str(path.replace("/","_")) - csvfile = open(filename+".plt", 'wb') - writer = csv.writer(csvfile, delimiter='\t') + filename = realpath + "/" + datadir + "/" + str(path.replace("/", "_")) + csvfile = open(filename + ".plt", 'wb') + writer = csv.writer(csvfile, delimiter='\t') - for (name,addr) in plt.items(): - if addr is not None: - writer.writerow((name,addr-base)) + for (name, addr) in plt.items(): + if addr is not None: + writer.writerow((name, addr - base)) - csvfile = open(filename+".got", 'wb') - writer = csv.writer(csvfile, delimiter='\t') + csvfile = open(filename + ".got", 'wb') + writer = csv.writer(csvfile, delimiter='\t') + + for (name, addr) in got.items(): + # print "got",name, addr + if addr is None: + addr = 0x0 + writer.writerow((name, addr)) - for (name,addr) in got.items(): - #print "got",name, addr - if addr is None: - addr = 0x0 - writer.writerow((name,addr)) def _load_cached_data(path, plt, got, base): - - cachedir = os.path.dirname(realpath+"/"+datadir) - if not os.path.exists(cachedir): - os.makedirs(cachedir) - - - filename = realpath+"/"+datadir+"/"+str(path.replace("/","_")) - - #print filename - try: - csvfile = open(filename+".plt", 'rb') - except IOError: - return False - #print "cached file:",filename+".plt" - - reader = csv.reader(csvfile, delimiter='\t') - - for (name,addr) in reader: - #print name, int(addr)+base - plt[name] = int(addr)+base - - try: - csvfile = open(filename+".got", 'rb') - except IOError: - return False - - reader 
= csv.reader(csvfile, delimiter='\t') - - for (name,addr) in reader: - addr = int(addr) - if addr == 0x0: - addr = None - got[name] = addr - - return True + + cachedir = os.path.dirname(realpath + "/" + datadir) + if not os.path.exists(cachedir): + os.makedirs(cachedir) + + filename = realpath + "/" + datadir + "/" + str(path.replace("/", "_")) + + # print filename + try: + csvfile = open(filename + ".plt", 'rb') + except IOError: + return False + # print "cached file:",filename+".plt" + + reader = csv.reader(csvfile, delimiter='\t') + + for (name, addr) in reader: + # print name, int(addr)+base + plt[name] = int(addr) + base + + try: + csvfile = open(filename + ".got", 'rb') + except IOError: + return False + + reader = csv.reader(csvfile, delimiter='\t') + + for (name, addr) in reader: + addr = int(addr) + if addr == 0x0: + addr = None + got[name] = addr + + return True + def plt_got(path, base): - plt, got = dict(), dict() + plt, got = dict(), dict() - if _load_cached_data(path, plt, got, base): - #print "plt",plt - #print "got",got - return plt, got + if _load_cached_data(path, plt, got, base): + # print "plt",plt + # print "got",got + return plt, got - cmd = ["env", "-i", _OBJDUMP, '-d', path] - out = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0] - got32 = '[^j]*jmp\s+\*0x(\S+)' - got64 = '[^#]*#\s+(\S+)' - lines = re.findall('([a-fA-F0-9]+)\s+<([^@<]+)@plt>:(%s|%s)' % (got32, got64), out) + cmd = ["env", "-i", _OBJDUMP, '-d', path] + out = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0] + got32 = '[^j]*jmp\s+\*0x(\S+)' + got64 = '[^#]*#\s+(\S+)' + lines = re.findall( + '([a-fA-F0-9]+)\s+<([^@<]+)@plt>:(%s|%s)' % (got32, got64), out) - for addr, name, _, gotaddr32, gotaddr64 in lines: - addr = int(addr, 16) + for addr, name, _, gotaddr32, gotaddr64 in lines: + addr = int(addr, 16) - try: - gotaddr = int(gotaddr32 or gotaddr64, 16) - except ValueError: - gotaddr = None + try: + gotaddr = int(gotaddr32 or gotaddr64, 16) + except ValueError: + gotaddr = None - plt[name] = base + addr - got[name] = gotaddr + plt[name] = base + addr + got[name] = gotaddr - #print "plt",plt - #print "got",got + # print "plt",plt + # print "got",got + + _save_cached_data(path, plt, got, base) + return plt, got - _save_cached_data(path, plt, got, base) - return plt, got def load_raw_inss(path): - cmd = ["env", "-i", _OBJDUMP, '-d', '-j', ".text", path] - raw_instructions = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0] - #lines = re.findall('([a-fA-F0-9]+)\s+((<([^@<]+)@plt>)|%s)' % "|".join(inss), raw_instructions) - #lines = re.findall('$', raw_instructions) - return raw_instructions + cmd = ["env", "-i", _OBJDUMP, '-d', '-j', ".text", path] + raw_instructions = subprocess.Popen( + cmd, stdout=subprocess.PIPE).communicate()[0] + #lines = re.findall('([a-fA-F0-9]+)\s+((<([^@<]+)@plt>)|%s)' % "|".join(inss), raw_instructions) + #lines = re.findall('$', raw_instructions) + return raw_instructions + def entrypoint(path): cmd = ["env", "-i", _READELF, '-hWS', path] out = subprocess.check_output(cmd) #elfclass = re.findall('Class:\s*(.*$)', out, re.MULTILINE)[0] - entrypoint = int(re.findall('Entry point address:\s*(.*$)', out, re.MULTILINE)[0], 16) - #print out - #print hex(entrypoint) + entrypoint = int(re.findall( + 'Entry point address:\s*(.*$)', out, re.MULTILINE)[0], 16) + # print out + # print hex(entrypoint) if "DYN (Shared object file)" in out: - entrypoint = entrypoint + 0x80000000 + entrypoint = entrypoint + 0x80000000 return entrypoint + def 
no_frame_pointer(path): cmd = ["env", "-i", _READELF, '-hWS', path] out = subprocess.check_output(cmd) - #print out + # print out #elfclass = re.findall('Class:\s*(.*$)', out, re.MULTILINE)[0] out = out.split('.eh_frame PROGBITS ')[1] out = out.split(" ")[2] - return (int(out,16) > 4) + return (int(out, 16) > 4) -def file_type(path): - cmd = [_FILE, os.path.realpath(path)] - try: - out = subprocess.check_output(cmd) - except subprocess.CalledProcessError: - return "" +def file_type(path): + cmd = [_FILE, os.path.realpath(path)] - if "ELF 32-bit" in out: - return "ELF 32-bit" - elif "ELF 64-bit" in out: - return "ELF 64-bit" - else: - return None + try: + out = subprocess.check_output(cmd) + except subprocess.CalledProcessError: + return "" -class ELF: - '''A parsed ELF file''' - cachedir = "cache" - - def __init__(self, path, plt = True, base = 0x0): - #print path, plt - self.path = str(path) - self.base = base - self.sections = dict() - self.filetype = file_type(path) - - if self.filetype == "": - print "The executable at",path,"cannot be found" - exit(-1) - - elif self.filetype is None: - print "The executable at",path,"is not a valid ELF file" - exit(-1) - - self.entrypoint = entrypoint(path) - #print hex(self.entrypoint) - self.no_frame_pointer = no_frame_pointer(path) - #self._load_sections() - - if plt: - self.plt, self.got = plt_got(self.path, self.base) + if "ELF 32-bit" in out: + return "ELF 32-bit" + elif "ELF 64-bit" in out: + return "ELF 64-bit" else: - self.plt, self.got = dict(), dict() - self.name2addr = self.plt - self.addr2name = dict() - - for (name, addr) in self.name2addr.items(): - self.addr2name[addr] = name - - self.name2func = self.got - self.func2name = dict() - - for (name, addr) in self.name2func.items(): - self.func2name[addr] = name + return None - def _populate_libraries_ldd(self): - """ - from pwntools +class ELF: + '''A parsed ELF file''' + cachedir = "cache" + + def __init__(self, path, plt=True, base=0x0): + # print path, plt + self.path = str(path) + self.base = base + self.sections = dict() + self.filetype = file_type(path) + + if self.filetype == "": + print "The executable at", path, "cannot be found" + exit(-1) + + elif self.filetype is None: + print "The executable at", path, "is not a valid ELF file" + exit(-1) + + self.entrypoint = entrypoint(path) + # print hex(self.entrypoint) + self.no_frame_pointer = no_frame_pointer(path) + # self._load_sections() + + if plt: + self.plt, self.got = plt_got(self.path, self.base) + else: + self.plt, self.got = dict(), dict() + self.name2addr = self.plt + self.addr2name = dict() + + for (name, addr) in self.name2addr.items(): + self.addr2name[addr] = name + + self.name2func = self.got + self.func2name = dict() + + for (name, addr) in self.name2func.items(): + self.func2name[addr] = name + + def _populate_libraries_ldd(self): + """ + from pwntools + """ + try: + cmd = '(ulimit -s unlimited; ldd %s > /dev/null && (LD_TRACE_LOADED_OBJECTS=1 %s || ldd %s)) 2>/dev/null' + arg = sh_string(self.path) + data = subprocess.check_output(cmd % (arg, arg, arg), shell=True) + self._libs = parse_ldd_output(data) + except subprocess.CalledProcessError: + self._libs = {} """ - try: - cmd = '(ulimit -s unlimited; ldd %s > /dev/null && (LD_TRACE_LOADED_OBJECTS=1 %s || ldd %s)) 2>/dev/null' - arg = sh_string(self.path) - data = subprocess.check_output(cmd % (arg, arg, arg), shell = True) - self._libs = parse_ldd_output(data) - except subprocess.CalledProcessError: - self._libs = {} - """ def _load_sections(self): # -W : Wide output 
# -S : Section headers @@ -247,45 +259,46 @@ def _load_sections(self): 'flags' : flgs, } """ - def GetEntrypoint(self): - return self.entrypoint - def GetFunctions(self): - return self.name2func.keys() + def GetEntrypoint(self): + return self.entrypoint - def GetModname(self): - return str(self.path) + def GetFunctions(self): + return self.name2func.keys() - def FindFuncInPlt(self, name): + def GetModname(self): + return str(self.path) - if name in self.name2addr: - return self.name2addr[name] - else: - return None + def FindFuncInPlt(self, name): - def FindAddrInPlt(self, addr): - #print addr - if addr in self.addr2name: - return self.addr2name[addr] - else: - return None + if name in self.name2addr: + return self.name2addr[name] + else: + return None - def FindFuncInGot(self, name): + def FindAddrInPlt(self, addr): + # print addr + if addr in self.addr2name: + return self.addr2name[addr] + else: + return None - if name in self.name2addr: - return self.name2func[name] - else: - return None + def FindFuncInGot(self, name): - def FindAddrInGot(self, addr): - #print addr - if addr in self.addr2name: - return self.func2name[addr] - else: - return None + if name in self.name2addr: + return self.name2func[name] + else: + return None + + def FindAddrInGot(self, addr): + # print addr + if addr in self.addr2name: + return self.func2name[addr] + else: + return None - def GetType(self): - return str(self.filetype) + def GetType(self): + return str(self.filetype) - def GetRawInss(self): - return load_raw_inss(self.path) + def GetRawInss(self): + return load_raw_inss(self.path) diff --git a/vdiscover/Event.py b/vdiscover/Event.py index dd33ea2..38a51e5 100644 --- a/vdiscover/Event.py +++ b/vdiscover/Event.py @@ -27,233 +27,255 @@ #from distorm import Decode, Decode32Bits + class Event: - module = None - def __init__(self): - pass + module = None -class Call(Event): + def __init__(self): + pass - def __init__(self, name, module): - assert(name in specs) - spec = specs[name] - self.ret = str(spec[0]) - #fixme: void functions and non-returned values should be different! - self.retvalue = (Type("Top32",4),None) - self.module = module - self.name = str(name) - self.param_types = list(spec[1:]) - self.param_ptypes = [] - self.param_values = [] +class Call(Event): - def __str__(self): - return str(self.name) + def __init__(self, name, module): - #def _detect_return_address(self): - # addr = self.process.getreg("esp") - # bytes = self.process.readBytes(addr, 4) - # return RefinePType(Type("Ptr32",4),bytes2word(bytes), self.process, self.mm) - # #return bytes2word(bytes) + assert(name in specs) + spec = specs[name] + self.ret = str(spec[0]) + # fixme: void functions and non-returned values should be different! 
+ self.retvalue = (Type("Top32", 4), None) + self.module = module + self.name = str(name) + self.param_types = list(spec[1:]) + self.param_ptypes = [] + self.param_values = [] - def _detect_parameter_x86_64(self, ptype, index): + def __str__(self): + return str(self.name) - if index > 4: - return None + # def _detect_return_address(self): + # addr = self.process.getreg("esp") + # bytes = self.process.readBytes(addr, 4) + # return RefinePType(Type("Ptr32",4),bytes2word(bytes), self.process, self.mm) + # #return bytes2word(bytes) - reg = ["rdi","rsi","rdx","rcx","r8"][index] - val = self.process.getreg(reg) + def _detect_parameter_x86_64(self, ptype, index): - #print "bs value", repr(bs), hex(bytes2word(bs)) + if index > 4: + return None - return RefinePType(GetPtype(ptype),val, self.process, self.mm) + reg = ["rdi", "rsi", "rdx", "rcx", "r8"][index] + val = self.process.getreg(reg) + # print "bs value", repr(bs), hex(bytes2word(bs)) + return RefinePType(GetPtype(ptype), val, self.process, self.mm) + def _detect_parameter_x86(self, ptype, offset): + addr = self.process.getStackPointer() + offset + bs = self.process.readBytes(addr, 4) - def _detect_parameter_x86(self, ptype, offset): - addr = self.process.getStackPointer()+offset - bs = self.process.readBytes(addr, 4) + # if CPU_X86_64: + # bs = bs + (4*'\00') - #if CPU_X86_64: - # bs = bs + (4*'\00') + # print "bs value", repr(bs), hex(bytes2word(bs)) - #print "bs value", repr(bs), hex(bytes2word(bs)) + return RefinePType( + GetPtype(ptype), + bytes2word(bs), + self.process, + self.mm) - return RefinePType(GetPtype(ptype),bytes2word(bs), self.process, self.mm) + def get_return_address(self): + return self.retaddr[1] - def get_return_address(self): - return self.retaddr[1] + def detect_parameters(self, process, mm): + self.process = process + self.mm = mm + self.retaddr = None + # print "ret_addr:", str(self.retaddr[0]), hex(self.retaddr[1]) - def detect_parameters(self, process, mm): - self.process = process - self.mm = mm - self.retaddr = None - #print "ret_addr:", str(self.retaddr[0]), hex(self.retaddr[1]) + offset = 4 + # print self.mm + # print self.name + for index, ctype in enumerate(self.param_types): - offset = 4 - #print self.mm - #print self.name - for index,ctype in enumerate(self.param_types): + if CPU_X86_64: + (ptype, value) = self._detect_parameter_x86_64(ctype, index) + else: + (ptype, value) = self._detect_parameter_x86(ctype, offset) - if CPU_X86_64: - (ptype, value) = self._detect_parameter_x86_64(ctype, index) - else: - (ptype, value) = self._detect_parameter_x86(ctype, offset) + self.param_values.append(value) + self.param_ptypes.append(ptype) + offset += ptype.getSize() + #print (str(ptype), hex(value)) - self.param_values.append(value) - self.param_ptypes.append(ptype) - offset += ptype.getSize() - #print (str(ptype), hex(value)) + # def DetectReturnValue(self, process): + # self.process = process + # self.retvalue = RefinePType(GetPtype(self.ret),process.getreg("eax"), self.process, self.mm) - #def DetectReturnValue(self, process): - # self.process = process - # self.retvalue = RefinePType(GetPtype(self.ret),process.getreg("eax"), self.process, self.mm) + def GetTypedName(self): + return (str(self), list(self.param_ptypes)) - def GetTypedName(self): - return (str(self), list(self.param_ptypes)) class Signal(Event): - def __init__(self, name, process, mm): - self.fields = dict() - _sifields = process.getsiginfo()._sifields + def __init__(self, name, process, mm): + + self.fields = dict() + _sifields = 
process.getsiginfo()._sifields - self.name = name + self.name = name - if hasattr(_sifields, "_sigfault") and self.name == "SIGSEGV": - self.fields["addr"] = RefinePType(Type("Ptr32",4), _sifields._sigfault._addr, process, mm) - #print "sigfault @", _sifields._sigfault._addr + if hasattr(_sifields, "_sigfault") and self.name == "SIGSEGV": + self.fields["addr"] = RefinePType( + Type("Ptr32", 4), _sifields._sigfault._addr, process, mm) + # print "sigfault @", _sifields._sigfault._addr - def __str__(self): - return str(self.name) + def __str__(self): + return str(self.name) - def GetTypedName(self): + def GetTypedName(self): - if len(self.fields) > 0: - ptypes = map(lambda (x,_): x, self.fields.values()) - return (str(self.name), ptypes) - else: - return (str(self.name), ["()"]) + if len(self.fields) > 0: + ptypes = map(lambda x__: x__[0], self.fields.values()) + return (str(self.name), ptypes) + else: + return (str(self.name), ["()"]) class Syscall(Event): - def __init__(self, name): - self.name = name - def __str__(self): - return str(self.name) + def __init__(self, name): + self.name = name + + def __str__(self): + return str(self.name) + + def GetTypedName(self): + return ("Syscall", [str(self.name)]) - def GetTypedName(self): - return ("Syscall", [str(self.name)]) class Exit(Event): - def __init__(self, code): - self.code = code - self.name = "Exit with "+str(code) - def __str__(self): - return str(self.name) + def __init__(self, code): + self.code = code + self.name = "Exit with " + str(code) + + def __str__(self): + return str(self.name) + + def GetTypedName(self): + return ("exited", str(self.code)) - def GetTypedName(self): - return ("exited", str(self.code)) class Abort(Event): - def __init__(self, process, mm): - self.name = "Abort" - ip = process.getInstrPointer() - self.bt = process.getBacktrace(max_args=0, max_depth=20) - self.module = FindModule(ip,mm) - #print self.bt, type(self.bt) - frames = [] + def __init__(self, process, mm): + self.name = "Abort" + ip = process.getInstrPointer() - if CPU_X86_64: - pass # detection of stack frame disabled, python-ptrace does not support ... - if CPU_I386: + self.bt = process.getBacktrace(max_args=0, max_depth=20) + self.module = FindModule(ip, mm) + # print self.bt, type(self.bt) + frames = [] - for i,frame in enumerate(self.bt.frames): - r_type = RefinePType(Type("Ptr32",4), frame.ip, process, mm) - frames.append(r_type) + if CPU_X86_64: + # detection of stack frame disabled, python-ptrace does not support + # ... 
+ pass + if CPU_I386: - if str(r_type[0]) == "DPtr32": - break + for i, frame in enumerate(self.bt.frames): + r_type = RefinePType(Type("Ptr32", 4), frame.ip, process, mm) + frames.append(r_type) - self.bt.frames = frames - #print "frames",frames - #print "self.bt.frames", self.bt.frames + if str(r_type[0]) == "DPtr32": + break - self.eip = RefinePType(Type("Ptr32",4), ip, process, mm) + self.bt.frames = frames + # print "frames",frames + # print "self.bt.frames", self.bt.frames - def __str__(self): - return str(self.name) + self.eip = RefinePType(Type("Ptr32", 4), ip, process, mm) + + def __str__(self): + return str(self.name) + + def GetTypedName(self): + return ("abort", [self.eip[0]]) - def GetTypedName(self): - return ("abort", [self.eip[0]]) class Timeout(Event): - def __init__(self, secs): - self.secs = secs - self.name = "Timeout "+str(secs)+" secs" - def __str__(self): - return str(self.name) + def __init__(self, secs): + self.secs = secs + self.name = "Timeout " + str(secs) + " secs" + + def __str__(self): + return str(self.name) + + def GetTypedName(self): + return ("timeouted", ["()"]) - def GetTypedName(self): - return ("timeouted", ["()"]) class Crash(Event): - def __init__(self, process, mm): - ip = process.getInstrPointer() - fp = process.getFramePointer() + def __init__(self, process, mm): + ip = process.getInstrPointer() + fp = process.getFramePointer() - self.module = FindModule(ip,mm) + self.module = FindModule(ip, mm) - self.fp_type = RefinePType(Type("Ptr32",4), fp, process, mm) - #print "fp:",hex(fp_type[1]), str(fp_type[0]) - if not process.no_frame_pointer: #str(self.fp_type[0]) == "SPtr32": - self.bt = getBacktrace(process,max_args=0, max_depth=20) - else: - self.bt = Backtrace() - frames = [] + self.fp_type = RefinePType(Type("Ptr32", 4), fp, process, mm) + # print "fp:",hex(fp_type[1]), str(fp_type[0]) + if not process.no_frame_pointer: # str(self.fp_type[0]) == "SPtr32": + self.bt = getBacktrace(process, max_args=0, max_depth=20) + else: + self.bt = Backtrace() + frames = [] - if CPU_X86_64: - pass # detection of stack frame disabled, python-ptrace does not support ... - if CPU_I386: + if CPU_X86_64: + # detection of stack frame disabled, python-ptrace does not support + # ... 
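A detail that is easy to miss in this hunk: Crash only walks saved stack frames when process.no_frame_pointer is false, presumably the flag computed by the .eh_frame heuristic in ELF.py, which seems intended to spot binaries built without frame pointers; otherwise it settles for an empty Backtrace(), since chasing saved frame pointers in such a build would only yield bogus frames. The i386 branch below then refines each frame address with RefinePType and stops at the first one no longer classified as GxPtr32.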
+ pass + if CPU_I386: - for i,frame in enumerate(self.bt.frames): - print "frame",frame, hex(frame.ip) - r_type = RefinePType(Type("Ptr32",4), frame.ip, process, mm) - frames.append(r_type) - #print "ip:", str(r_type[0]) - if not (str(r_type[0]) == "GxPtr32"): - break + for i, frame in enumerate(self.bt.frames): + print "frame", frame, hex(frame.ip) + r_type = RefinePType(Type("Ptr32", 4), frame.ip, process, mm) + frames.append(r_type) + # print "ip:", str(r_type[0]) + if not (str(r_type[0]) == "GxPtr32"): + break - self.bt.frames = frames - self.eip_type = RefinePType(Type("Ptr32",4), process.getInstrPointer(), process, mm) + self.bt.frames = frames + self.eip_type = RefinePType( + Type("Ptr32", 4), process.getInstrPointer(), process, mm) - def __str__(self): - return "Crash@"+hex(self.eip_type[1])+":"+str(self.eip_type[0]) + def __str__(self): + return "Crash@" + hex(self.eip_type[1]) + ":" + str(self.eip_type[0]) - def GetTypedName(self): - return ("crashed", [self.eip_type[0]]) + def GetTypedName(self): + return ("crashed", [self.eip_type[0]]) class Vulnerability(Event): - def __init__(self, vtype): - self.type = str(vtype) - self.name = "Vulnerability "+str(vtype)+" detected" - def __str__(self): - return str(self.name) + def __init__(self, vtype): + self.type = str(vtype) + self.name = "Vulnerability " + str(vtype) + " detected" + + def __str__(self): + return str(self.name) + + def GetTypedName(self): + return ("Vulnerability", [str(self.type)]) - def GetTypedName(self): - return ("Vulnerability",[str(self.type)]) def hash_events(events): - return hash(tuple(map(str, events))) + return hash(tuple(map(str, events))) -def IsTimeout(event): - return isinstance(event, Timeout) +def IsTimeout(event): + return isinstance(event, Timeout) diff --git a/vdiscover/Input.py b/vdiscover/Input.py index 5a62b73..80e5bd7 100644 --- a/vdiscover/Input.py +++ b/vdiscover/Input.py @@ -19,79 +19,83 @@ import copy + def prepare_inputs(inputs): - r = [] - for input in inputs: - arg = input.PrepareData() - if not (arg is None): - r.append(arg) + r = [] + for input in inputs: + arg = input.PrepareData() + if not (arg is None): + r.append(arg) + + return r - return r class Input: - data = None - concrete = False + data = None + concrete = False + + def __init__(self): + pass - def __init__(self): - pass + def __len__(self): + return len(self.data) - def __len__(self): - return len(self.data) + def copy(self): + # print "data:",self.data + return copy.copy(self) - def copy(self): - #print "data:",self.data - return copy.copy(self) + def isSymbolic(self): + return not self.concrete - def isSymbolic(self): - return not self.concrete + def isConcrete(self): + return self.concrete - def isConcrete(self): - return self.concrete + def SetSymbolic(self): + self.concrete = False - def SetSymbolic(self): - self.concrete = False + def SetConcrete(self): + self.concrete = True - def SetConcrete(self): - self.concrete = True class Arg(Input): - def __init__(self, i, data): - self.i = i - self.data = str(data) - if ("\0" in data): - self.data = self.data.split("\0")[0] + def __init__(self, i, data): + self.i = i - self.size = len(self.data) + self.data = str(data) + if ("\0" in data): + self.data = self.data.split("\0")[0] - def __str__(self): - return "Arg("+str(self.i)+") = "+repr(self.data) + self.size = len(self.data) - def GetData(self): - return str(self.data) + def __str__(self): + return "Arg(" + str(self.i) + ") = " + repr(self.data) - def GetSize(self): - return len(self.data) + def GetData(self): + return 
str(self.data) - def PrepareData(self): + def GetSize(self): + return len(self.data) - return self.GetData() + def PrepareData(self): - def IsValid(self): - return self.size > 0 + return self.GetData() - def __cmp__(self, arg): - return cmp(self.i, arg.i) + def IsValid(self): + return self.size > 0 - def GetName(self): - if self.concrete: - return "cargv_"+str(self.i) - else: - return "argv_"+str(self.i) + def __cmp__(self, arg): + return cmp(self.i, arg.i) - def GetType(self): - return "arg" + def GetName(self): + if self.concrete: + return "cargv_" + str(self.i) + else: + return "argv_" + str(self.i) + + def GetType(self): + return "arg" # class Env(Input): @@ -130,43 +134,44 @@ def GetType(self): # return "env" class File(Input): - def __init__(self, filename, data): - self.filename = str(filename) - self.data = str(data) - self.size = len(data) - def __str__(self): - return "file("+str(self.filename)+") = "+repr(self.data) + def __init__(self, filename, data): + self.filename = str(filename) + self.data = str(data) + self.size = len(data) + + def __str__(self): + return "file(" + str(self.filename) + ") = " + repr(self.data) - def GetData(self): - return str(self.data) + def GetData(self): + return str(self.data) - def GetSize(self): - return len(self.data) + def GetSize(self): + return len(self.data) - def PrepareData(self): - if self.filename == "/dev/stdin": - with open("Stdin", 'w') as f: - f.write(self.data) + def PrepareData(self): + if self.filename == "/dev/stdin": + with open("Stdin", 'w') as f: + f.write(self.data) - return "< Stdin" - else: - with open(self.filename, 'w') as f: - f.write(self.data) + return "< Stdin" + else: + with open(self.filename, 'w') as f: + f.write(self.data) - return None + return None - def IsValid(self): - return True + def IsValid(self): + return True # def copy(self): # return File(self.filename, self.data) - def GetName(self): - return "file_"+self.filename.replace("/", "__") + def GetName(self): + return "file_" + self.filename.replace("/", "__") - def GetFilename(self): - return str(self.filename) + def GetFilename(self): + return str(self.filename) - def GetType(self): - return "file" + def GetType(self): + return "file" diff --git a/vdiscover/MemoryMap.py b/vdiscover/MemoryMap.py index 4625e94..fa1d4bc 100644 --- a/vdiscover/MemoryMap.py +++ b/vdiscover/MemoryMap.py @@ -17,94 +17,96 @@ Copyright 2014 by G.Grieco """ -class MemoryMaps: - def __init__(self, path, pid): - self.path = str(path) - self.pid = pid - self.update() - - def update(self): - - self.mm = dict() - self.atts = dict() - - for line in open('/proc/'+str(self.pid)+'/maps'): - line = line.replace("\n", "") - #print line - x = line.split(" ") - - mrange = x[0].split("-") - mrange = map(lambda s: int(s, 16), mrange) - #print tuple(mrange) - - self.mm[tuple(mrange)] = x[-1] - self.atts[tuple(mrange)] = x[1] - - def isStackPtr(self, ptr): - for (mrange,zone) in self.mm.items(): - if ptr >= mrange[0] and ptr < mrange[1]: - return zone == "[stack]" - return False - - def isHeapPtr(self, ptr): - for (mrange,zone) in self.mm.items(): - if ptr >= mrange[0] and ptr < mrange[1]: - return zone == "[heap]" - return False - - def isCodePtr(self, ptr): - for (mrange,zone) in self.mm.items(): - if ptr >= mrange[0] and ptr < mrange[1] and 'x' in self.atts[mrange]: - return True - return False - - def isLibPtr(self, ptr): - for (mrange,zone) in self.mm.items(): - if ptr >= mrange[0] and ptr < mrange[1]: - return "/lib/" in zone - return False - - def isGlobalPtr(self, ptr): - for (mrange,zone) 
in self.mm.items(): - if ptr >= mrange[0] and ptr < mrange[1]: - return zone == self.path - return False - - def isFilePtr(self, ptr): - for (mrange,zone) in self.mm.items(): - if ptr >= mrange[0] and ptr < mrange[1]: - return zone == "" - return False - - def checkPtr(self, ptr, update=True): - for (mrange,zone) in self.mm.items(): - if ptr >= mrange[0] and ptr < mrange[1]: - return True - - if update: - self.update() - else: - return False - - return self.checkPtr(ptr, update=False) - - def findModule(self, ptr): - for (mrange,zone) in self.mm.items(): - if ptr >= mrange[0] and ptr < mrange[1]: - return str(zone) - return None - - def __str__(self): - r = "" - for (mrange,zone) in self.mm.items(): - r = r + hex(mrange[0])+" - "+hex(mrange[1])+" -> "+zone+"\n" - return r - - def items(self): - r = [] - for (x,y) in self.mm.items(): - r.append((x,y,self.atts[x])) - - return r +class MemoryMaps: + def __init__(self, path, pid): + self.path = str(path) + self.pid = pid + self.update() + + def update(self): + + self.mm = dict() + self.atts = dict() + + for line in open('/proc/' + str(self.pid) + '/maps'): + line = line.replace("\n", "") + # print line + x = line.split(" ") + + mrange = x[0].split("-") + mrange = map(lambda s: int(s, 16), mrange) + # print tuple(mrange) + + self.mm[tuple(mrange)] = x[-1] + self.atts[tuple(mrange)] = x[1] + + def isStackPtr(self, ptr): + for (mrange, zone) in self.mm.items(): + if ptr >= mrange[0] and ptr < mrange[1]: + return zone == "[stack]" + return False + + def isHeapPtr(self, ptr): + for (mrange, zone) in self.mm.items(): + if ptr >= mrange[0] and ptr < mrange[1]: + return zone == "[heap]" + return False + + def isCodePtr(self, ptr): + for (mrange, zone) in self.mm.items(): + if ptr >= mrange[0] and ptr < mrange[ + 1] and 'x' in self.atts[mrange]: + return True + return False + + def isLibPtr(self, ptr): + for (mrange, zone) in self.mm.items(): + if ptr >= mrange[0] and ptr < mrange[1]: + return "/lib/" in zone + return False + + def isGlobalPtr(self, ptr): + for (mrange, zone) in self.mm.items(): + if ptr >= mrange[0] and ptr < mrange[1]: + return zone == self.path + return False + + def isFilePtr(self, ptr): + for (mrange, zone) in self.mm.items(): + if ptr >= mrange[0] and ptr < mrange[1]: + return zone == "" + return False + + def checkPtr(self, ptr, update=True): + for (mrange, zone) in self.mm.items(): + if ptr >= mrange[0] and ptr < mrange[1]: + return True + + if update: + self.update() + else: + return False + + return self.checkPtr(ptr, update=False) + + def findModule(self, ptr): + for (mrange, zone) in self.mm.items(): + if ptr >= mrange[0] and ptr < mrange[1]: + return str(zone) + return None + + def __str__(self): + r = "" + for (mrange, zone) in self.mm.items(): + r = r + hex(mrange[0]) + " - " + \ + hex(mrange[1]) + " -> " + zone + "\n" + return r + + def items(self): + r = [] + for (x, y) in self.mm.items(): + r.append((x, y, self.atts[x])) + + return r diff --git a/vdiscover/Misc.py b/vdiscover/Misc.py index e2de3ce..0784701 100644 --- a/vdiscover/Misc.py +++ b/vdiscover/Misc.py @@ -17,19 +17,27 @@ Copyright 2014 by G.Grieco """ -import socket, re, os, stat, errno, string, base64 +import socket +import re +import os +import stat +import errno +import string +import base64 + def readmodfile(modfile): - hooked_mods = [] - if modfile is not None: - hooked_mods = open(modfile).read().split("\n") - hooked_mods = filter(lambda x: x <> '', hooked_mods) - return hooked_mods + hooked_mods = [] + if modfile is not None: + hooked_mods = 
open(modfile).read().split("\n") + hooked_mods = filter(lambda x: x != '', hooked_mods) + return hooked_mods """ from pwntools """ + def parse_ldd_output(output): """Parses the output from a run of 'ldd' on a binary. Returns a dictionary of {path: address} for @@ -48,8 +56,9 @@ def parse_ldd_output(output): ... ''').keys()) ['/lib/x86_64-linux-gnu/libc.so.6', '/lib/x86_64-linux-gnu/libdl.so.2', '/lib/x86_64-linux-gnu/libtinfo.so.5', '/lib64/ld-linux-x86-64.so.2'] """ - expr_linux = re.compile(r'\s(?P\S?/\S+)\s+\((?P0x.+)\)') - expr_openbsd = re.compile(r'^\s+(?P[0-9a-f]+)\s+[0-9a-f]+\s+\S+\s+[01]\s+[0-9]+\s+[0-9]+\s+(?P\S+)$') + expr_linux = re.compile(r'\s(?P\S?/\S+)\s+\((?P0x.+)\)') + expr_openbsd = re.compile( + r'^\s+(?P[0-9a-f]+)\s+[0-9a-f]+\s+\S+\s+[01]\s+[0-9]+\s+[0-9]+\s+(?P\S+)$') libs = {} for s in output.split('\n'): @@ -88,8 +97,8 @@ def sh_string(s): """ very_good = set(string.ascii_letters + string.digits) - good = (very_good | set(string.punctuation + ' ')) - set("'") - alt_good = (very_good | set(string.punctuation + ' ')) - set('!') + good = (very_good | set(string.punctuation + ' ')) - set("'") + alt_good = (very_good | set(string.punctuation + ' ')) - set('!') if '\x00' in s: log.error("sh_string(): Cannot create a null-byte") @@ -115,7 +124,5 @@ def sh_string(s): fixed += c else: fixed += '\\x%02x' % ord(c) - return '"$( (echo %s|(base64 -d||openssl enc -d -base64)||echo -en \'%s\') 2>/dev/null)"' % (base64.b64encode(s), fixed) - - - + return '"$( (echo %s|(base64 -d||openssl enc -d -base64)||echo -en \'%s\') 2>/dev/null)"' % ( + base64.b64encode(s), fixed) diff --git a/vdiscover/Mutation.py b/vdiscover/Mutation.py index e32c0e1..6cb352b 100644 --- a/vdiscover/Mutation.py +++ b/vdiscover/Mutation.py @@ -24,217 +24,238 @@ import Input -def opened_files(program, args, files, timeout=5): - # check if the testcase is opened - output = Popen(["timeout","-k","1",str(timeout), "strace","-e","open",program]+args, stdout=PIPE, stderr=PIPE, stdin=PIPE, env=dict()).communicate() +def opened_files(program, args, files, timeout=5): - for mfile in files: - filename = mfile.filename - #print "checking",filename - if 'open("'+filename in output[1]: - return True + # check if the testcase is opened + output = Popen(["timeout", + "-k", + "1", + str(timeout), + "strace", + "-e", + "open", + program] + args, + stdout=PIPE, + stderr=PIPE, + stdin=PIPE, + env=dict()).communicate() + + for mfile in files: + filename = mfile.filename + # print "checking",filename + if 'open("' + filename in output[1]: + return True + + return False + # print output - return False - #print output def fuzz_cmd(prepared_inputs, fuzzer_cmd, seed): - p = Popen(fuzzer_cmd.split(" ")+[str(seed)], stdout=PIPE, stdin=PIPE, stderr=PIPE) - mutated_input = p.communicate(input=prepared_inputs)[0] - return mutated_input.replace("\0","")[:32767] + p = Popen(fuzzer_cmd.split(" ") + [str(seed)], + stdout=PIPE, stdin=PIPE, stderr=PIPE) + mutated_input = p.communicate(input=prepared_inputs)[0] + return mutated_input.replace("\0", "")[:32767] class DeltaMutation(object): - def __init__(self, inp, atts): - self.inp_type = str(inp.GetType()) - #self.mut_type = str(typ) - self.atts = copy.copy(atts) - def __str__(self): + def __init__(self, inp, atts): + self.inp_type = str(inp.GetType()) + #self.mut_type = str(typ) + self.atts = copy.copy(atts) + + def __str__(self): - r = ["input="+self.inp_type, "type="+self.mut_type] - r = r + map(lambda (a,b): a+"="+str(b),self.atts.items()) - return " ".join(r) + r = ["input=" + self.inp_type, 
"type=" + self.mut_type] + r = r + map(lambda a_b: a_b[0] + "=" + str(a_b[1]), self.atts.items()) + return " ".join(r) class NullDeltaMutation(DeltaMutation): - def __init__(self): - #pass - #DeltaMutation.__init__(inp, atts) - #super(self.__class__, self).__init__(inp, atts) - self.mut_type = "null" + def __init__(self): + # pass + #DeltaMutation.__init__(inp, atts) + #super(self.__class__, self).__init__(inp, atts) + self.mut_type = "null" - def __str__(self): - r = ["type="+self.mut_type] - return " ".join(r) + def __str__(self): + r = ["type=" + self.mut_type] + return " ".join(r) - def inv(self): - pass + def inv(self): + pass class OneByteDeltaMutation(DeltaMutation): - def __init__(self, inp, atts): - #DeltaMutation.__init__(inp, atts) - super(self.__class__, self).__init__(inp, atts) - self.mut_type = "mod" + def __init__(self, inp, atts): + #DeltaMutation.__init__(inp, atts) + super(self.__class__, self).__init__(inp, atts) + self.mut_type = "mod" - def inv(self): - t = self.atts["new"] - self.atts["new"] = self.atts["old"] - self.atts["old"] = t + def inv(self): + t = self.atts["new"] + self.atts["new"] = self.atts["old"] + self.atts["old"] = t class ByteExtensionDeltaMutation(DeltaMutation): - def __init__(self, inp, atts): - super(self.__class__, self).__init__(inp, atts) - self.mut_type = "ext" + def __init__(self, inp, atts): + super(self.__class__, self).__init__(inp, atts) + self.mut_type = "ext" - def inv(self): - self.mut_type = "con" - t = self.atts["new"] - self.atts["new"] = self.atts["old"] - self.atts["old"] = t + def inv(self): + self.mut_type = "con" + t = self.atts["new"] + self.atts["new"] = self.atts["old"] + self.atts["old"] = t class Mutator: - def __init__(self, input): - self.i = 0 - self.input = input.copy() - self.input_len = len(input) - if isinstance(input, Input.Arg): - self.array = map(chr, range(1, 256)) - elif isinstance(input, Input.File): - self.array = map(chr, range(0, 256)) + def __init__(self, input): + self.i = 0 + self.input = input.copy() + self.input_len = len(input) + + if isinstance(input, Input.Arg): + self.array = map(chr, range(1, 256)) + elif isinstance(input, Input.File): + self.array = map(chr, range(0, 256)) + + self.array_len = len(self.array) - self.array_len = len(self.array) + # def GetDelta(self): - #def GetDelta(self): + def Mutate(self): + assert(0) - def Mutate(self): - assert(0) - def GetData(self): - return None - def GetDelta(self): - assert(0) + def GetData(self): + return None + + def GetDelta(self): + assert(0) class RandomExpanderMutator(Mutator): - max_expansion = 10000 + max_expansion = 10000 + + def __iter__(self): + return self + + def next(self): + + assert(self.input_len > 0) - def __iter__(self): - return self + input = self.input.copy() + delta = str(self.input.GetType()) + " " - def next(self): + # expansion mutation + i = random.randrange(self.input_len) + j = random.randrange(self.max_expansion) + m = self.array[random.randrange(self.array_len)] - assert(self.input_len > 0) + # print self.array[rand] + input.data = input.data[:i] + m * j + input.data[i + 1:] - input = self.input.copy() - delta = str(self.input.GetType())+" " - - # expansion mutation - i = random.randrange(self.input_len) - j = random.randrange(self.max_expansion) - m = self.array[random.randrange(self.array_len)] + rpos = int(i / (float(self.input_len)) * 100.0) + rsize = j / 100 * 100 + self.delta = ByteExtensionDeltaMutation(input, dict( + pos=rpos, size=rsize, old=ord(self.input.data[i]), new=ord(m))) - #print self.array[rand] - 
input.data = input.data[:i] + m*j + input.data[i+1:] + return input - - rpos = int(i/(float(self.input_len))*100.0) - rsize = j/100*100 - self.delta = ByteExtensionDeltaMutation(input, dict(pos = rpos, size = rsize, old = ord(self.input.data[i]), new = ord(m) )) - - return input + def GetInput(self): + return self.input.copy() - def GetInput(self): - return self.input.copy() + def GetDelta(self): + return self.delta - def GetDelta(self): - return self.delta class RandomByteMutator(Mutator): - def __iter__(self): - return self + def __iter__(self): + return self - def next(self): + def next(self): - assert(self.input_len > 0) + assert(self.input_len > 0) - input = self.input.copy() - delta = str(self.input.GetType())+" " - - # single byte mutation - i = random.randrange(self.input_len) - #m = self.array[random.randrange(self.array_len)] - m = ord(input.data[i]) ^ (1 << random.randrange(7)) - input.data = input.data[:i] + chr(m) + input.data[i+1:] - - rpos = int(i/(float(self.input_len))*100.0) - self.delta = None#OneByteDeltaMutation(input, dict(pos = rpos, old = ord(self.input.data[i]), new=ord(m))) - return input + input = self.input.copy() + delta = str(self.input.GetType()) + " " - def GetInput(self): - return self.input.copy() + # single byte mutation + i = random.randrange(self.input_len) + #m = self.array[random.randrange(self.array_len)] + m = ord(input.data[i]) ^ (1 << random.randrange(7)) + input.data = input.data[:i] + chr(m) + input.data[i + 1:] - def GetDelta(self): - return self.delta + rpos = int(i / (float(self.input_len)) * 100.0) + # OneByteDeltaMutation(input, dict(pos = rpos, old = ord(self.input.data[i]), new=ord(m))) + self.delta = None + return input + + def GetInput(self): + return self.input.copy() + + def GetDelta(self): + return self.delta class NullMutator(Mutator): - def __iter__(self): - return self + def __iter__(self): + return self - def next(self): + def next(self): - input = self.input.copy() - return input + input = self.input.copy() + return input - def GetInput(self): - return self.input.copy() + def GetInput(self): + return self.input.copy() - #def GetData(self): + # def GetData(self): - def GetDelta(self): - return NullDeltaMutation() + def GetDelta(self): + return NullDeltaMutation() class RandomInputMutator: - def __init__(self, inputs, mutator): - assert(inputs <> []) - self.i = 0 - self.inputs = map(mutator, inputs) - self.inputs_len = len(self.inputs) - - def __iter__(self): - return self - - def next(self, mutate = True): - r = [] - delta = None - symb_inputs = filter(lambda (_,x): x.input.isSymbolic() and x.input.GetType() == "file", enumerate(self.inputs)) - symb_inputs_len = len(symb_inputs) - - self.i = symb_inputs[random.randrange(symb_inputs_len)][0] - - for j, m in enumerate(self.inputs): - if self.i == j: - r.append(m.next()) - #data = input.PrepareData() - delta = m.GetDelta() - - else: - r.append(m.GetInput()) - #data = input.PrepareData() - - #if data: - # r.append(data) - - return delta, r + def __init__(self, inputs, mutator): + assert(inputs != []) + self.i = 0 + self.inputs = map(mutator, inputs) + self.inputs_len = len(self.inputs) + + def __iter__(self): + return self + + def next(self, mutate=True): + r = [] + delta = None + symb_inputs = filter(lambda __x: __x[1].input.isSymbolic( + ) and __x[1].input.GetType() == "file", enumerate(self.inputs)) + symb_inputs_len = len(symb_inputs) + + self.i = symb_inputs[random.randrange(symb_inputs_len)][0] + + for j, m in enumerate(self.inputs): + if self.i == j: + r.append(m.next()) + 
#data = input.PrepareData() + delta = m.GetDelta() + + else: + r.append(m.GetInput()) + #data = input.PrepareData() + + # if data: + # r.append(data) + + return delta, r diff --git a/vdiscover/Pipeline.py b/vdiscover/Pipeline.py index cc7fc96..92e3780 100644 --- a/vdiscover/Pipeline.py +++ b/vdiscover/Pipeline.py @@ -18,10 +18,10 @@ """ import os -from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.base import BaseEstimator, TransformerMixin from sklearn.pipeline import Pipeline from sklearn.ensemble import RandomForestClassifier -from sklearn.naive_bayes import GaussianNB, MultinomialNB +from sklearn.naive_bayes import GaussianNB, MultinomialNB from sklearn.linear_model import LogisticRegression from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn.decomposition import PCA, TruncatedSVD @@ -29,11 +29,14 @@ from random import random, randint, sample, gauss, shuffle -def static_tokenizer(s): - return filter(lambda x: x<>'', s.split(" ")) -def dynamic_tokenizer(s): - return filter(lambda x: x<>'', s.split(" ")) +def staticTokenizer(s): + return filter(lambda x: x != '', s.split(" ")) + + +def dynamicTokenizer(s): + return filter(lambda x: x != '', s.split(" ")) + class DenseTransformer(TransformerMixin): @@ -65,6 +68,7 @@ def transform(self, data_dict): def get_params(self, deep=True): return [] + class CutoffMax(BaseEstimator, TransformerMixin): def __init__(self, maxv): @@ -83,468 +87,482 @@ def get_params(self, deep=True): return [] +def makeTrainPipelineBOW(ftype): -def make_train_pipeline(ftype): - - if ftype is "dynamic": - - realpath = os.path.dirname(os.path.realpath(__file__)) - f = open(realpath+"/data/dyn_events.dic") - - event_dict = [] - - for line in f.readlines(): - event_dict.append(line.replace("\n","")) - - return Pipeline(steps=[ - ('selector', ItemSelector(key='dynamic')), - ('dvectorizer', CountVectorizer(tokenizer=dynamic_tokenizer, ngram_range=(1,3), lowercase=False, vocabulary=event_dict)), - ('todense', DenseTransformer()), - ('cutfoff', CutoffMax(16)), - ('classifier', RandomForestClassifier(n_estimators=1000, max_features=None, max_depth=100)) - #('classifier', GaussianNB()) - - ]) - elif ftype is "static": - return Pipeline(steps=[ - ('selector', ItemSelector(key='static')), - ('dvectorizer', CountVectorizer(tokenizer=static_tokenizer, ngram_range=(1,1), lowercase=False)), - ('todense', DenseTransformer()), - ('classifier', LogisticRegression(penalty="l2", C=1e-07, tol=1e-06)) - ]) - else: - assert(0) - -def make_cluster_pipeline_bow(ftype, rdim): - if ftype is "dynamic" and rdim == "pca": - - return Pipeline(steps=[ - ('selector', ItemSelector(key='dynamic')), - ('dvectorizer', TfidfVectorizer(tokenizer=dynamic_tokenizer, use_idf=False, norm=None, ngram_range=(1,1), lowercase=False)), - ('todense', DenseTransformer()), - #('cutfoff', CutoffMax(16)), - ('reducer', PCA(n_components=2)), - - ]) - - elif ftype is "dynamic" and rdim == "svd": - - return Pipeline(steps=[ - ('selector', ItemSelector(key='dynamic')), - ('dvectorizer', TfidfVectorizer(tokenizer=dynamic_tokenizer, use_idf=False, norm=None, ngram_range=(1,1), lowercase=False)), - ('todense', DenseTransformer()), - #('cutfoff', CutoffMax(16)), - ('reducer', TruncatedSVD(n_components=2)), - - ]) - - elif ftype is "dynamic" and rdim == "none": - - return Pipeline(steps=[ - ('selector', ItemSelector(key='dynamic')), - ('dvectorizer', TfidfVectorizer(tokenizer=dynamic_tokenizer, use_idf=False, norm=None, ngram_range=(1,1), lowercase=False)), - ('todense', 
DenseTransformer()), - #('cutfoff', CutoffMax(16)), - ]) - - elif ftype is "static": - return Pipeline(steps=[ - ('selector', ItemSelector(key='static')), - ('dvectorizer', TfidfVectorizer(tokenizer=dynamic_tokenizer, use_idf=False, norm=None, ngram_range=(1,1), lowercase=False)), - ('todense', DenseTransformer()), - ('cutfoff', CutoffMax(16)), - ('reducer', PCA(n_components=2)), - ]) - else: - assert(0) - - -def make_cluster_pipeline_doc2vec(ftype, rdim): - if ftype is "dynamic" and rdim == "pca": - return Pipeline(steps=[ - ('selector', ItemSelector(key='dynamic')), - ('reducer', PCA(n_components=2)), - ]) - elif ftype is "dynamic" and rdim == "svd": - return Pipeline(steps=[ - ('selector', ItemSelector(key='dynamic')), - ('reducer', TruncatedSVD(n_components=2)), - ]) - elif ftype is "dynamic" and rdim == "none": - return Pipeline(steps=[ - ('selector', ItemSelector(key='dynamic')) - ]) - elif ftype is "static": - raise NotImplemented - else: - assert(0) - - - -def make_cluster_pipeline_subtraces(ftype): - if ftype is "dynamic": - return Pipeline(steps=[ - ('selector', ItemSelector(key='dynamic')), - ('reducer', PCA(n_components=12)), - ]) - elif ftype is "static": - raise NotImplemented - else: - assert(0) - -def make_cluster_cnn(mode, max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes, weights=None): - - #print mode, max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes - from keras.preprocessing import sequence - from keras.optimizers import RMSprop - from keras.models import Sequential - from keras.layers.core import Dense, Dropout, Activation, Flatten - from keras.layers.embeddings import Embedding - from keras.layers.convolutional import Convolution1D, MaxPooling1D - - print('Build model...') - model = Sequential() - - # we start off with an efficient embedding layer which maps - # our vocab indices into embedding_dims dimensions - if mode == "train": - model.add(Embedding(max_features, embedding_dims, input_length=maxlen)) - elif mode == "test": - model.add(Embedding(max_features, embedding_dims, input_length=maxlen, weights=weights[0])) - - model.add(Dropout(0.25)) - - # we add a Convolution1D, which will learn nb_filters - # word group filters of size filter_length: - if mode == "train": - model.add(Convolution1D(nb_filter=nb_filters, - filter_length=filter_length, - border_mode='valid', - activation='relu', - subsample_length=1)) - - elif mode == "test": - model.add(Convolution1D(nb_filter=nb_filters, - filter_length=filter_length, - border_mode='valid', - activation='relu', - subsample_length=1, - weights=weights[2])) - - - # we use standard max pooling (halving the output of the previous layer): - model.add(MaxPooling1D(pool_length=2)) - - # We flatten the output of the conv layer, so that we can add a vanilla dense layer: - model.add(Flatten()) - - # Computing the output shape of a conv layer can be tricky; - # for a good tutorial, see: http://cs231n.github.io/convolutional-networks/ - output_size = nb_filters * (((maxlen - filter_length) / 1) + 1) / 2 - #print output_size, hidden_dims - - # We add a vanilla hidden layer: - if mode == "train": - model.add(Dense(hidden_dims)) - if mode == "test": - model.add(Dense(hidden_dims, weights=weights[5])) - - if mode == "train": + if ftype is "dynamic": - model.add(Dropout(0.25)) - model.add(Activation('relu')) + realpath = os.path.dirname(os.path.realpath(__file__)) + f = open(realpath + "/data/dyn_events.dic") - # We project onto a single unit output layer, 
and squash it with a sigmoid: - model.add(Dense(nb_classes)) + event_dict = [] - model.add(Activation('softmax')) - model.compile(loss='categorical_crossentropy', optimizer='rmsprop', class_mode="categorical") + for line in f.readlines(): + event_dict.append(line.replace("\n", "")) - elif mode == "test": - model.compile(loss='mean_squared_error', optimizer='rmsprop') + return Pipeline(steps=[ + ('selector', ItemSelector(key='dynamic')), + ('dvectorizer', CountVectorizer(tokenizer=dynamicTokenizer, + ngram_range=(1, 3), lowercase=False, vocabulary=event_dict)), + ('todense', DenseTransformer()), + ('cutfoff', CutoffMax(16)), + ('classifier', RandomForestClassifier( + n_estimators=1000, max_features=None, max_depth=100)) + #('classifier', GaussianNB()) + ]) + elif ftype is "static": + return Pipeline(steps=[ + ('selector', ItemSelector(key='static')), + ('dvectorizer', CountVectorizer( + tokenizer=static_tokenizer, ngram_range=(1, 1), lowercase=False)), + ('todense', DenseTransformer()), + ('classifier', LogisticRegression(penalty="l2", C=1e-07, tol=1e-06)) + ]) + else: + assert(0) + + +def makeClusterPipelineBOW(ftype, rdim): + if ftype is "dynamic" and rdim == "pca": + + return Pipeline(steps=[ + ('selector', ItemSelector(key='dynamic')), + ('dvectorizer', TfidfVectorizer(tokenizer=dynamicTokenizer, + use_idf=False, norm=None, ngram_range=(1, 1), lowercase=False)), + ('todense', DenseTransformer()), + #('cutfoff', CutoffMax(16)), + ('reducer', PCA(n_components=2)), + + ]) + + elif ftype is "dynamic" and rdim == "svd": + + return Pipeline(steps=[ + ('selector', ItemSelector(key='dynamic')), + ('dvectorizer', TfidfVectorizer(tokenizer=dynamicTokenizer, + use_idf=False, norm=None, ngram_range=(1, 1), lowercase=False)), + ('todense', DenseTransformer()), + #('cutfoff', CutoffMax(16)), + ('reducer', TruncatedSVD(n_components=2)), + + ]) + + elif ftype is "dynamic" and rdim == "none": + + return Pipeline(steps=[ + ('selector', ItemSelector(key='dynamic')), + ('dvectorizer', TfidfVectorizer(tokenizer=dynamic_tokenizer, + use_idf=False, norm=None, ngram_range=(1, 1), lowercase=False)), + ('todense', DenseTransformer()), + #('cutfoff', CutoffMax(16)), + ]) + + elif ftype is "static": + return Pipeline(steps=[ + ('selector', ItemSelector(key='static')), + ('dvectorizer', TfidfVectorizer(tokenizer=dynamic_tokenizer, + use_idf=False, norm=None, ngram_range=(1, 1), lowercase=False)), + ('todense', DenseTransformer()), + ('cutfoff', CutoffMax(16)), + ('reducer', PCA(n_components=2)), + ]) + else: + assert(0) + + +def makeClusterPipelineDoc2vec(ftype, rdim): + if ftype is "dynamic" and rdim == "pca": + return Pipeline(steps=[ + ('selector', ItemSelector(key='dynamic')), + ('reducer', PCA(n_components=2)), + ]) + elif ftype is "dynamic" and rdim == "svd": + return Pipeline(steps=[ + ('selector', ItemSelector(key='dynamic')), + ('reducer', TruncatedSVD(n_components=2)), + ]) + elif ftype is "dynamic" and rdim == "none": + return Pipeline(steps=[ + ('selector', ItemSelector(key='dynamic')) + ]) + elif ftype is "static": + raise NotImplemented + else: + assert(0) - return model +def makeClusterPipelineSubtraces(ftype): + if ftype is "dynamic": + return Pipeline(steps=[ + ('selector', ItemSelector(key='dynamic')), + ('reducer', PCA(n_components=12)), + ]) + elif ftype is "static": + raise NotImplemented + else: + assert(0) +""" +def make_cluster_cnn( + mode, + max_features, + maxlen, + embedding_dims, + nb_filters, + filter_length, + hidden_dims, + nb_classes, + weights=None): + + # print mode, max_features, 
maxlen, embedding_dims, nb_filters, + # filter_length, hidden_dims, nb_classes + from keras.preprocessing import sequence + from keras.optimizers import RMSprop + from keras.models import Sequential + from keras.layers.core import Dense, Dropout, Activation, Flatten + from keras.layers.embeddings import Embedding + from keras.layers.convolutional import Convolution1D, MaxPooling1D + + print('Build model...') + model = Sequential() + + # we start off with an efficient embedding layer which maps + # our vocab indices into embedding_dims dimensions + if mode == "train": + model.add(Embedding(max_features, embedding_dims, input_length=maxlen)) + elif mode == "test": + model.add(Embedding(max_features, embedding_dims, + input_length=maxlen, weights=weights[0])) -try: - from keras.preprocessing import sequence -except: - pass + model.add(Dropout(0.25)) + # we add a Convolution1D, which will learn nb_filters + # word group filters of size filter_length: + if mode == "train": + model.add(Convolution1D(nb_filter=nb_filters, + filter_length=filter_length, + border_mode='valid', + activation='relu', + subsample_length=1)) -class DeepReprPreprocessor: + elif mode == "test": + model.add(Convolution1D(nb_filter=nb_filters, + filter_length=filter_length, + border_mode='valid', + activation='relu', + subsample_length=1, + weights=weights[2])) - def __init__(self, tokenizer, max_len, batch_size): - self.tokenizer = tokenizer - self.max_len = max_len - self.batch_size = batch_size + # we use standard max pooling (halving the output of the previous layer): + model.add(MaxPooling1D(pool_length=2)) - def preprocess_traces(self, X_data, y_data=None, labels=None): + # We flatten the output of the conv layer, so that we can add a vanilla + # dense layer: + model.add(Flatten()) - cut_X_data = [] - cut_label_data = [] - cut_y_data = [] - #rep = 5 + # Computing the output shape of a conv layer can be tricky; + # for a good tutorial, see: http://cs231n.github.io/convolutional-networks/ + output_size = nb_filters * (((maxlen - filter_length) / 1) + 1) / 2 + # print output_size, hidden_dims - X_size = len(X_data) + # We add a vanilla hidden layer: + if mode == "train": + model.add(Dense(hidden_dims)) + if mode == "test": + model.add(Dense(hidden_dims, weights=weights[5])) - for i,x in enumerate(X_data): + if mode == "train": - #i = randint(0, X_size-1) + model.add(Dropout(0.25)) + model.add(Activation('relu')) - raw_trace = x[:-1] - trace = raw_trace.split(" ") + # We project onto a single unit output layer, and squash it with a + # sigmoid: + model.add(Dense(nb_classes)) - size = len(trace) - rep = 1 + int(float(size) / float(self.max_len)) - rep = min(rep, 10) + model.add(Activation('softmax')) + model.compile(loss='categorical_crossentropy', + optimizer='rmsprop', class_mode="categorical") - for _ in range(rep): + elif mode == "test": + model.compile(loss='mean_squared_error', optimizer='rmsprop') - start = size - (self.max_len) - start = randint(0, max(start,0)) + return model - new_trace = " ".join(trace[start:(start+self.max_len)]) - #print "sizes:", size, len(trace[start:(start+self.max_len)]) +try: + from keras.preprocessing import sequence +except: + pass - cut_X_data.append(new_trace) +class DeepReprPreprocessor: - if labels is not None: - cut_label_data.append(labels[i]) - else: - cut_label_data.append("+"+str(size)) + def __init__(self, tokenizer, max_len, batch_size): + self.tokenizer = tokenizer + self.max_len = max_len + self.batch_size = batch_size - if y_data is not None: - cut_y_data.append(y_data[i]) 
- else: - cut_y_data.append(0) + def preprocess_traces(self, X_data, y_data=None, labels=None): - X_train = self.tokenizer.texts_to_sequences(cut_X_data) - labels = cut_label_data - y_train = cut_y_data - X_train,y_train,labels = zip(*filter(lambda (x,y,z): not (x == []), zip(X_train,y_train,labels))) + cut_X_data = [] + cut_label_data = [] + cut_y_data = [] + #rep = 5 + X_size = len(X_data) - X_size = len(X_train) - X_train = sequence.pad_sequences(X_train, maxlen=self.max_len) - return X_train, y_train, labels + for i, x in enumerate(X_data): - def preprocess(self, X_data, cut_size=1): + #i = randint(0, X_size-1) - cut_X_data = [] - cut_y_data = [] - self.classes = [] - X_size = len(X_data) - stats = dict() + raw_trace = x[:-1] + trace = raw_trace.split(" ") - for _ in xrange(1000): + size = len(trace) + rep = 1 + int(float(size) / float(self.max_len)) + rep = min(rep, 10) - i = randint(0, X_size-1) + for _ in range(rep): - raw_trace = X_data[i][:-1] - trace = raw_trace.split(" ") + start = size - (self.max_len) + start = randint(0, max(start, 0)) - size = len(trace) + new_trace = " ".join(trace[start:(start + self.max_len)]) + # print "sizes:", size, len(trace[start:(start+self.max_len)]) - if size <= (self.max_len + 1): - start = 0 - end = size - 2 - new_trace = " ".join(trace[start:(end+1)]) - last_event = trace[(end+1)].split(":") - cut_y_data.append(last_event[0]) - else: - #print size - start = size - (self.max_len) - 2 - start = randint(0, start) - end = start + self.max_len - #print len(trace[start:end]) - #new_trace = " ".join(trace[start:end]) + cut_X_data.append(new_trace) - #start = randint(0, size-2) - #end = randint(start, size-2) + if labels is not None: + cut_label_data.append(labels[i]) + else: + cut_label_data.append("+" + str(size)) - new_trace = " ".join(trace[start:(end+1)]) - last_event = trace[end+1].split(":") - cut_y_data.append(last_event[0]) + if y_data is not None: + cut_y_data.append(y_data[i]) + else: + cut_y_data.append(0) + X_train = self.tokenizer.texts_to_sequences(cut_X_data) + labels = cut_label_data + y_train = cut_y_data + X_train, y_train, labels = zip( + *filter(lambda x_y_z: not (x_y_z[0] == []), zip(X_train, y_train, labels))) - for y in set(cut_y_data): - stats[y] = float(cut_y_data.count(y)) / len(cut_y_data) + X_size = len(X_train) + X_train = sequence.pad_sequences(X_train, maxlen=self.max_len) + return X_train, y_train, labels - print stats, sum(stats.values()) - #assert(0) - cut_y_data = [] - for _ in xrange(cut_size): + def preprocess(self, X_data, cut_size=1): - i = randint(0, X_size-1) + cut_X_data = [] + cut_y_data = [] + self.classes = [] + X_size = len(X_data) + stats = dict() - raw_trace = X_data[i][:-1] - trace = raw_trace.split(" ") - size = len(trace) + for _ in xrange(1000): + i = randint(0, X_size - 1) - if size <= (self.max_len + 1): - start = 0 - end = size - 2 - new_trace = " ".join(trace[start:(end+1)]) - last_event = trace[(end+1)].split(":") - else: - #print size - start = size - (self.max_len) - 2 - start = randint(0, start) - end = start + self.max_len - #print len(trace[start:end]) - #new_trace = " ".join(trace[start:end]) + raw_trace = X_data[i][:-1] + trace = raw_trace.split(" ") - #start = randint(0, size-2) - #end = randint(start, size-2) + size = len(trace) - new_trace = " ".join(trace[start:(end+1)]) - last_event = trace[end+1].split(":") + if size <= (self.max_len + 1): + start = 0 + end = size - 2 + new_trace = " ".join(trace[start:(end + 1)]) + last_event = trace[(end + 1)].split(":") + 
cut_y_data.append(last_event[0]) + else: + # print size + start = size - (self.max_len) - 2 + start = randint(0, start) + end = start + self.max_len + # print len(trace[start:end]) + #new_trace = " ".join(trace[start:end]) - cl = last_event[0] + #start = randint(0, size-2) + #end = randint(start, size-2) - if cl not in self.classes: - self.classes.append(cl) - stats[cl] = 0.0 - else: - if random() <= stats[cl]: - continue + new_trace = " ".join(trace[start:(end + 1)]) + last_event = trace[end + 1].split(":") + cut_y_data.append(last_event[0]) - cut_X_data.append(new_trace) - cut_y_data.append(self.classes.index(cl)) + for y in set(cut_y_data): + stats[y] = float(cut_y_data.count(y)) / len(cut_y_data) - X_train = self.tokenizer.texts_to_sequences(cut_X_data) + print stats, sum(stats.values()) + # assert(0) + cut_y_data = [] + for _ in xrange(cut_size): - y_train = [] + i = randint(0, X_size - 1) - for y in cut_y_data: - v = [0]*len(self.classes) - v[y] = 1 - y_train.append(v) + raw_trace = X_data[i][:-1] + trace = raw_trace.split(" ") + size = len(trace) - X_train = filter(lambda x: not (x == []), X_train) + if size <= (self.max_len + 1): + start = 0 + end = size - 2 + new_trace = " ".join(trace[start:(end + 1)]) + last_event = trace[(end + 1)].split(":") + else: + # print size + start = size - (self.max_len) - 2 + start = randint(0, start) + end = start + self.max_len + # print len(trace[start:end]) + #new_trace = " ".join(trace[start:end]) - X_size = len(X_train) - X_train = X_train[:(X_size-(X_size % self.batch_size))] - X_train = sequence.pad_sequences(X_train, maxlen=self.max_len) + #start = randint(0, size-2) + #end = randint(start, size-2) - if y_train is not None: - y_train = y_train[:(X_size-(X_size % self.batch_size))] - return X_train,y_train - else: - return X_train + new_trace = " ".join(trace[start:(end + 1)]) + last_event = trace[end + 1].split(":") + cl = last_event[0] + + if cl not in self.classes: + self.classes.append(cl) + stats[cl] = 0.0 + else: + if random() <= stats[cl]: + continue + + cut_X_data.append(new_trace) + cut_y_data.append(self.classes.index(cl)) + + X_train = self.tokenizer.texts_to_sequences(cut_X_data) + + y_train = [] + + for y in cut_y_data: + v = [0] * len(self.classes) + v[y] = 1 + y_train.append(v) + + X_train = filter(lambda x: not (x == []), X_train) + + X_size = len(X_train) + X_train = X_train[:(X_size - (X_size % self.batch_size))] + X_train = sequence.pad_sequences(X_train, maxlen=self.max_len) + + if y_train is not None: + y_train = y_train[:(X_size - (X_size % self.batch_size))] + return X_train, y_train + else: + return X_train class KerasPreprocessor: - def __init__(self, tokenizer, max_len, batch_size): - self.tokenizer = tokenizer - self.max_len = max_len - self.batch_size = batch_size - - def preprocess(self, X_data, y_data=None, cut_size=1): + def __init__(self, tokenizer, max_len, batch_size): + self.tokenizer = tokenizer + self.max_len = max_len + self.batch_size = batch_size - cut_X_data = [] - cut_y_data = [] - X_size = len(X_data) + def preprocess(self, X_data, y_data=None, cut_size=1): - for _ in xrange(cut_size): + cut_X_data = [] + cut_y_data = [] + X_size = len(X_data) - i = randint(0, X_size-1) + for _ in xrange(cut_size): - raw_trace = X_data[i] - trace = raw_trace.split(" ") + i = randint(0, X_size - 1) - size = len(trace) + raw_trace = X_data[i] + trace = raw_trace.split(" ") - start = randint(0, size-1) - end = start + randint(0, self.max_len) + size = len(trace) - new_trace = " ".join(trace[start:(end+1)]) - 
cut_X_data.append(new_trace) + start = randint(0, size - 1) + end = start + randint(0, self.max_len) - if y_data is not None: - y = y_data[i] - cut_y_data.append(y) + new_trace = " ".join(trace[start:(end + 1)]) + cut_X_data.append(new_trace) - X_train = self.tokenizer.texts_to_sequences(cut_X_data) - y_train = cut_y_data + if y_data is not None: + y = y_data[i] + cut_y_data.append(y) - if y_train is not None: - X_train,y_train = zip(*filter(lambda (x,y): not (x == []), zip(X_train,y_train))) - else: - X_train = filter(lambda x: not (x == []), X_train) + X_train = self.tokenizer.texts_to_sequences(cut_X_data) + y_train = cut_y_data + if y_train is not None: + X_train, y_train = zip( + *filter(lambda x_y: not (x_y[0] == []), zip(X_train, y_train))) + else: + X_train = filter(lambda x: not (x == []), X_train) - X_size = len(X_train) - X_train = X_train[:(X_size-(X_size % self.batch_size))] - X_train = sequence.pad_sequences(X_train, maxlen=self.max_len) + X_size = len(X_train) + X_train = X_train[:(X_size - (X_size % self.batch_size))] + X_train = sequence.pad_sequences(X_train, maxlen=self.max_len) - if y_train is not None: - y_train = y_train[:(X_size-(X_size % self.batch_size))] - return X_train,y_train - else: - return X_train + if y_train is not None: + y_train = y_train[:(X_size - (X_size % self.batch_size))] + return X_train, y_train + else: + return X_train + def preprocess_one(self, raw_trace, sample_size=100): - def preprocess_one(self, raw_trace, sample_size=100): + trace = raw_trace.split(" ") + size = len(trace) + cut_X_data = [] + # print trace - trace = raw_trace.split(" ") - size = len(trace) - cut_X_data = [] - #print trace + for _ in xrange(sample_size): - for _ in xrange(sample_size): + start = randint(0, size - 1) + end = start + randint(0, self.max_len) - start = randint(0, size-1) - end = start + randint(0, self.max_len) + new_trace = " ".join(trace[start:(end + 1)]) + cut_X_data.append(new_trace) - new_trace = " ".join(trace[start:(end+1)]) - cut_X_data.append(new_trace) + X_train = self.tokenizer.texts_to_sequences(cut_X_data) + X_train = filter(lambda x: not (x == []), X_train) - X_train = self.tokenizer.texts_to_sequences(cut_X_data) - X_train = filter(lambda x: not (x == []), X_train) + X_size = len(X_train) + X_train = X_train[:(X_size - (X_size % self.batch_size))] + # print "X_size", X_size-(X_size % self.batch_size) - X_size = len(X_train) - X_train = X_train[:(X_size-(X_size % self.batch_size))] - #print "X_size", X_size-(X_size % self.batch_size) + X_train = sequence.pad_sequences(X_train, maxlen=self.max_len) + return X_train - X_train = sequence.pad_sequences(X_train, maxlen=self.max_len) - return X_train class KerasPredictor: - def __init__(self,preprocessor, model, ftype): - self.preprocessor = preprocessor - self.batch_size = preprocessor.batch_size - self.ftype = ftype - self.model = model + def __init__(self, preprocessor, model, ftype): + self.preprocessor = preprocessor + self.batch_size = preprocessor.batch_size + self.ftype = ftype + self.model = model def predict(self, X_data): - X_size = len(X_data) - X_data = X_data[self.ftype] - X_predictions = [] + X_size = len(X_data) + X_data = X_data[self.ftype] + X_predictions = [] - for raw_trace in X_data: + for raw_trace in X_data: - trace_data = self.preprocessor.preprocess_one(raw_trace) + trace_data = self.preprocessor.preprocess_one(raw_trace) - if len(trace_data) > 0: - predictions = self.model.predict(trace_data, verbose=0, batch_size=self.batch_size) - else: # imposible to predict - predictions = 
[0] + if len(trace_data) > 0: + predictions = self.model.predict( + trace_data, verbose=0, batch_size=self.batch_size) + else: # imposible to predict + predictions = [0] - avg_predictions = sum(predictions)/100.0 - #print predictions, avg_predictions - if avg_predictions > 0.5: - X_predictions.append(1) - else: - X_predictions.append(0) - - return X_predictions + avg_predictions = sum(predictions) / 100.0 + # print predictions, avg_predictions + if avg_predictions > 0.5: + X_predictions.append(1) + else: + X_predictions.append(0) + return X_predictions +""" diff --git a/vdiscover/Printer.py b/vdiscover/Printer.py index 83b32da..7d1af84 100644 --- a/vdiscover/Printer.py +++ b/vdiscover/Printer.py @@ -21,83 +21,85 @@ import csv import copy -from Event import Call, Crash, Abort, Exit, Timeout, Signal, Vulnerability, specs -from Types import ptypes, isPtr, isNum, ptr32_ptypes, num32_ptypes, generic_ptypes +from Event import Call, Crash, Abort, Exit, Timeout, Signal, Vulnerability, specs +from Types import ptypes, isPtr, isNum, ptr32_ptypes, num32_ptypes, generic_ptypes + class TypePrinter: - def __init__(self, filename, pname, mclass): - self.tests = set() - self.outfile = open(filename, "a+") - self.pname = pname - self.mclass = mclass - self.csvwriter = csv.writer(self.outfile, delimiter='\t') - def preprocess(self, event): + def __init__(self, filename, pname, mclass): + self.tests = set() + self.outfile = open(filename, "a+") + self.pname = pname + self.mclass = mclass + self.csvwriter = csv.writer(self.outfile, delimiter='\t') + + def preprocess(self, event): - r = list() + r = list() - if isinstance(event, Call): - (name, args) = event.GetTypedName() + if isinstance(event, Call): + (name, args) = event.GetTypedName() - for (index, arg) in enumerate(args[:]): - r.append((name+":"+str(index),str(arg))) + for (index, arg) in enumerate(args[:]): + r.append((name + ":" + str(index), str(arg))) - elif isinstance(event, Abort): - (name, fields) = event.GetTypedName() - r.append((name+":eip",str(fields[0]))) + elif isinstance(event, Abort): + (name, fields) = event.GetTypedName() + r.append((name + ":eip", str(fields[0]))) - elif isinstance(event, Exit): - (name, fields) = event.GetTypedName() - r.append((name,str(()))) + elif isinstance(event, Exit): + (name, fields) = event.GetTypedName() + r.append((name, str(()))) - elif isinstance(event, Crash): - (name, fields) = event.GetTypedName() - r.append((name+":eip",str(fields[0]))) + elif isinstance(event, Crash): + (name, fields) = event.GetTypedName() + r.append((name + ":eip", str(fields[0]))) - elif isinstance(event, Vulnerability): - (name, fields) = event.GetTypedName() - r.append((name,str(fields[0]))) + elif isinstance(event, Vulnerability): + (name, fields) = event.GetTypedName() + r.append((name, str(fields[0]))) - elif isinstance(event, Timeout): - (name, fields) = event.GetTypedName() - r.append((name,str(()))) + elif isinstance(event, Timeout): + (name, fields) = event.GetTypedName() + r.append((name, str(()))) - elif isinstance(event, Signal): - (name, fields) = event.GetTypedName() + elif isinstance(event, Signal): + (name, fields) = event.GetTypedName() - if name == "SIGSEGV": - r.append((name+":addr",str(fields[0]))) - else: - r.append((name,str(fields[0]))) + if name == "SIGSEGV": + r.append((name + ":addr", str(fields[0]))) + else: + r.append((name, str(fields[0]))) - return r + return r - def print_events(self, label, events): + def print_events(self, label, events): - r = list() + r = list() - for event in events: - r = r + 
list(self.preprocess(event)) + for event in events: + r = r + list(self.preprocess(event)) - events = r + events = r - #x = hash(tuple(events)) + #x = hash(tuple(events)) - #if (x in self.tests): - # return + # if (x in self.tests): + # return - #self.tests.add(x) + # self.tests.add(x) - trace = "" + trace = "" - for x,y in events: - trace = trace + ("%s=%s " % (x,y)) + for x, y in events: + trace = trace + ("%s=%s " % (x, y)) - row = [self.pname+":"+label,trace] + row = [self.pname + ":" + label, trace] - if self.mclass is not None: - row.append(self.mclass) + if self.mclass is not None: + row.append(self.mclass) - self.csvwriter.writerow(row) - self.outfile.flush() - return row + self.csvwriter.writerow(row) + self.outfile.flush() + return row diff --git a/vdiscover/Process.py b/vdiscover/Process.py index 3fd580a..66fe41e 100644 --- a/vdiscover/Process.py +++ b/vdiscover/Process.py @@ -19,14 +19,18 @@ from ptrace import PtraceError from ptrace.debugger import (PtraceDebugger, Application, - ProcessExit, NewProcessEvent, ProcessSignal, - ProcessExecution, ProcessError) + ProcessExit, NewProcessEvent, ProcessSignal, + ProcessExecution, ProcessError) from logging import getLogger, info, warning, error from ptrace.error import PTRACE_ERRORS, PtraceError, writeError from ptrace.disasm import HAS_DISASSEMBLER -from ptrace.ctypes_tools import (truncateWord, - formatWordHex, formatAddress, formatAddressRange, word2bytes) +from ptrace.ctypes_tools import ( + truncateWord, + formatWordHex, + formatAddress, + formatAddressRange, + word2bytes) from ptrace.signames import signalName, SIGNAMES from signal import SIGTRAP, SIGALRM, SIGABRT, SIGSEGV, SIGILL, SIGCHLD, SIGWINCH, SIGFPE, SIGBUS, SIGTERM, SIGPIPE, signal, alarm @@ -44,8 +48,19 @@ from MemoryMap import MemoryMaps from Alarm import alarm_handler, TimeoutEx + class Process(Application): - def __init__(self, program, envs, timeout, included_mods = [], ignored_mods = [], no_stdout = True, max_events = 320, min_events = -10*320): + + def __init__( + self, + program, + envs, + timeout, + included_mods=[], + ignored_mods=[], + no_stdout=True, + max_events=320, + min_events=-10 * 320): Application.__init__(self) # no effect @@ -67,9 +82,9 @@ def __init__(self, program, envs, timeout, included_mods = [], ignored_mods = [] self.min_events = min_events # Parse ELF - self.elf = ELF(self.program, plt = False) + self.elf = ELF(self.program, plt=False) - #if self.elf.GetType() <> "ELF 32-bit": + # if self.elf.GetType() <> "ELF 32-bit": # print "Only ELF 32-bit are supported to be executed." 
# exit(-1) @@ -84,150 +99,161 @@ def __init__(self, program, envs, timeout, included_mods = [], ignored_mods = [] self.binfo = dict() def setBreakpoints(self, elf): - #print elf.GetFunctions() - for func_name in elf.GetFunctions(): - #print "func_name", elf.GetModname(), hex(elf.FindFuncInPlt(func_name)) - - if func_name in specs: - #print "func_name in spec",elf.GetModname(), func_name, hex(elf.FindFuncInPlt(func_name)) - addr = elf.FindFuncInPlt(func_name) - self.binfo[addr] = elf.GetModname(),func_name - self.breakpoint(addr) + # print elf.GetFunctions() + for func_name in elf.GetFunctions(): + # print "func_name", elf.GetModname(), + # hex(elf.FindFuncInPlt(func_name)) + + if func_name in specs: + # print "func_name in spec",elf.GetModname(), func_name, + # hex(elf.FindFuncInPlt(func_name)) + addr = elf.FindFuncInPlt(func_name) + self.binfo[addr] = elf.GetModname(), func_name + self.breakpoint(addr) def findBreakpointInfo(self, addr): - if addr in self.binfo: - return self.binfo[addr] - else: - return None, None + if addr in self.binfo: + return self.binfo[addr] + else: + return None, None def createEvents(self, signal): - # Hit breakpoint? + # Hit breakpoint? if signal.signum == SIGTRAP: ip = self.process.getInstrPointer() if not CPU_POWERPC: # Go before "INT 3" instruction ip -= 1 breakpoint = self.process.findBreakpoint(ip) - #print "breakpoint @",hex(ip) + # print "breakpoint @",hex(ip) if breakpoint: module, name = self.findBreakpointInfo(breakpoint.address) - #print module, name, hex(ip) + # print module, name, hex(ip) if ip == self.elf.GetEntrypoint(): - breakpoint.desinstall(set_ip=True) + breakpoint.desinstall(set_ip=True) - #if self.mm is None: - self.mm = MemoryMaps(self.program, self.pid) - #self.setBreakpoints(self.elf) + # if self.mm is None: + self.mm = MemoryMaps(self.program, self.pid) + # self.setBreakpoints(self.elf) - #print self.mm + # print self.mm - for (range, mod, atts) in self.mm.items(): - if '/' in mod and 'x' in atts and not ("libc-" in mod): + for (range, mod, atts) in self.mm.items(): + if '/' in mod and 'x' in atts and not ("libc-" in mod): - # FIXME: self.elf.path should be absolute - if mod == self.elf.path: - base = 0 - else: - base = range[0] + # FIXME: self.elf.path should be absolute + if mod == self.elf.path: + base = 0 + else: + base = range[0] - if self.included_mods == [] or any(map(lambda l: l in mod, self.included_mods)): - if self.ignored_mods == [] or not (any(map(lambda l: l in mod, self.ignored_mods))): + if self.included_mods == [] or any( + map(lambda l: l in mod, self.included_mods)): + if self.ignored_mods == [] or not ( + any(map(lambda l: l in mod, self.ignored_mods))): - if not (mod in self.modules): - self.modules[mod] = ELF(mod, base = base) - #print "hooking", mod, hex(base) + if not (mod in self.modules): + self.modules[mod] = ELF(mod, base=base) + # print "hooking", mod, hex(base) - self.setBreakpoints(self.modules[mod]) + self.setBreakpoints(self.modules[mod]) - - return [] + return [] elif name is None: - assert(0) + assert(0) else: - call = Call(name, module) - #self.mm.update() - #print "updated mm" - call.detect_parameters(self.process, self.mm) - breakpoint.desinstall(set_ip=True) - - call_ip = ip - self.process.singleStep() - self.debugger.waitProcessEvent() - - n = self.nevents.get((ip,name), 0) - self.nevents[(ip, name)] = n + 2 - - for ((ip_,name_),n) in self.nevents.items(): - - if n > self.min_events + 1: - self.nevents[(ip_, name_)] = n - 1 - elif n == self.min_events + 1: - self.nevents[(ip_, name_)] = 
self.min_events - #print "restoring!", (ip, name) - self.breakpoint(call_ip) - - if n < self.max_events: - self.breakpoint(call_ip) - #else: - #print "disabled!", (ip, name) - - #print "call detected!" - return [call] + call = Call(name, module) + # self.mm.update() + # print "updated mm" + call.detect_parameters(self.process, self.mm) + breakpoint.desinstall(set_ip=True) + + call_ip = ip + self.process.singleStep() + self.debugger.waitProcessEvent() + + n = self.nevents.get((ip, name), 0) + self.nevents[(ip, name)] = n + 2 + + for ((ip_, name_), n) in self.nevents.items(): + + if n > self.min_events + 1: + self.nevents[(ip_, name_)] = n - 1 + elif n == self.min_events + 1: + self.nevents[(ip_, name_)] = self.min_events + # print "restoring!", (ip, name) + self.breakpoint(call_ip) + + if n < self.max_events: + self.breakpoint(call_ip) + # else: + # print "disabled!", (ip, name) + + # print "call detected!" + return [call] elif signal.signum == SIGABRT: - self.crashed = True - return [Signal("SIGABRT",self.process, self.mm), Abort(self.process, self.mm)] + self.crashed = True + return [ + Signal( + "SIGABRT", self.process, self.mm), Abort( + self.process, self.mm)] elif signal.signum == SIGSEGV: - self.crashed = True - self.mm = MemoryMaps(self.program, self.pid) - return [Signal("SIGSEGV", self.process, self.mm), Crash(self.process, self.mm)] + self.crashed = True + self.mm = MemoryMaps(self.program, self.pid) + return [ + Signal( + "SIGSEGV", self.process, self.mm), Crash( + self.process, self.mm)] elif signal.signum == SIGILL: - #self.crashed = True - self.mm = MemoryMaps(self.program, self.pid) - return [Signal("SIGILL", self.process, self.mm)] + #self.crashed = True + self.mm = MemoryMaps(self.program, self.pid) + return [Signal("SIGILL", self.process, self.mm)] elif signal.signum == SIGFPE: - self.crashed = True - self.mm = MemoryMaps(self.program, self.pid) - return [Signal("SIGFPE", self.process, self.mm), Crash(self.process, self.mm)] + self.crashed = True + self.mm = MemoryMaps(self.program, self.pid) + return [ + Signal( + "SIGFPE", self.process, self.mm), Crash( + self.process, self.mm)] elif signal.signum == SIGBUS: - #self.crashed = True - self.mm = MemoryMaps(self.program, self.pid) - return [Signal("SIGBUS", self.process, self.mm)] + #self.crashed = True + self.mm = MemoryMaps(self.program, self.pid) + return [Signal("SIGBUS", self.process, self.mm)] elif signal.signum == SIGCHLD: - #self.crashed = True - self.mm = MemoryMaps(self.program, self.pid) - return [Signal("SIGCHLD", self.process, self.pid)] + #self.crashed = True + self.mm = MemoryMaps(self.program, self.pid) + return [Signal("SIGCHLD", self.process, self.pid)] - elif signal.signum == SIGTERM: # killed by the kernel? - self.crashed = True - return [] + elif signal.signum == SIGTERM: # killed by the kernel? + self.crashed = True + return [] # Harmless signals elif signal.signum == SIGPIPE: - return [] # User generated, ignore. + return [] # User generated, ignore. # Harmless signals elif signal.signum == SIGWINCH: - return [] # User generated, ignore. + return [] # User generated, ignore. 
else: - print "I don't know what to do with this signal:", str(signal) - assert(False) + print "I don't know what to do with this signal:", str(signal) + assert(False) return [] def DetectVulnerabilities(self, preevents, events): - return detect_vulnerabilities(preevents, events, self.process, self.mm) - + return detect_vulnerabilities(preevents, events, self.process, self.mm) def createProcess(self, cmd, envs, no_stdout): @@ -236,15 +262,17 @@ def createProcess(self, cmd, envs, no_stdout): is_attached = True try: - #print "initial processes:" - #for p in self.debugger: + # print "initial processes:" + # for p in self.debugger: # print "p:", p - #print "end processes" + # print "end processes" return self.debugger.addProcess(self.pid, is_attached=is_attached) - except (ProcessExit, PtraceError), err: + except (ProcessExit, PtraceError) as err: if isinstance(err, PtraceError) \ - and err.errno == EPERM: - error("ERROR: You are not allowed to trace process %s (permission denied or process already traced)" % self.pid) + and err.errno == EPERM: + error( + "ERROR: You are not allowed to trace process %s (permission denied or process already traced)" % + self.pid) else: error("ERROR: Process can no be attached! %s" % err) return None @@ -279,13 +307,12 @@ def cont(self, signum=None): signal = self.debugger.waitSignals() process = signal.process events = self.createEvents(signal) - + #vulns = self.DetectVulnerabilities(self.events, events) - #print "vulns detected" - self.events = self.events + events #+ vulns + # print "vulns detected" + self.events = self.events + events # + vulns #self.nevents = self.nevents + len(events) - def readInstrSize(self, address, default_size=None): if not HAS_DISASSEMBLER: return default_size @@ -293,7 +320,7 @@ def readInstrSize(self, address, default_size=None): # Get address and size of instruction at specified address instr = self.process.disassembleOne(address) return instr.size - except PtraceError, err: + except PtraceError as err: warning("Warning: Unable to read instruction size at %s: %s" % ( formatAddress(address), err)) return default_size @@ -304,7 +331,7 @@ def breakpoint(self, address): size = self.readInstrSize(address) try: bp = self.process.createBreakpoint(address, size) - except PtraceError, err: + except PtraceError as err: return "Unable to set breakpoint at %s: %s" % ( formatAddress(address), err) #error("New breakpoint: %s" % bp) @@ -312,14 +339,14 @@ def breakpoint(self, address): def runProcess(self, cmd): - #print "Running", cmd + # print "Running", cmd signal(SIGALRM, alarm_handler) - #if self.pid is None: + # if self.pid is None: # timeout = 20*self.timeout - #else: - timeout = 10*self.timeout + # else: + timeout = 10 * self.timeout alarm(timeout) @@ -328,18 +355,18 @@ def runProcess(self, cmd): self.process = self.createProcess(cmd, self.envs, self.no_stdout) self.process.no_frame_pointer = self.elf.no_frame_pointer #self.mm = MemoryMaps(self.program, self.pid) - #print self.mm + # print self.mm self.crashed = False - except ChildError, err: + except ChildError as err: print "a" writeError(getLogger(), err, "Unable to create child process") return - except OSError, err: + except OSError as err: print "b" writeError(getLogger(), err, "Unable to create child process") return - except IOError, err: + except IOError as err: print "c" writeError(getLogger(), err, "Unable to create child process") return @@ -347,75 +374,72 @@ def runProcess(self, cmd): if not self.process: return - # Set the breakpoints 
self.breakpoint(self.elf.GetEntrypoint()) - #print hex(self.elf.GetEntrypoint()) + # print hex(self.elf.GetEntrypoint()) try: - while True: - - #self.cont() - #if self.nevents > self.max_events: - # - # self.events.append(Timeout(timeout)) - # alarm(0) - # return - if not self.debugger or self.crashed: - # There is no more process: quit - alarm(0) - return - else: - self.cont() - - #alarm(0) - #except PtraceError: - #print "deb:",self.debugger, "crash:", self.crashed - #print "PtraceError" - #alarm(0) - #return - - except ProcessExit, event: - alarm(0) - self.events.append(Exit(event.exitcode)) - return + while True: + + # self.cont() + # if self.nevents > self.max_events: + # + # self.events.append(Timeout(timeout)) + # alarm(0) + # return + if not self.debugger or self.crashed: + # There is no more process: quit + alarm(0) + return + else: + self.cont() + + # alarm(0) + # except PtraceError: + # print "deb:",self.debugger, "crash:", self.crashed + # print "PtraceError" + # alarm(0) + # return + + except ProcessExit as event: + alarm(0) + self.events.append(Exit(event.exitcode)) + return except OSError: - alarm(0) - self.events.append(Timeout(timeout)) - self.timeouts += 1 - return + alarm(0) + self.events.append(Timeout(timeout)) + self.timeouts += 1 + return except IOError: - alarm(0) - self.events.append(Timeout(timeout)) - self.timeouts += 1 - return + alarm(0) + self.events.append(Timeout(timeout)) + self.timeouts += 1 + return except TimeoutEx: - self.events.append(Timeout(timeout)) - return - - + self.events.append(Timeout(timeout)) + return def getData(self, inputs): self.events = [] self.nevents = dict() self.debugger = PtraceDebugger() - self.runProcess([self.program]+inputs) - #print self.pid + self.runProcess([self.program] + inputs) + # print self.pid - #if self.crashed: + # if self.crashed: # print "we should terminate.." 
- #sleep(3) + # sleep(3) if self.process is None: - return None + return None self.process.terminate() self.process.detach() - #print self.nevents + # print self.nevents self.process = None return self.events diff --git a/vdiscover/RandomWalk.py b/vdiscover/RandomWalk.py index 7f6fc11..a4881cd 100644 --- a/vdiscover/RandomWalk.py +++ b/vdiscover/RandomWalk.py @@ -23,202 +23,207 @@ import csv import re -from ELF import ELF +from ELF import ELF from Spec import specs from Misc import readmodfile -def RandomWalkElf(program, outfile, mclass, max_subtraces, max_explored_subtraces, min_size): +def RandomWalkElf( + program, + outfile, + mclass, + max_subtraces, + max_explored_subtraces, + min_size): - csvwriter = csv.writer(open(outfile, "a+"), delimiter='\t') - elf = ELF(program) + csvwriter = csv.writer(open(outfile, "a+"), delimiter='\t') + elf = ELF(program) - # plt is inverted - inv_plt = dict() + # plt is inverted + inv_plt = dict() - for func, addr in elf.plt.items(): - if func in specs: # external functions are discarded - inv_plt[addr] = func + for func, addr in elf.plt.items(): + if func in specs: # external functions are discarded + inv_plt[addr] = func - elf.plt = inv_plt + elf.plt = inv_plt - cond_control_flow_ins = ["jo", "jno", "js", "jns", "je", - "jz","jnz", "jb", "jnae", "jc", - "jnb", "jae", "jnc", "jbe", "jna", - "ja", "jnbe", "jl", "jnge", "jge", - "jnl", "jle", "jng", "jg", "jnle", - "jp", "jpe", "jnp", "jpo", "jcxz", "jecxz"] + cond_control_flow_ins = ["jo", "jno", "js", "jns", "je", + "jz", "jnz", "jb", "jnae", "jc", + "jnb", "jae", "jnc", "jbe", "jna", + "ja", "jnbe", "jl", "jnge", "jge", + "jnl", "jle", "jng", "jg", "jnle", + "jp", "jpe", "jnp", "jpo", "jcxz", "jecxz"] - ncond_control_flow_ins = ["ret","jmp","call", "retq","jmp","callq"] + ncond_control_flow_ins = ["ret", "jmp", "call", "retq", "jmp", "callq"] - control_flow_ins = cond_control_flow_ins + ncond_control_flow_ins + control_flow_ins = cond_control_flow_ins + ncond_control_flow_ins - raw_inss = elf.GetRawInss() - useful_inss_list = [] - useful_inss_dict = dict() - libc_calls = [] - labels = dict() + raw_inss = elf.GetRawInss() + useful_inss_list = [] + useful_inss_dict = dict() + libc_calls = [] + labels = dict() - #print sys.argv[1]+"\t", - #rclass = str(1) + # print sys.argv[1]+"\t", + #rclass = str(1) - for i,ins in enumerate(raw_inss.split("\n")): + for i, ins in enumerate(raw_inss.split("\n")): - # prefix removal - ins = ins.replace("repz ","") - ins = ins.replace("rep ","") + # prefix removal + ins = ins.replace("repz ", "") + ins = ins.replace("rep ", "") - pins = ins.split("\t") - #print pins - ins_addr = pins[0].replace(":","").replace(" ","") - #print pins,ins_addr + pins = ins.split("\t") + # print pins + ins_addr = pins[0].replace(":", "").replace(" ", "") + # print pins,ins_addr - if len(pins) == 1 and ">" in ins: #label - #print ins - #assert(0) - x = pins[0].split(" ") + if len(pins) == 1 and ">" in ins: # label + # print ins + # assert(0) + x = pins[0].split(" ") - ins_addr = x[0] + ins_addr = x[0] - y = [i,ins_addr, None, None] - useful_inss_dict[ins_addr] = y - useful_inss_list.append(y) + y = [i, ins_addr, None, None] + useful_inss_dict[ins_addr] = y + useful_inss_list.append(y) - #print "label:",y + # print "label:",y - elif any(map( lambda x: x in ins, control_flow_ins)) and len(pins) == 3: # control flow instruction - #print pins - x = pins[2].split(" ") + elif any(map(lambda x: x in ins, control_flow_ins)) and len(pins) == 3: # control flow instruction + # print pins + x = 
pins[2].split(" ") - ins_nme = x[0] - ins_jaddr = x[-2] + ins_nme = x[0] + ins_jaddr = x[-2] - #if ("" == ins_jaddr): - # print pins - #print x - #print ins_nme, ins_jaddr - y = [i, ins_addr, ins_nme, ins_jaddr] + # if ("" == ins_jaddr): + # print pins + # print x + # print ins_nme, ins_jaddr + y = [i, ins_addr, ins_nme, ins_jaddr] - useful_inss_dict[ins_addr] = y - useful_inss_list.append(y) + useful_inss_dict[ins_addr] = y + useful_inss_list.append(y) - if "call" in pins[2]: - if ins_jaddr <> '': - func_addr = int(ins_jaddr,16) - if func_addr in elf.plt: - libc_calls.append(i) + if "call" in pins[2]: + if ins_jaddr != '': + func_addr = int(ins_jaddr, 16) + if func_addr in elf.plt: + libc_calls.append(i) - else: # all other instructions - y = [i, ins_addr, None, None] + else: # all other instructions + y = [i, ins_addr, None, None] - useful_inss_dict[ins_addr] = y - useful_inss_list.append(y) + useful_inss_dict[ins_addr] = y + useful_inss_list.append(y) - #print useful_inss_list - max_inss = len(useful_inss_list) - traces = set() - collected_traces = "" + # print useful_inss_list + max_inss = len(useful_inss_list) + traces = set() + collected_traces = "" - # exploration time! - for _ in range(max_explored_subtraces): + # exploration time! + for _ in range(max_explored_subtraces): - # resuling (sub)trace - r = "" - # starting point - i = random.choice(libc_calls) - j = 0 + # resuling (sub)trace + r = "" + # starting point + i = random.choice(libc_calls) + j = 0 - #r = elf.path+"\t" - r = "" + #r = elf.path+"\t" + r = "" - while True: + while True: - # last instruction case - if (i+j) == max_inss: - break + # last instruction case + if (i + j) == max_inss: + break - _,ins_addr,ins_nme,ins_jaddr = useful_inss_list[i+j] + _, ins_addr, ins_nme, ins_jaddr = useful_inss_list[i + j] - #print i+j,ins_nme, ins_jaddr + # print i+j,ins_nme, ins_jaddr - if ins_nme in ['call', 'callq']: # ordinary call - #"addr", ins_jaddr + if ins_nme in ['call', 'callq']: # ordinary call + #"addr", ins_jaddr - if ins_jaddr == '': - break # parametric jmp, similar to ret for us + if ins_jaddr == '': + break # parametric jmp, similar to ret for us - ins_jaddr = int(ins_jaddr,16) - if ins_jaddr in elf.plt: - r = r + " " + elf.plt[ins_jaddr] - if elf.plt[ins_jaddr] == "exit": - break - else: + ins_jaddr = int(ins_jaddr, 16) + if ins_jaddr in elf.plt: + r = r + " " + elf.plt[ins_jaddr] + if elf.plt[ins_jaddr] == "exit": + break + else: - if ins_jaddr in useful_inss_dict: - #assert(0) - #r = r + " " + hex(ins_jaddr) - i,_,_,_ = useful_inss_dict[ins_jaddr] - j = 0 - continue + if ins_jaddr in useful_inss_dict: + # assert(0) + #r = r + " " + hex(ins_jaddr) + i, _, _, _ = useful_inss_dict[ins_jaddr] + j = 0 + continue - else: - pass # ignored call + else: + pass # ignored call - elif ins_nme in ['ret','retq']: - break - else: - pass - #print i+j,ins_nme, ins_jaddr + elif ins_nme in ['ret', 'retq']: + break + else: + pass + # print i+j,ins_nme, ins_jaddr - #print j - if ins_nme == 'jmp' : + # print j + if ins_nme == 'jmp': - if ins_jaddr in elf.plt: # call equivalent using jmp - r = r + " " + elf.plt[jaddr] + if ins_jaddr in elf.plt: # call equivalent using jmp + r = r + " " + elf.plt[jaddr] - else: + else: - if ins_jaddr == '': - break # parametric jmp, similar to ret for us + if ins_jaddr == '': + break # parametric jmp, similar to ret for us - ins_jaddr = int(ins_jaddr,16) - if ins_jaddr in useful_inss_dict: - #r = r + " " + hex(ins_jaddr) - i,_,_,_ = useful_inss_dict[ins_jaddr] - j = 0 - continue + ins_jaddr = 
int(ins_jaddr, 16) + if ins_jaddr in useful_inss_dict: + #r = r + " " + hex(ins_jaddr) + i, _, _, _ = useful_inss_dict[ins_jaddr] + j = 0 + continue - else: - pass # ignored call + else: + pass # ignored call + if ins_nme in cond_control_flow_ins: - if ins_nme in cond_control_flow_ins: + assert(ins_jaddr is not None) - assert(ins_jaddr <> None) + cond = random.randint(0, 1) - cond = random.randint(0,1) + if cond == 1: - if cond == 1: + i, _, _, _ = useful_inss_dict[ins_jaddr] + j = 0 + continue - i,_,_,_ = useful_inss_dict[ins_jaddr] - j = 0 - continue + j = j + 1 - j = j + 1 + #r = r + "\t"+rclass + x = hash(r) + size = len(r.split(" ")) - 1 - #r = r + "\t"+rclass - x = hash(r) - size = len(r.split(" "))-1 + # if x not in traces and size >= min_size: + # print r+" .", + collected_traces = collected_traces + r + " ." + # traces.add(x) + # if len(traces) >= max_subtraces: + # break - #if x not in traces and size >= min_size: - #print r+" .", - collected_traces = collected_traces + r + " ." - #traces.add(x) - #if len(traces) >= max_subtraces: - # break + row = [elf.path, collected_traces] + if mclass is not None: + row.append(mclass) - row = [elf.path, collected_traces] - if mclass is not None: - row.append(mclass) - - csvwriter.writerow(row) + csvwriter.writerow(row) diff --git a/vdiscover/Recall.py b/vdiscover/Recall.py index fe0396b..aabdf96 100644 --- a/vdiscover/Recall.py +++ b/vdiscover/Recall.py @@ -8,63 +8,58 @@ from Utils import * -def Recall(model_file, in_file, in_type, out_file, test_mode, probability=False): - model = load_model(model_file) - csvwriter = write_csv(out_file) +def Recall( + model_file, + in_file, + in_type, + out_file, + test_mode, + probability=False): - x = dict() + model = loadModel(model_file) + csvwriter = writeCSV(out_file) - testcases, features, test_classes = read_traces(in_file, None, cut=None) - x[in_type] = features + x = dict() - if probability: - predicted_classes = map(lambda x: x[1], model.predict_proba(x)) # probability of the second class - else: - predicted_classes = model.predict(x) + testcases, features, test_classes = readTraces(in_file, None, cut=None) + x[in_type] = features - for testcase,y in zip(testcases,predicted_classes): - csvwriter.writerow([testcase,y]) - - if test_mode == "simple": - nclasses = len(set(test_classes)) - one_class = int(test_classes[0]) - - if nclasses == 1: - err = [None, None] - err[one_class] = recall_score(test_classes, predicted_classes, average=None)[one_class] - err[1 - one_class] = err[one_class] + if probability: + # probability of the second class + predicted_classes = map(lambda x: x[1], model.predict_proba(x)) else: - err = recall_score(test_classes, predicted_classes, average=None) - - print classification_report(test_classes, predicted_classes) - print "Accuracy per class:", round(err[0],2), round(err[1],2) - print "Average accuracy:", round(sum(err)/2.0,2) - - elif test_mode == "aggregated": - - - #print len(testcases), len(predicted_classes), len(test_classes) - prog_pred = dict() + predicted_classes = map(str, model.predict(x)) + #predicted_classes = model.predict(x) - for (program, predicted, real) in zip(testcases, predicted_classes, test_classes): - prog_pred[program] = prog_pred.get(program,[]) + [abs(predicted-real)] + for testcase, y in zip(testcases, predicted_classes): + csvwriter.writerow([testcase, y]) - print round(numpy.mean(map(numpy.mean, prog_pred.values())),2) + if test_mode == "simple": + nclasses = len(set(test_classes)) + one_class = int(test_classes[0]) - # BROKEN! 
- #prog_classes = dict() - #for prog,cl in zip(testcases, test_classes): - # prog_classes[prog] = cl + if nclasses == 1: + err = [None, None] + err[one_class] = recall_score( + test_classes, predicted_classes, average=None)[one_class] + err[1 - one_class] = err[one_class] + else: + err = recall_score(test_classes, predicted_classes, average=None) - #prog_pred = dict(zip(prog_classes.keys(), [[]]*len(prog_classes))) - #for prog, pred in zip(testcases,predicted_classes): - # prog_pred[prog].append(abs(pred - prog_classes[prog])) + print classification_report(test_classes, predicted_classes) + print "Accuracy per class:", round(err[0], 2), round(err[1], 2) + print "Average accuracy:", round(sum(err) / 2.0, 2) - #errors = [] - #for prog, preds in prog_pred.items(): - # errors.append(sum(preds)/float(len(preds))) + elif test_mode == "aggregated": - #print sum(errors) / float(len(errors)) + # print len(testcases), len(predicted_classes), len(test_classes) + prog_pred = dict() + for (program, predicted, real) in zip( + testcases, predicted_classes, test_classes): + predicted,real = int(predicted), int(real) + prog_pred[program] = prog_pred.get( + program, []) + [abs(predicted - real)] + print round(numpy.mean(map(numpy.mean, prog_pred.values())), 2) diff --git a/vdiscover/Run.py b/vdiscover/Run.py index 95ba6ba..cda4293 100644 --- a/vdiscover/Run.py +++ b/vdiscover/Run.py @@ -19,7 +19,6 @@ """ - #from ptrace.debugger.child import createChild from os import system, dup2, close, open as fopen, O_RDONLY from sys import stdin @@ -38,6 +37,7 @@ class ChildError(RuntimeError): pass + def _execChild(arguments, no_stdout, env): if no_stdout: try: @@ -45,7 +45,7 @@ def _execChild(arguments, no_stdout, env): dup2(null.fileno(), 1) dup2(1, 2) null.close() - except IOError, err: + except IOError as err: close(2) close(1) try: @@ -53,9 +53,10 @@ def _execChild(arguments, no_stdout, env): execve(arguments[0], arguments, env) else: execv(arguments[0], arguments) - except Exception, err: + except Exception as err: raise ChildError(str(err)) + def createChild(arguments, no_stdout, env=None): """ Create a child process: @@ -73,66 +74,66 @@ def createChild(arguments, no_stdout, env=None): if pid: return pid else: - #print "limit",getrlimit(RLIMIT_DATA) - setrlimit(RLIMIT_AS, (1024*1024*1024, -1)) - #print "limit",getrlimit(RLIMIT_DATA) + # print "limit",getrlimit(RLIMIT_DATA) + setrlimit(RLIMIT_AS, (1024 * 1024 * 1024, -1)) + # print "limit",getrlimit(RLIMIT_DATA) try: - ptrace_traceme() - except PtraceError, err: - raise ChildError(str(err)) + ptrace_traceme() + except PtraceError as err: + raise ChildError(str(err)) _execChild(arguments, no_stdout, env) exit(255) def Launch(cmd, no_stdout, env): - global fds - global c - c = c + 1 - #cmd = ["/usr/bin/timeout", "-k", "1", "3"]+cmd - #print cmd - if cmd[-1][0:2] == "< ": - filename = cmd[-1].replace("< ", "") + global fds + global c + c = c + 1 + #cmd = ["/usr/bin/timeout", "-k", "1", "3"]+cmd + # print cmd + if cmd[-1][0:2] == "< ": + filename = cmd[-1].replace("< ", "") - #try: - # close(3) - #except OSError: - # print "OsError!" - # pass + # try: + # close(3) + # except OSError: + # print "OsError!" + # pass - for fd in fds: - #print fd, - try: - close(fd) - #print "closed!" - except OSError: - #print "failed close!" - pass + for fd in fds: + # print fd, + try: + close(fd) + # print "closed!" + except OSError: + # print "failed close!" 
+ pass - fds = [] + fds = [] - desc = fopen(filename,O_RDONLY) - fds.append(desc) - dup2(desc, stdin.fileno()) - fds.append(desc) - #close(desc) + desc = fopen(filename, O_RDONLY) + fds.append(desc) + dup2(desc, stdin.fileno()) + fds.append(desc) + # close(desc) - cmd = cmd[:-1] + cmd = cmd[:-1] - #print "c:", c - #print "self pid", getpid() + # print "c:", c + # print "self pid", getpid() - r = createChild(cmd, no_stdout, env) + r = createChild(cmd, no_stdout, env) - #print "new pid", r - #print "self pid", getpid() - #print "Done!" + # print "new pid", r + # print "self pid", getpid() + # print "Done!" - return r + return r -#class Runner: +# class Runner: # def __init__(self, cmd, timeout): # #threading.Thread.__init__(self) # diff --git a/vdiscover/Sampling.py b/vdiscover/Sampling.py index 23d5ac4..c8ef251 100644 --- a/vdiscover/Sampling.py +++ b/vdiscover/Sampling.py @@ -20,22 +20,23 @@ import random import copy + def cluster_sampler(clustered_traces, n_per_cluster): - #cc = copy.copy(clusters) - #n_per_cluster = 1#n / len(cc) - clusters = dict() - for label, cluster in clustered_traces: - clusters[cluster] = clusters.get(cluster, []) + [label.split(":")[-1]] - - selected = set() - tmp = set() - - for (cluster, seeds) in clusters.items(): - n_sample = min(len(seeds), n_per_cluster) - tmp = set(seeds).intersection(selected) - if len(tmp) >= n_sample: - selected.update(set(random.sample(tmp, n_sample))) - else: - selected.update(set(random.sample(seeds, n_sample))) - - return selected + #cc = copy.copy(clusters) + # n_per_cluster = 1#n / len(cc) + clusters = dict() + for label, cluster in clustered_traces: + clusters[cluster] = clusters.get(cluster, []) + [label.split(":")[-1]] + + selected = set() + tmp = set() + + for (cluster, seeds) in clusters.items(): + n_sample = min(len(seeds), n_per_cluster) + tmp = set(seeds).intersection(selected) + if len(tmp) >= n_sample: + selected.update(set(random.sample(tmp, n_sample))) + else: + selected.update(set(random.sample(seeds, n_sample))) + + return selected diff --git a/vdiscover/Spec.py b/vdiscover/Spec.py index c80455d..054a147 100644 --- a/vdiscover/Spec.py +++ b/vdiscover/Spec.py @@ -21,7 +21,7 @@ realpath = os.path.dirname(os.path.realpath(__file__)) datadir = "data/" -f = open(realpath+"/"+datadir+"prototypes.conf") +f = open(realpath + "/" + datadir + "prototypes.conf") specs = dict() for raw_spec in f.readlines(): @@ -30,12 +30,12 @@ raw_spec = raw_spec.replace(" (", "(") raw_spec = raw_spec.replace(" ", " ") raw_spec = raw_spec.replace(" ", " ") - if raw_spec <> "" and raw_spec[0] <> ";" and (not "SYS_" in raw_spec): + if raw_spec != "" and raw_spec[0] != ";" and (not "SYS_" in raw_spec): x = raw_spec.split(" ") ret = x[0] x = x[1].split("(") name = x[0] - param_types = x[1].replace(");", "").split(",") + param_types = x[1].replace(");", "").split(",") specs[name] = [ret] + param_types -#print specs +# print specs diff --git a/vdiscover/Train.py b/vdiscover/Train.py index 3bc9a13..d177844 100644 --- a/vdiscover/Train.py +++ b/vdiscover/Train.py @@ -23,133 +23,145 @@ from Pipeline import * from sklearn.metrics import confusion_matrix -def TrainScikitLearn(model_file, train_file, valid_file, ftype, nsamples): - #csvreader = open_csv(train_file) - modelfile = open_model(model_file) - train_programs, train_features, train_classes = read_traces(train_file, nsamples, cut=None) - print "using", len(train_features),"examples to train." 
+def TrainScikitLearn(model_file, train_file, valid_file, vtype, ftype, nsamples): - train_dict = dict() - train_dict[ftype] = train_features + modelfile = openModel(model_file) + train_programs, train_features, train_classes = readTraces( + train_file, nsamples, cut=None) + print "using", len(train_features), "examples to train." - print "Transforming data and fitting model.." - model = make_train_pipeline(ftype) - model.fit(train_dict,train_classes) + train_dict = dict() + train_dict[ftype] = train_features - print "Done!" - #print model - #print confusion_matrix(train_classes, model.predict(train_dict)) + print "Transforming data and fitting model.." - print "Saving model to",model_file - modelfile.write(pickle.dumps(model)) + if vtype == "bow": + model = makeTrainPipelineBOW(ftype) -def TrainKeras(model_file, train_file, valid_file, ftype, nsamples): - - csvreader = open_csv(train_file) - modelfile = open_model(model_file) - - train_features = [] - train_programs = [] - train_classes = [] - - print "Reading and sampling data to train..", - if nsamples is None: - for i,(program, features, cl) in enumerate(csvreader): - train_programs.append(program) - train_features.append(features) - train_classes.append(int(cl)) - else: - - train_size = file_len(in_file) - skip_until = random.randint(0,train_size - nsamples) - - for i,(program, features, cl) in enumerate(csvreader): - - if i < skip_until: - continue - elif i - skip_until == nsamples: - break - - train_programs.append(program) - train_features.append(features) - train_classes.append(int(cl)) - train_size = len(train_features) - - assert(train_size == len(train_classes)) - - print "using", train_size,"examples to train." - - train_dict = dict() - train_dict[ftype] = train_features - batch_size = 16 - window_size = 25 - - from keras.preprocessing.text import Tokenizer + model.fit(train_dict, train_classes) - tokenizer = Tokenizer(nb_words=None, filters="", lower=False, split=" ") - #print type(train_features[0]) - tokenizer.fit_on_texts(train_features) - max_features = len(tokenizer.word_counts) + print "Done!" + # print model + # print confusion_matrix(train_classes, model.predict(train_dict)) - preprocessor = KerasPreprocessor(tokenizer, window_size, batch_size) + print "Saving model to", model_file + modelfile.write(pickle.dumps(model)) - if valid_file is not None: - csvreader = open_csv(valid_file) - valid_features = [] - valid_programs = [] - valid_classes = [] - - print "Reading data to valid..", - for i,(program, features, cl) in enumerate(csvreader): - valid_programs.append(program) - valid_features.append(features) - valid_classes.append(int(cl)) - - print "using", len(train_features),"examples to valid." - #X_valid,y_valid = preprocessor.preprocess(valid_features, valid_classes) - else: - valid_features,train_features = train_features[0:int(0.1*train_size)], train_features[int(0.1*train_size):] - valid_classes,train_classes = train_classes[0:int(0.1*train_size)], train_classes[int(0.1*train_size):] - - X_valid,y_valid = preprocessor.preprocess(valid_features, valid_classes, 500) - X_train,y_train = preprocessor.preprocess(train_features, train_classes, 10000) - - from keras.models import Sequential - from keras.layers.core import Dense, Dropout, Activation - from keras.layers.embeddings import Embedding - from keras.layers.recurrent import LSTM, GRU - from keras.optimizers import Adam - - print "Creating and compiling a LSTM.." 
- model = Sequential() - model.add(Embedding(max_features, 10)) - model.add(LSTM(10, 32)) - model.add(Dropout(0.50)) - model.add(Dense(32, 1)) - model.add(Activation('sigmoid')) - - # try using different optimizers and different optimizer config - opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, kappa=1-1e-8) - model.compile(loss='binary_crossentropy', optimizer=opt, class_mode="binary") - #model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=30, validation_data = (X_valid,y_valid), show_accuracy=True) - model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=5, show_accuracy=True) - - print "Saving model to",model_file - - modelfile.write(pickle.dumps(KerasPredictor(preprocessor,model,ftype))) +""" +def TrainKeras(model_file, train_file, valid_file, ftype, nsamples): + csvreader = open_csv(train_file) + modelfile = open_model(model_file) + + train_features = [] + train_programs = [] + train_classes = [] + + print "Reading and sampling data to train..", + if nsamples is None: + for i, (program, features, cl) in enumerate(csvreader): + train_programs.append(program) + train_features.append(features) + train_classes.append(int(cl)) + else: + + train_size = file_len(in_file) + skip_until = random.randint(0, train_size - nsamples) + + for i, (program, features, cl) in enumerate(csvreader): + + if i < skip_until: + continue + elif i - skip_until == nsamples: + break + + train_programs.append(program) + train_features.append(features) + train_classes.append(int(cl)) + train_size = len(train_features) + + assert(train_size == len(train_classes)) + + print "using", train_size, "examples to train." + + train_dict = dict() + train_dict[ftype] = train_features + batch_size = 16 + window_size = 25 + + from keras.preprocessing.text import Tokenizer + + tokenizer = Tokenizer(nb_words=None, filters="", lower=False, split=" ") + # print type(train_features[0]) + tokenizer.fit_on_texts(train_features) + max_features = len(tokenizer.word_counts) + + preprocessor = KerasPreprocessor(tokenizer, window_size, batch_size) + + if valid_file is not None: + csvreader = open_csv(valid_file) + + valid_features = [] + valid_programs = [] + valid_classes = [] + + print "Reading data to valid..", + for i, (program, features, cl) in enumerate(csvreader): + valid_programs.append(program) + valid_features.append(features) + valid_classes.append(int(cl)) + + print "using", len(train_features), "examples to valid." + #X_valid,y_valid = preprocessor.preprocess(valid_features, valid_classes) + else: + valid_features, train_features = train_features[ + 0:int(0.1 * train_size)], train_features[int(0.1 * train_size):] + valid_classes, train_classes = train_classes[ + 0:int(0.1 * train_size)], train_classes[int(0.1 * train_size):] + + X_valid, y_valid = preprocessor.preprocess( + valid_features, valid_classes, 500) + X_train, y_train = preprocessor.preprocess( + train_features, train_classes, 10000) + + from keras.models import Sequential + from keras.layers.core import Dense, Dropout, Activation + from keras.layers.embeddings import Embedding + from keras.layers.recurrent import LSTM, GRU + from keras.optimizers import Adam + + print "Creating and compiling a LSTM.." 
+ model = Sequential() + model.add(Embedding(max_features, 10)) + model.add(LSTM(10, 32)) + model.add(Dropout(0.50)) + model.add(Dense(32, 1)) + model.add(Activation('sigmoid')) + + # try using different optimizers and different optimizer config + opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, + epsilon=1e-8, kappa=1 - 1e-8) + model.compile(loss='binary_crossentropy', + optimizer=opt, class_mode="binary") + #model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=30, validation_data = (X_valid,y_valid), show_accuracy=True) + model.fit(X_train, y_train, batch_size=batch_size, + nb_epoch=5, show_accuracy=True) + + print "Saving model to", model_file + + modelfile.write(pickle.dumps(KerasPredictor(preprocessor, model, ftype))) +""" -def Train(model_file, train_file, valid_file, ttype, ftype, nsamples): - if ttype == "rf": - TrainScikitLearn(model_file, train_file, valid_file, ftype, nsamples) +def Train(model_file, train_file, valid_file, model_type, vector_type, feature_type, nsamples): - elif ttype == "lstm": - try: - import keras - except: - print "Failed to import keras modules to perform LSTM training" - return - TrainKeras(model_file, train_file, valid_file, ftype, nsamples) + TrainScikitLearn(model_file, train_file, valid_file, vector_type, feature_type, nsamples) + #elif ttype == "lstm": + # try: + # import keras + # except: + # print "Failed to import keras modules to perform LSTM training" + # return + # TrainKeras(model_file, train_file, valid_file, ftype, nsamples) diff --git a/vdiscover/Types.py b/vdiscover/Types.py index e7d87ef..f7065b4 100644 --- a/vdiscover/Types.py +++ b/vdiscover/Types.py @@ -19,64 +19,71 @@ import copy + class Type: - def __init__(self, name, size, index = None): - self.name = str(name) - self.size_in_bytes = size - self.index = index - - def __str__(self): - - r = str(self.name) - if (self.index <> None): - r = r +"("+str(self.index)+")" - - return r - - def getSize(self): - return self.size_in_bytes - - #def copy(self): - # return copy.copy(self) - -ptypes = [Type("Num32", 4, None) , - Type("Ptr32", 4, None) , # Generic pointer - Type("SPtr32", 4, None), # Stack pointer - Type("HPtr32", 4, None), # Heap pointer - Type("GxPtr32", 4, None), # Global eXecutable pointer - Type("FPtr32", 4, None), # File pointer - Type("NPtr32", 4, None), # NULL pointer - Type("DPtr32", 4, None), # Dangling pointer - Type("GPtr32", 4, None), # Global pointer + + def __init__(self, name, size, index=None): + self.name = str(name) + self.size_in_bytes = size + self.index = index + + def __str__(self): + + r = str(self.name) + if (self.index is not None): + r = r + "(" + str(self.index) + ")" + + return r + + def getSize(self): + return self.size_in_bytes + + # def copy(self): + # return copy.copy(self) + +ptypes = [Type("Num32", 4, None), + Type("Ptr32", 4, None), # Generic pointer + Type("SPtr32", 4, None), # Stack pointer + Type("HPtr32", 4, None), # Heap pointer + Type("GxPtr32", 4, None), # Global eXecutable pointer + Type("FPtr32", 4, None), # File pointer + Type("NPtr32", 4, None), # NULL pointer + Type("DPtr32", 4, None), # Dangling pointer + Type("GPtr32", 4, None), # Global pointer Type("Top32", 4, None) ] -for i in range(0,33,8): - ptypes.append(Type("Num32B"+str(i), 4, None)) +for i in range(0, 33, 8): + ptypes.append(Type("Num32B" + str(i), 4, None)) -num32_ptypes = filter(lambda t: "Num32" in str(t), ptypes) -ptr32_ptypes = ptypes[1:9] +num32_ptypes = filter(lambda t: "Num32" in str(t), ptypes) +ptr32_ptypes = ptypes[1:9] generic_ptypes = [Type("Top32", 4, 
None)] + def isNum(ptype): - return ptype in ["int", "ulong", "long", "char"] + return ptype in ["int", "ulong", "long", "char"] + def isPtr(ptype): - return "addr" in ptype or "*" in ptype or "string" in ptype or "format" in ptype or "file" in ptype + return "addr" in ptype or "*" in ptype or "string" in ptype or "format" in ptype or "file" in ptype + def isVoid(ptype): - return ptype == "void" + return ptype == "void" + def isNull(val): - return val == "0x0" or val == "0" + return val == "0x0" or val == "0" + def GetPtype(ptype): - if isPtr(ptype): - return Type("Ptr32", 4) - elif isNum(ptype): - return Type("Num32", 4) - elif isVoid(ptype): - return Type("Top32", 4) - else: - return Type("Top32", 4) + if isPtr(ptype): + return Type("Ptr32", 4) + elif isNum(ptype): + return Type("Num32", 4) + elif isVoid(ptype): + return Type("Top32", 4) + else: + return Type("Top32", 4) diff --git a/vdiscover/Utils.py b/vdiscover/Utils.py index 91ecf48..cfa6780 100644 --- a/vdiscover/Utils.py +++ b/vdiscover/Utils.py @@ -26,7 +26,7 @@ def update_progress(progress): - barLength = 30 # Modify this to change the length of the progress bar + barLength = 30 # Modify this to change the length of the progress bar status = "" if isinstance(progress, int): progress = float(progress) @@ -39,167 +39,177 @@ def update_progress(progress): if progress >= 1: progress = 1 status = "Done...\r\n" - block = int(round(barLength*progress)) - text = "\rPercent: [{0}] {1}% {2}".format( "#"*block + "-"*(barLength-block), progress*100, status) + block = int(round(barLength * progress)) + text = "\rPercent: [{0}] {1}% {2}".format( + "#" * block + "-" * (barLength - block), progress * 100, status) sys.stdout.write(text) sys.stdout.flush() -def file_len(fname): - if ".gz" in fname: - cat = "zcat" - else: - cat = "cat" +def getFileLength(fname): - p = subprocess.Popen(cat + " " + fname + " | wc -l", shell=True, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - result, err = p.communicate() - if p.returncode != 0: - raise IOError(err) - return int(result.strip().split()[0]) + if ".gz" in fname: + cat = "zcat" + else: + cat = "cat" -def load_csv(in_file): + p = subprocess.Popen( + cat + " " + fname + " | wc -l", + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + result, err = p.communicate() + if p.returncode != 0: + raise IOError(err) + return int(result.strip().split()[0]) - if ".gz" in in_file: - infile = gzip.open(in_file, "r") - else: - infile = open(in_file, "r") - return csv.reader(infile, delimiter='\t') +def loadCSV(in_file): -def write_csv(in_file): + if ".gz" in in_file: + infile = gzip.open(in_file, "r") + else: + infile = open(in_file, "r") - if ".gz" in in_file: - infile = gzip.open(in_file, "w") - else: - infile = open(in_file, "w") + return csv.reader(infile, delimiter='\t') - return csv.writer(infile, delimiter='\t') -def open_csv(in_file): +def writeCSV(in_file): - if ".gz" in in_file: - infile = gzip.open(in_file, "a+") - else: - infile = open(in_file, "a+") + if ".gz" in in_file: + infile = gzip.open(in_file, "w") + else: + infile = open(in_file, "w") - return csv.writer(infile, delimiter='\t') + return csv.writer(infile, delimiter='\t') -def load_model(model_file): - if ".pklz" in model_file: - modelfile = gzip.open(model_file,"r") - else: - modelfile = open(model_file,"r") +def openCSV(in_file): - model = pickle.load(gzip.open(model_file)) - return model + if ".gz" in in_file: + infile = gzip.open(in_file, "a+") + else: + infile = open(in_file, "a+") -def open_model(model_file): + return 
csv.writer(infile, delimiter='\t') - if ".pklz" in model_file: - modelfile = gzip.open(model_file,"w+") - else: - modelfile = open(model_file,"w+") - return modelfile +def loadModel(model_file): -def read_traces(train_file, nsamples, cut=None, maxsize=50): + if ".pklz" in model_file: + modelfile = gzip.open(model_file, "r") + else: + modelfile = open(model_file, "r") - if type(train_file) == str: - csvreader = load_csv(train_file) - elif type(train_file) == list: - csvreader = train_file - else: - assert(0) + model = pickle.load(gzip.open(model_file)) + return model - train_features = [] - train_programs = [] - train_classes = [] - #print "Reading and sampling data to train..", - if nsamples is None: - for i,col in enumerate(csvreader): +def openModel(model_file): - if len(col) < 2 or len(col) > 3: - print "Ignoring line", i, ":", "\t".join(col) - continue + if ".pklz" in model_file: + modelfile = gzip.open(model_file, "w+") + else: + modelfile = open(model_file, "w+") - program = col[0] - features = col[1] - if len(col) == 3: - cl = str(col[2]) #int(col[2]) - else: - cl = -1 + return modelfile - raw_trace = features[:-1] - trace = raw_trace.split(" ") - size = len(trace) - if cut is None or size < maxsize: +def readTraces(train_file, nsamples, cut=None, maxsize=50): - train_programs.append(program) - train_features.append(features) - train_classes.append(cl) - else: - for _ in range(cut): + if isinstance(train_file, str): + csvreader = loadCSV(train_file) + elif isinstance(train_file, list): + csvreader = train_file + else: + assert(0) - #start = random.randint(0,size/2) - #end = random.randint(size/2+1, size) - start = random.randint(0,size) - end = start + maxsize + train_features = [] + train_programs = [] + train_classes = [] - features = " ".join(trace[start:end+1]) + # print "Reading and sampling data to train..", + if nsamples is None: + for i, col in enumerate(csvreader): - train_programs.append(program) - train_features.append(features) - train_classes.append(cl) - else: + if len(col) < 2 or len(col) > 3: + print "Ignoring line", i, ":", "\t".join(col) + continue - if type(train_file) == str: - train_size = file_len(train_file) - elif type(train_file) == list: - train_size = len(csvreader) + program = col[0] + features = col[1] + if len(col) == 3: + cl = str(col[2]) # int(col[2]) + else: + cl = -1 - #train_size = file_len(train_file) - skip_until = random.randint(0,train_size - nsamples) + raw_trace = features[:-1] + trace = raw_trace.split(" ") + size = len(trace) - for i,col in enumerate(csvreader): + if cut is None or size < maxsize: - if i < skip_until: - continue - elif i - skip_until == nsamples: - break + train_programs.append(program) + train_features.append(features) + train_classes.append(cl) + else: + for _ in range(cut): - program = col[0] - features = col[1] - if len(col) > 2: - cl = int(col[2]) - else: - cl = -1 + #start = random.randint(0,size/2) + #end = random.randint(size/2+1, size) + start = random.randint(0, size) + end = start + maxsize - raw_trace = features[:-1] - trace = raw_trace.split(" ") - size = len(trace) + features = " ".join(trace[start:end + 1]) - if cut is None or size < maxsize: + train_programs.append(program) + train_features.append(features) + train_classes.append(cl) + else: - train_programs.append(program) - train_features.append(features) - train_classes.append(cl) - else: - for _ in range(cut): + if isinstance(train_file, str): + train_size = getFileLength(train_file) + elif isinstance(train_file, list): + train_size = len(csvreader) - 
#start = random.randint(0,size/2) - #end = random.randint(size/2+1, size) - start = random.randint(0,size-2) - end = start + random.randint(1,size-1) + #train_size = file_len(train_file) + skip_until = random.randint(0, train_size - nsamples) - features = " ".join(trace[start:end+1]) + for i, col in enumerate(csvreader): - train_programs.append(program) - train_features.append(features) - train_classes.append(cl) + if i < skip_until: + continue + elif i - skip_until == nsamples: + break + program = col[0] + features = col[1] + if len(col) > 2: + cl = int(col[2]) + else: + cl = -1 - return train_programs, train_features, train_classes + raw_trace = features[:-1] + trace = raw_trace.split(" ") + size = len(trace) + + if cut is None or size < maxsize: + + train_programs.append(program) + train_features.append(features) + train_classes.append(cl) + else: + for _ in range(cut): + + #start = random.randint(0,size/2) + #end = random.randint(size/2+1, size) + start = random.randint(0, size - 2) + end = start + random.randint(1, size - 1) + + features = " ".join(trace[start:end + 1]) + + train_programs.append(program) + train_features.append(features) + train_classes.append(cl) + + return train_programs, train_features, train_classes diff --git a/vdiscover/Vulnerabilities.py b/vdiscover/Vulnerabilities.py index 0835d11..7481878 100644 --- a/vdiscover/Vulnerabilities.py +++ b/vdiscover/Vulnerabilities.py @@ -17,53 +17,57 @@ Copyright 2014 by G.Grieco """ -from Event import Call, Crash, Abort, Exit, Signal, Vulnerability +from Event import Call, Crash, Abort, Exit, Signal, Vulnerability from Analysis import FindModule + def detect_vulnerabilities(preevents, events, process, mm): - r = [] + r = [] + + for (i, event) in enumerate(events): + r.append(detect_vulnerability(preevents, event, process, mm)) - for (i, event) in enumerate(events): - r.append(detect_vulnerability(preevents, event, process, mm)) + return filter(lambda e: e is not None, r) - return filter(lambda e: e is not None, r) def detect_vulnerability(preevents, event, process, mm): if isinstance(event, Call): - (name, args) = event.GetTypedName() - if name == "system" or name == "popen": - pass + (name, args) = event.GetTypedName() + if name == "system" or name == "popen": + pass elif isinstance(event, Abort): - if len(event.bt) > 0 and len(preevents) > 0: + if len(event.bt) > 0 and len(preevents) > 0: - if not (str(preevents[-1]) in ["free", "malloc", "realloc"]): - return None + if not (str(preevents[-1]) in ["free", "malloc", "realloc"]): + return None - for (typ, val) in event.bt: - module = FindModule(val, mm) - if module == "[vdso]": - pass - elif "libc-" in module: - assert(0) - return Vulnerability("MemoryCorruption") - else: - return None + for (typ, val) in event.bt: + module = FindModule(val, mm) + if module == "[vdso]": + pass + elif "libc-" in module: + assert(0) + return Vulnerability("MemoryCorruption") + else: + return None elif isinstance(event, Crash): - if str(event.fp_type[0]) == "DPtr32" and str(event.eip_type[0]) == "DPtr32": - return Vulnerability("StackCorruption") + if str( + event.fp_type[0]) == "DPtr32" and str( + event.eip_type[0]) == "DPtr32": + return Vulnerability("StackCorruption") - for (typ,val) in event.bt: - if str(typ) == "DPtr32": - return Vulnerability("StackCorruption") + for (typ, val) in event.bt: + if str(typ) == "DPtr32": + return Vulnerability("StackCorruption") elif isinstance(event, Signal): - pass + pass return None diff --git a/vdp b/vdp index e2a1be5..c5ee9f6 100755 --- a/vdp +++ b/vdp @@ 
-27,26 +27,36 @@ import random from subprocess import Popen, PIPE, STDOUT -from vdiscover.Detection import GetArgs, GetFiles, GetCmd -from vdiscover.Mutation import NullMutator, RandomByteMutator, RandomExpanderMutator, RandomInputMutator, opened_files -from vdiscover.Input import prepare_inputs +from vdiscover.Detection import GetArgs, GetFiles, GetCmd +from vdiscover.Mutation import NullMutator, RandomByteMutator, RandomExpanderMutator, RandomInputMutator, opened_files +from vdiscover.Input import prepare_inputs if __name__ == "__main__": # To help argparse to detect the number of columns correctly - #os.environ['COLUMNS'] = str(os.popen('stty size', 'r').read().split()[1]) #str(shutil.get_terminal_size().columns) + # os.environ['COLUMNS'] = str(os.popen('stty size', + # 'r').read().split()[1]) #str(shutil.get_terminal_size().columns) # Arguments - parser = argparse.ArgumentParser(description='Vulnerability Detection Procedure') - parser.add_argument("testcase", help="Testcase to analyze", type=str, default=None) - parser.add_argument("cmd", help="Testcase to analyze", type=str, default=None) - - parser.add_argument("--io-mode", - help="Input-Output mode", action="store_true", default=False) - - parser.add_argument("--seed-range", type=str, - help="A seed range to feed the cmd (io mode only)", default="0:0") + parser = argparse.ArgumentParser( + description='Vulnerability Detection Procedure') + parser.add_argument( + "testcase", help="Testcase to analyze", type=str, default=None) + parser.add_argument("cmd", help="Testcase to analyze", + type=str, default=None) + + parser.add_argument( + "--io-mode", + help="Input-Output mode", + action="store_true", + default=False) + + parser.add_argument( + "--seed-range", + type=str, + help="A seed range to feed the cmd (io mode only)", + default="0:0") parser.add_argument("--show-stdout", help="Don't use /dev/null as stdout/stderr", @@ -59,10 +69,9 @@ if __name__ == "__main__": parser.add_argument("--timeout", dest="timeout", type=int, help="Timeout in seconds (io mode only)", default=3) - options = parser.parse_args() - start_seed,stop_seed = tuple(options.seed_range.split(":")) + start_seed, stop_seed = tuple(options.seed_range.split(":")) testcase = options.testcase vdp_cmd = options.cmd show_stdout = options.show_stdout @@ -84,33 +93,42 @@ if __name__ == "__main__": prepared_inputs = prepare_inputs(original_input) if show_cmd: - print vdp_cmd,program," ".join(prepared_inputs) - exit(0) + print vdp_cmd, program, " ".join(prepared_inputs) + exit(0) if io_mode: - DEVNULL = open(os.devnull, 'wb') - in_filename = files[0].GetName() - out_filename = files[0].GetFilename() - vdp_cmd = vdp_cmd.replace("", in_filename) - vdp_cmd = vdp_cmd.replace("", out_filename) - - for seed in xrange(int(start_seed),int(stop_seed)): - cmd = vdp_cmd.replace("",str(seed)) - p = Popen(cmd.split(" "), stdin=PIPE, stdout=DEVNULL, stderr=DEVNULL, env=dict()) - p.communicate() - p = Popen(["timeout","-k","1",str(timeout),program]+prepared_inputs, stdin=PIPE, stdout=DEVNULL, stderr=DEVNULL, env=dict()) - p.communicate() - - if p.returncode < 0: - print >> sys.stderr, testcase, p.returncode, seed - exit(1) - - exit(0) + DEVNULL = open(os.devnull, 'wb') + in_filename = files[0].GetName() + out_filename = files[0].GetFilename() + vdp_cmd = vdp_cmd.replace("", in_filename) + vdp_cmd = vdp_cmd.replace("", out_filename) + + for seed in xrange(int(start_seed), int(stop_seed)): + cmd = vdp_cmd.replace("", str(seed)) + p = Popen(cmd.split(" "), stdin=PIPE, + stdout=DEVNULL, 
stderr=DEVNULL, env=dict()) + p.communicate() + p = Popen(["timeout", + "-k", + "1", + str(timeout), + program] + prepared_inputs, + stdin=PIPE, + stdout=DEVNULL, + stderr=DEVNULL, + env=dict()) + p.communicate() + + if p.returncode < 0: + print >> sys.stderr, testcase, p.returncode, seed + exit(1) + + exit(0) else: - p = Popen(vdp_cmd.split(" ")+[program]+prepared_inputs, stdin=PIPE, env=dict()) - p.communicate() + p = Popen(vdp_cmd.split(" ") + [program] + + prepared_inputs, stdin=PIPE, env=dict()) + p.communicate() exit(p.returncode) - diff --git a/vpredictor b/vpredictor index c3dd938..674fd19 100755 --- a/vpredictor +++ b/vpredictor @@ -25,25 +25,32 @@ import sys import csv csv.field_size_limit(sys.maxsize) -sys.setrecursionlimit(1024*1024*1024) +sys.setrecursionlimit(1024 * 1024 * 1024) from vdiscover.Pipeline import * -from vdiscover.Recall import Recall -from vdiscover.Train import Train +from vdiscover.Recall import Recall +from vdiscover.Train import Train if __name__ == "__main__": # Arguments - parser = argparse.ArgumentParser(description='A trainer and predictor of vulnerabilities') - parser.add_argument("infile", help="A csv with the features to train or predict", type=str, default=None) + parser = argparse.ArgumentParser( + description='A trainer and predictor of vulnerabilities') + parser.add_argument( + "infile", + help="A csv with the features to train or predict", + type=str, + default=None) parser.add_argument("--model", type=str, help="Use a pretrained model (recall only)", action="store", default=None) - parser.add_argument("--prob", - help="Output the probability of each prediction (recall only)", - action="store_true", default=False) + parser.add_argument( + "--prob", + help="Output the probability of each prediction (recall only)", + action="store_true", + default=False) parser.add_argument("--test", help="Test a model using infile (recall only)", @@ -53,7 +60,6 @@ if __name__ == "__main__": help="Test a model using infile (recall only)", action="store_true", default=False) - parser.add_argument("--static", help="Use static features", action="store_true", default=False) @@ -62,45 +68,59 @@ if __name__ == "__main__": help="Use dynamic features", action="store_true", default=False) - parser.add_argument("--valid", - help="Valid a model using infile", - action="store", default=None) + #parser.add_argument("--valid", + # help="Valid a model using infile", + # action="store", default=None) - parser.add_argument("--cluster-with-repr", - help="Cluster input traces using some representation (bow, doc2vec)", - action="store", default=None) + #parser.add_argument( + # "--cluster-with-repr", + # help="Cluster input traces using some representation (bow, doc2vec)", + # action="store", + # default=None) - parser.add_argument("--cluster-with-rdim", - help="Cluster input traces reducing dimensionality (pca, svd, none)", - action="store", default="pca") + #parser.add_argument( + # "--cluster-with-rdim", + # help="Cluster input traces reducing dimensionality (pca, svd, none)", + # action="store", + # default="pca") - #parser.add_argument("--cluster-doc2vec", + # parser.add_argument("--cluster-doc2vec", # help="Cluster input traces using doc2vec", # action="store_true", default=False) - parser.add_argument("--cluster-param", type=float, - help="Cluster parameter", - action="store", default=0.1) + #parser.add_argument("--cluster-param", type=float, + # help="Cluster parameter", + # action="store", default=0.1) - parser.add_argument("--cluster-cnn", - help="Cluster input traces using 
a convolutional model", - action="store_true", default=False) + #parser.add_argument( + # "--cluster-cnn", + # help="Cluster input traces using a convolutional model", + # action="store_true", + # default=False) - parser.add_argument("--train-rf", - help="Train a Random Forest using infile", + parser.add_argument("--train", + help="Train a model using a random forest", action="store_true", default=False) - #parser.add_argument("--train-lstm", + parser.add_argument("--vect", type=str, + help="Which technique use to vectorize traces", + action="store", default="bow") + + + # parser.add_argument("--train-lstm", # help="Train a LSTM using infile (warning: very experimental and slow)", # action="store_true", default=False) - #parser.add_argument("--train-cnn", + # parser.add_argument("--train-cnn", # help="Train a CNN using infile", # action="store_true", default=False) - parser.add_argument("--n-samples", type=int, - help="Select a number of samples from infile (train only)", - action="store", default=None) + parser.add_argument( + "--n-samples", + type=int, + help="Select a number of samples from infile (train only)", + action="store", + default=None) parser.add_argument("--out-file", help="File to output the results/model", @@ -108,21 +128,24 @@ if __name__ == "__main__": options = parser.parse_args() in_file = options.infile - valid_file = options.valid + vector_type = options.vect + + #valid_file = options.valid test_simple = options.test test_aggr = options.test_aggr - training_mode_rf = options.train_rf + #training_mode_rf = options.train #training_mode_lstm = options.train_lstm #training_mode_cnn = options.train_cnn - training_mode_cluster_repr = options.cluster_with_repr + #training_mode_cluster_repr = options.cluster_with_repr - cluster_rdim = options.cluster_with_rdim - cluster_param = options.cluster_param + #cluster_rdim = options.cluster_with_rdim + #cluster_param = options.cluster_param - training_mode = training_mode_rf or training_mode_cluster_repr #training_mode_cluster_bow or training_mode_cluster_cnn or training_mode_cluster_doc2vec + # training_mode_cluster_bow or training_mode_cluster_cnn or training_mode_cluster_doc2vec + training_mode = options.train #training_mode_rf or training_mode_cluster_repr probability_mode = options.prob nsamples = options.n_samples @@ -133,25 +156,27 @@ if __name__ == "__main__": out_file = options.out_file model_file = options.model - if (not static_only and not dynamic_only) or (static_only and dynamic_only): - print "VDiscover requires to select either static of dynamic features exclusively" - exit(-1) + if (not static_only and not dynamic_only) or ( + static_only and dynamic_only): + print "VDiscover requires to select either static of dynamic features exclusively" + exit(-1) elif static_only: - ftype = "static" + features_type = "static" elif dynamic_only: - ftype = "dynamic" + features_type = "dynamic" if training_mode: - if training_mode_rf: - Train(out_file, in_file, valid_file, "rf", ftype, nsamples) + model_type = "rf" + Train(out_file, in_file, None, model_type, vector_type, features_type, nsamples) - elif training_mode_cluster_repr: - cluster_repr = training_mode_cluster_repr - from vdiscover.Cluster import ClusterScikit + #elif training_mode_cluster_repr: + # cluster_repr = training_mode_cluster_repr + # from vdiscover.Cluster import ClusterScikit + # + # ClusterScikit(None, in_file, valid_file, ftype, nsamples, + # cluster_repr, cluster_rdim, cluster_param) - ClusterScikit(None, in_file, valid_file, ftype, nsamples, cluster_repr, 
cluster_rdim, cluster_param) - - """ + """ elif training_mode_cluster_cnn: if (model_file is None): @@ -165,14 +190,15 @@ if __name__ == "__main__": """ else: - if model_file is None: - print "VDiscover requires a pre-trained model to predict" - exit(-1) - - test_mode = None - if test_simple: - test_mode = "simple" - elif test_aggr: - test_mode = "aggregated" - - Recall(model_file, in_file, ftype, out_file, test_mode, probability=probability_mode) + if model_file is None: + print "VDiscover requires a pre-trained model to predict" + exit(-1) + + test_mode = None + if test_simple: + test_mode = "simple" + elif test_aggr: + test_mode = "aggregated" + + Recall(model_file, in_file, features_type, out_file, + test_mode, probability=probability_mode)
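
Editor's note on the refactored interface: after this patch, vpredictor collapses --train-rf and the clustering flags into a single --train switch plus a --vect selector, and everything funnels through the two calls wired up above, Train(out_file, in_file, None, model_type, vector_type, features_type, nsamples) and Recall(model_file, in_file, features_type, out_file, test_mode, probability=probability_mode). The sketch below mirrors that wiring as a quick sanity check of the new interface; it is not part of the patch, and the file names (train.csv, test.csv, model.pklz, predictions.csv) are placeholders rather than paths from the repository.

    # Minimal sketch, assuming a csv of dynamic features produced by fextractor
    # and the new defaults (--vect bow, no --n-samples).
    from vdiscover.Train import Train
    from vdiscover.Recall import Recall

    # Roughly what "./vpredictor train.csv --dynamic --train --out-file model.pklz" now does:
    # out_file, in_file, valid_file (dropped, so None), model, vectorizer, feature type, n samples.
    Train("model.pklz", "train.csv", None, "rf", "bow", "dynamic", None)

    # Roughly what "./vpredictor test.csv --dynamic --model model.pklz --prob --out-file predictions.csv" now does:
    # test_mode stays None for plain prediction; "simple"/"aggregated" correspond to --test/--test-aggr.
    Recall("model.pklz", "test.csv", "dynamic", "predictions.csv", None, probability=True)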