Skip to content

Commit

Permalink
max event limitation + fixes in clustering and sampling
Browse files Browse the repository at this point in the history
  • Loading branch information
gaa-cifasis committed Jan 13, 2016
1 parent 163d57a commit 12d1339
Show file tree
Hide file tree
Showing 7 changed files with 77 additions and 31 deletions.
12 changes: 8 additions & 4 deletions tseeder
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Copyright 2015 by G.Grieco
"""

import os
import shutil
import argparse
import sys
import csv
Expand All @@ -36,7 +37,7 @@ if __name__ == "__main__":
parser.add_argument("infile", help="A csv with the features to train or predict", type=str, default=None)
parser.add_argument("outdir", help="A directory with the seeds", type=str, default=None)
parser.add_argument("-n", help="Number of seeds to select per cluster", type=int, default=1)
parser.add_argument("--random", help="Sample randomly", action="store_true", default=None)
#parser.add_argument("--random", help="Sample randomly", action="store_true", default=None)

options = parser.parse_args()
in_file = options.infile
Expand All @@ -50,7 +51,10 @@ if __name__ == "__main__":

selected = cluster_sampler(clusters, nseeds)

for seed in selected:
print "cp", seed, outdir

if not os.path.exists(outdir):
os.makedirs(outdir)

print "Copying seeds.."
for seed in selected:
print seed
shutil.copy(seed, outdir)
21 changes: 17 additions & 4 deletions vd
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ sys.setrecursionlimit(1024*1024*1024)
#from vdiscover.Detection import WriteTestcase
from vdiscover.Process import Process
from vdiscover.Printer import TypePrinter
from vdiscover.Cluster import ClusterScikit, ClusterConv
#from vdiscover.Cluster import ClusterScikit, ClusterConv
from vdiscover.Utils import update_progress
from vdiscover.Sampling import cluster_sampler

Expand Down Expand Up @@ -60,15 +60,26 @@ if __name__ == "__main__":
#vectorizer = options.vectorizer
program = cmd.split(" ")[0]
programf = program.replace("/","__")
timeout = 5
main_module = program.split("/")[-1]

timeout = 15
envs = dict()
traces_path = outfile#outdir+"/traces.raw"

if os.path.exists(traces_path):
traces = traces_path
else:

app = Process(program, envs, timeout, [], [], True)
#app = Process(program, envs, timeout, ["libpixman-1.so","libcairo.so.2","libpango"], [], True)
modules_to_trace = [main_module]
if "LD_LIBRARY_PATH" in os.environ:
libs = os.environ["LD_LIBRARY_PATH"]
for _,_,files in os.walk(libs):
for f in files:
modules_to_trace.append(f)

print "Tracing", modules_to_trace
app = Process(program, envs, timeout, modules_to_trace, [], True)
prt = TypePrinter(traces_path, program, 0)
traces = []
all_files = []
Expand All @@ -85,13 +96,15 @@ if __name__ == "__main__":

for progress,testcase in enumerate(all_files):
#print testcase
progress = round(float(progress)/nfiles, 2)
progress = round(float(progress)/nfiles,4)
update_progress(progress)
prepared_cmd = cmd.replace(program,"")
prepared_cmd = prepared_cmd.split("@@")
prepared_cmd = prepared_cmd[0].split(" ") + [testcase] + prepared_cmd[1].split(" ")
prepared_cmd = filter(lambda x: x<>'', prepared_cmd)
#print "Getting data.."
events = app.getData(prepared_cmd)
#print "Printing data.. ", len(events)
traces.append(prt.print_events(testcase,events))
#print prepared_cmd
#print traces[-1]
Expand Down
35 changes: 25 additions & 10 deletions vdiscover/Cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,12 @@
import subprocess
import pickle
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

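# HACK: select the non-interactive 'Agg' backend before pyplot is imported, so plots can be written to files without an X display (see https://stackoverflow.com/questions/2801882/generating-a-png-with-matplotlib-when-display-is-undefined)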
mpl.use('Agg')
import matplotlib.pyplot as plt

from Utils import *
from Pipeline import *

Expand Down Expand Up @@ -111,13 +114,17 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
colors = mpl.colors.cnames.keys()
progs = list(set(labels))
ncolors = len(colors)

size = len(labels)
print "Plotting.."

for prog,[x,y] in zip(labels, X_red):
#for prog,[x,y] in sample(zip(labels, X_red), min(size, 1000)):
x = gauss(0,0.05) + x
y = gauss(0,0.05) + y
color = 'r'
plt.scatter(x, y, c=color )

"""
if valid_file is not None:
valid_programs, valid_features, valid_classes = read_traces(valid_file, None, cut=None, maxsize=window_size) #None)
valid_dict = dict()
Expand All @@ -134,13 +141,16 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
y = gauss(0,0.05) + y
plt.scatter(x, y, c='b')
plt.text(x, y+0.02, prog.split("/")[-1])

plt.show()
#plt.savefig("plot.png")

"""
plt.savefig(train_file.replace(".gz","")+".png")
print "Bandwidth estimation.."
from sklearn.cluster import MeanShift, estimate_bandwidth

bandwidth = estimate_bandwidth(X_red, quantile=0.2)

X_red_sample = X_red[:min(size, 1000)]
bandwidth = estimate_bandwidth(X_red_sample, quantile=0.2)
print "Clustering with bandwidth:", bandwidth

#X_red = np.vstack((X_red,X_red_valid))
Expand All @@ -150,16 +160,17 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
print X_red.shape, len(X_red), len(labels)
#print valid_labels

af = MeanShift(bandwidth=bandwidth/5).fit(X_red)
af = MeanShift(bandwidth=bandwidth/1).fit(X_red)

cluster_centers = af.cluster_centers_
cluster_labels = af.labels_
n_clusters = len(cluster_centers)

plt.figure()
for ([x,y],label, cluster_label) in zip(X_red,labels, cluster_labels):
#x = gauss(0,0.1) + x
#y = gauss(0,0.1) + y
#for ([x,y],label, cluster_label) in sample(zip(X_red,labels, cluster_labels), min(size, 1000)):
x = gauss(0,0.1) + x
y = gauss(0,0.1) + y
plt.scatter(x, y, c = colors[cluster_label % ncolors])
#print label
#if label in valid_labels:
Expand All @@ -169,6 +180,7 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
plt.plot(x, y, 'o', markerfacecolor=colors[i % ncolors],
markeredgecolor='k', markersize=7)

"""
#for prog,[x,y] in zip(valid_labels, X_red_valid):
#x = gauss(0,0.1) + x
#y = gauss(0,0.1) + y
Expand All @@ -180,6 +192,9 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
#plt.savefig("clusters.png")
plt.show()
"""
plt.savefig(train_file.replace(".gz","")+".clusters.png")

clustered_traces = zip(labels, cluster_labels)
writer = open_csv(train_file.replace(".gz","")+".clusters")
for label, cluster in clustered_traces:
Expand Down
3 changes: 2 additions & 1 deletion vdiscover/Pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def preprocess_traces(self, X_data, y_data=None, labels=None):
cut_X_data = []
cut_label_data = []
cut_y_data = []
rep = 5
#rep = 5

X_size = len(X_data)

Expand All @@ -249,6 +249,7 @@ def preprocess_traces(self, X_data, y_data=None, labels=None):

size = len(trace)
rep = 1 + int(float(size) / float(self.max_len))
rep = min(rep, 10)

for _ in range(rep):

Expand Down
2 changes: 1 addition & 1 deletion vdiscover/Printer.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def print_events(self, label, events):
trace = ""

for x,y in events:
trace = trace+x+"="+y+" "
trace = trace + ("%s=%s " % (x,y))

row = [self.pname+":"+label,trace]

Expand Down
27 changes: 17 additions & 10 deletions vdiscover/Process.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
from Alarm import alarm_handler, TimeoutEx

class Process(Application):
def __init__(self, program, envs, timeout, included_mods = [], ignored_mods = [], no_stdout = True):
def __init__(self, program, envs, timeout, included_mods = [], ignored_mods = [], no_stdout = True, max_events = 10000):

Application.__init__(self) # no effect

Expand All @@ -63,7 +63,7 @@ def __init__(self, program, envs, timeout, included_mods = [], ignored_mods = []
self.pid = None
self.mm = None
self.timeouts = 0
self.max_timeouts = 10
self.max_events = max_events

# Parse ELF
self.elf = ELF(self.program, plt = False)
Expand All @@ -77,6 +77,7 @@ def __init__(self, program, envs, timeout, included_mods = [], ignored_mods = []
self.last_signal = {}
self.last_call = None
self.crashed = False
self.nevents = 0
self.events = []

self.binfo = dict()
Expand Down Expand Up @@ -146,7 +147,7 @@ def createEvents(self, signal):

else:
call = Call(name, module)
self.mm.update()
#self.mm.update()
#print "updated mm"
call.detect_parameters(self.process, self.mm)
breakpoint.desinstall(set_ip=True)
Expand Down Expand Up @@ -259,9 +260,11 @@ def cont(self, signum=None):
signal = self.debugger.waitSignals()
process = signal.process
events = self.createEvents(signal)
vulns = self.DetectVulnerabilities(self.events, events)

#vulns = self.DetectVulnerabilities(self.events, events)
#print "vulns detected"
self.events = self.events + events + vulns
self.events = self.events + events #+ vulns
self.nevents = self.nevents + len(events)


def readInstrSize(self, address, default_size=None):
Expand Down Expand Up @@ -328,11 +331,18 @@ def runProcess(self, cmd):

# Set a breakpoint at the ELF entry point
self.breakpoint(self.elf.GetEntrypoint())
#print hex(self.elf.GetEntrypoint())

try:
while True:

#self.cont()
if not self.debugger or self.crashed:
if self.nevents > self.max_events:

self.events.append(Timeout(timeout))
alarm(0)
return
elif not self.debugger or self.crashed:
# The debugger is gone or the process has crashed, so there is nothing left to trace: quit
alarm(0)
return
Expand Down Expand Up @@ -371,6 +381,7 @@ def runProcess(self, cmd):

def getData(self, inputs):
self.events = []
self.nevents = 0
self.debugger = PtraceDebugger()

self.runProcess([self.program]+inputs)
Expand All @@ -389,7 +400,3 @@ def getData(self, inputs):

self.process = None
return self.events


def timeouted(self):
return self.timeouts >= self.max_timeouts
8 changes: 7 additions & 1 deletion vdiscover/Sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,14 @@ def cluster_sampler(clustered_traces, n_per_cluster):
clusters[cluster] = clusters.get(cluster, []) + [label.split(":")[-1]]

selected = set()
tmp = set()

for (cluster, seeds) in clusters.items():
n_sample = min(len(seeds), n_per_cluster)
selected.update(set(random.sample(seeds, n_sample)))
tmp = set(seeds).intersection(selected)
if len(tmp) >= n_sample:
selected.update(set(random.sample(tmp, n_sample)))
else:
selected.update(set(random.sample(seeds, n_sample)))

return selected

0 comments on commit 12d1339

Please sign in to comment.