From 4041929da6f1b15d8a5963bfd3388a9e021c3bd4 Mon Sep 17 00:00:00 2001
From: gaa-cifasis
Date: Sat, 23 Apr 2016 06:56:03 -0700
Subject: [PATCH] added clustering of static data + fixes

---
 fextractor              |  2 +-
 vdiscover/Cluster.py    | 10 +++++++---
 vdiscover/Pipeline.py   | 12 +++++++++---
 vdiscover/RandomWalk.py | 10 +++++-----
 vdiscover/Utils.py      |  4 ++--
 5 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/fextractor b/fextractor
index 42e4b2e..6a09410 100755
--- a/fextractor
+++ b/fextractor
@@ -67,7 +67,7 @@ if __name__ == "__main__":
                         help="Extract only dynamic features from a testcase",
                         action="store_true", default=False)
 
-    parser.add_argument("--mclass", type=int,
+    parser.add_argument("--mclass", type=str,
                         help="Include class column, to use later in training mode",
                         action="store", default=None)
 
diff --git a/vdiscover/Cluster.py b/vdiscover/Cluster.py
index d082789..9954fb6 100644
--- a/vdiscover/Cluster.py
+++ b/vdiscover/Cluster.py
@@ -311,9 +311,13 @@ def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples):
   for prog,[x,y],cl in zip(train_programs, X_red, train_classes):
     x = gauss(0,0.1) + x
     y = gauss(0,0.1) + y
-    plt.scatter(x, y, c=colors[cl])
-    plt.text(x, y+0.02, prog.split("/")[-1])
-
+    try:
+      plt.scatter(x, y, c=colors[int(cl)])
+      plt.text(x, y+0.02, prog.split("/")[-1])
+    except ValueError:
+      plt.text(x, y+0.02, cl)
+
+
   if valid_file is not None:
 
     valid_programs, valid_features, valid_classes = read_traces(valid_file, None)
diff --git a/vdiscover/Pipeline.py b/vdiscover/Pipeline.py
index 28b0540..384f8ce 100644
--- a/vdiscover/Pipeline.py
+++ b/vdiscover/Pipeline.py
@@ -24,8 +24,8 @@
 from sklearn.naive_bayes import GaussianNB, MultinomialNB
 from sklearn.linear_model import LogisticRegression
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
-from sklearn.decomposition import TruncatedSVD, PCA
-from sklearn.manifold import MDS
+from sklearn.decomposition import PCA
+#from sklearn.manifold import MDS
 
 from random import random, randint, sample, gauss
 
@@ -126,7 +126,13 @@ def make_cluster_pipeline_bow(ftype):
     ])
 
   elif ftype is "static":
-    raise NotImplemented
+    return Pipeline(steps=[
+      ('selector', ItemSelector(key='static')),
+      ('dvectorizer', TfidfVectorizer(tokenizer=dynamic_tokenizer, use_idf=False, norm=None, ngram_range=(1,1), lowercase=False)),
+      ('todense', DenseTransformer()),
+      ('cutfoff', CutoffMax(16)),
+      ('reducer', PCA(n_components=2)),
+    ])
   else:
     assert(0)
 
diff --git a/vdiscover/RandomWalk.py b/vdiscover/RandomWalk.py
index 3297101..7f6fc11 100644
--- a/vdiscover/RandomWalk.py
+++ b/vdiscover/RandomWalk.py
@@ -210,12 +210,12 @@ def RandomWalkElf(program, outfile, mclass, max_subtraces, max_explored_subtrace
       x = hash(r)
       size = len(r.split(" "))-1
 
-      if x not in traces and size >= min_size:
+      #if x not in traces and size >= min_size:
         #print r+" .",
-        collected_traces = collected_traces + r + " ."
-        traces.add(x)
-        if len(traces) >= max_subtraces:
-          break
+      collected_traces = collected_traces + r + " ."
+      #traces.add(x)
+      #if len(traces) >= max_subtraces:
+      #  break
     row = [elf.path, collected_traces]
 
     if mclass is not None:
diff --git a/vdiscover/Utils.py b/vdiscover/Utils.py
index 5826fcf..52ea72e 100644
--- a/vdiscover/Utils.py
+++ b/vdiscover/Utils.py
@@ -122,13 +122,13 @@ def read_traces(train_file, nsamples, cut=None, maxsize=50):
   for i,col in enumerate(csvreader):
 
     if len(col) < 2:
-      print "Ignoring line", i, ":", col.join("\t")
+      print "Ignoring line", i, ":", "\t".join(col)
      continue
 
     program = col[0]
     features = col[1]
 
     if len(col) > 2:
-      cl = int(col[2])
+      cl = str(col[2]) #int(col[2])
     else:
       cl = -1