added clustering of static data + fixes
gaa-cifasis committed Apr 23, 2016
1 parent 766924d commit 4041929
Showing 5 changed files with 24 additions and 14 deletions.
fextractor: 2 changes (1 addition & 1 deletion)
@@ -67,7 +67,7 @@ if __name__ == "__main__":
help="Extract only dynamic features from a testcase",
action="store_true", default=False)

parser.add_argument("--mclass", type=int,
parser.add_argument("--mclass", type=str,
help="Include class column, to use later in training mode",
action="store", default=None)

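The --mclass option now accepts arbitrary string labels instead of integers, so class names such as vulnerability categories or program families flow through to the CSV output unchanged. A minimal sketch of the new behaviour (only the --mclass argument is taken from the hunk above; the surrounding parser setup and the example label are assumptions):

    import argparse

    parser = argparse.ArgumentParser(description="sketch of the fextractor option parsing")
    # After this commit the class label is stored as a plain string.
    parser.add_argument("--mclass", type=str,
                        help="Include class column, to use later in training mode",
                        action="store", default=None)

    # Hypothetical invocation: a non-numeric label is now accepted as-is.
    args = parser.parse_args(["--mclass", "stack-overflow"])
    print(args.mclass)   # -> stack-overflow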
vdiscover/Cluster.py: 10 changes (7 additions & 3 deletions)
@@ -311,9 +311,13 @@ def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples):
for prog,[x,y],cl in zip(train_programs, X_red, train_classes):
x = gauss(0,0.1) + x
y = gauss(0,0.1) + y
- plt.scatter(x, y, c=colors[cl])
- plt.text(x, y+0.02, prog.split("/")[-1])

+ try:
+   plt.scatter(x, y, c=colors[int(cl)])
+   plt.text(x, y+0.02, prog.split("/")[-1])
+ except ValueError:
+   plt.text(x, y+0.02, cl)



if valid_file is not None:
valid_programs, valid_features, valid_classes = read_traces(valid_file, None)
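ClusterScikit previously assumed the class label was an integer index into the colour list; with string labels now allowed by --mclass, that lookup raises ValueError. The new try/except keeps the old behaviour for numeric labels and falls back to writing the label itself next to the point. A reduced, self-contained sketch of that fallback (the sample points, labels and colour list are made up for illustration):

    import matplotlib.pyplot as plt
    from random import gauss

    colors = ['b', 'r', 'g', 'm', 'y', 'c', 'k']
    # Hypothetical 2D projections: one numeric label, one string label.
    samples = [("bin/ls", (0.1, 0.3), "1"), ("bin/cp", (0.5, 0.2), "httpd")]

    for prog, (x, y), cl in samples:
        # Small jitter so overlapping points stay visible, as in ClusterScikit.
        x = gauss(0, 0.1) + x
        y = gauss(0, 0.1) + y
        try:
            # Numeric labels still index the colour list.
            plt.scatter(x, y, c=colors[int(cl)])
            plt.text(x, y + 0.02, prog.split("/")[-1])
        except ValueError:
            # Non-numeric labels are written verbatim next to the point.
            plt.text(x, y + 0.02, cl)

    plt.show()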
vdiscover/Pipeline.py: 12 changes (9 additions & 3 deletions)
@@ -24,8 +24,8 @@
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
- from sklearn.decomposition import TruncatedSVD, PCA
- from sklearn.manifold import MDS
+ from sklearn.decomposition import PCA
+ #from sklearn.manifold import MDS

from random import random, randint, sample, gauss

@@ -126,7 +126,13 @@ def make_cluster_pipeline_bow(ftype):

])
elif ftype is "static":
- raise NotImplemented
+ return Pipeline(steps=[
+   ('selector', ItemSelector(key='static')),
+   ('dvectorizer', TfidfVectorizer(tokenizer=dynamic_tokenizer, use_idf=False, norm=None, ngram_range=(1,1), lowercase=False)),
+   ('todense', DenseTransformer()),
+   ('cutfoff', CutoffMax(16)),
+   ('reducer', PCA(n_components=2)),
+ ])
else:
assert(0)

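The static branch of make_cluster_pipeline_bow now mirrors the dynamic one: select the 'static' column, build a bag-of-words with TfidfVectorizer (idf and normalisation disabled, so it degenerates to raw counts), densify, cap each count at 16 and project to two dimensions with PCA. ItemSelector, DenseTransformer and CutoffMax are project-local helpers, so the sketch below approximates them with plain scikit-learn FunctionTransformers; the tokenizer and the sample documents are also assumptions:

    import numpy as np
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import FunctionTransformer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import PCA

    def tokenizer(s):
        # Stand-in for the project's dynamic_tokenizer: whitespace split.
        return s.split()

    pipeline = Pipeline(steps=[
        # use_idf=False and norm=None turn tf-idf into plain term counts.
        ('dvectorizer', TfidfVectorizer(tokenizer=tokenizer, use_idf=False,
                                        norm=None, ngram_range=(1, 1), lowercase=False)),
        # DenseTransformer equivalent: sparse matrix -> dense array.
        ('todense', FunctionTransformer(lambda m: np.asarray(m.todense()),
                                        accept_sparse=True, validate=False)),
        # CutoffMax(16) equivalent: clip every count at 16.
        ('cutoff', FunctionTransformer(lambda m: np.minimum(m, 16), validate=False)),
        ('reducer', PCA(n_components=2)),
    ])

    docs = ["mov call ret", "push pop call call ret", "mov mov mov jmp"]
    print(pipeline.fit_transform(docs))   # three rows of 2D coordinates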
vdiscover/RandomWalk.py: 10 changes (5 additions & 5 deletions)
@@ -210,12 +210,12 @@ def RandomWalkElf(program, outfile, mclass, max_subtraces, max_explored_subtrace
x = hash(r)
size = len(r.split(" "))-1

- if x not in traces and size >= min_size:
+ #if x not in traces and size >= min_size:
  #print r+" .",
-   collected_traces = collected_traces + r + " ."
-   traces.add(x)
-   if len(traces) >= max_subtraces:
-     break
+ collected_traces = collected_traces + r + " ."
+ #traces.add(x)
+ #if len(traces) >= max_subtraces:
+ # break

row = [elf.path, collected_traces]
if mclass is not None:
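RandomWalkElf used to hash each disassembled fragment, skip repeats and stop once max_subtraces distinct fragments had been collected; all of that is now commented out, so every fragment is appended to collected_traces, duplicates included. A reduced sketch of the collection loop as it behaves after this commit (the fragment list is a stand-in; in the real code the fragments come from random walks over the binary):

    # Hypothetical fragments; duplicates are no longer filtered out.
    fragments = ["mov eax,1 ret", "call printf ret", "mov eax,1 ret"]

    collected_traces = ""
    for r in fragments:
        size = len(r.split(" ")) - 1   # still computed, but no longer used as a filter
        # The old `if x not in traces and size >= min_size` guard and the
        # max_subtraces cap are disabled, so every fragment is kept.
        collected_traces = collected_traces + r + " ."

    print(collected_traces)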
vdiscover/Utils.py: 4 changes (2 additions & 2 deletions)
@@ -122,13 +122,13 @@ def read_traces(train_file, nsamples, cut=None, maxsize=50):
for i,col in enumerate(csvreader):

if len(col) < 2:
print "Ignoring line", i, ":", col.join("\t")
print "Ignoring line", i, ":", "\t".join(col)
continue

program = col[0]
features = col[1]
if len(col) > 2:
- cl = int(col[2])
+ cl = str(col[2]) #int(col[2])
else:
cl = -1

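Two small fixes in read_traces: the warning about malformed rows now joins the row correctly ("\t".join(col) instead of col.join("\t"), which would raise TypeError on a list), and the optional class column is kept as a string so labels written by the new --mclass handling round-trip intact. A reduced, Python 2 sketch of the row handling, matching the codebase's print statements (the tab delimiter and the sample rows are assumptions):

    import csv
    from StringIO import StringIO   # Python 2, like the surrounding codebase

    # Hypothetical input: one well-formed row and one malformed row.
    data = "bin/ls\tmov call ret .\tstack-overflow\nbroken-row\n"

    for i, col in enumerate(csv.reader(StringIO(data), delimiter="\t")):
        if len(col) < 2:
            print "Ignoring line", i, ":", "\t".join(col)
            continue
        program = col[0]
        features = col[1]
        # The class column stays a string when present; -1 marks "no class".
        cl = str(col[2]) if len(col) > 2 else -1
        print program, repr(cl)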
