added clustering of static data + fixes
gaa-cifasis committed Apr 23, 2016
1 parent 766924d commit 4041929
Showing 5 changed files with 24 additions and 14 deletions.
fextractor: 2 changes (1 addition & 1 deletion)
@@ -67,7 +67,7 @@ if __name__ == "__main__":
help="Extract only dynamic features from a testcase",
action="store_true", default=False)

parser.add_argument("--mclass", type=int,
parser.add_argument("--mclass", type=str,
help="Include class column, to use later in training mode",
action="store", default=None)

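The --mclass option now accepts arbitrary string labels instead of integers, so class names such as vulnerability categories or program families flow through to the CSV output unchanged. A minimal sketch of the new behaviour (only the --mclass argument is taken from the hunk above; the surrounding parser setup and the example label are assumptions):

    import argparse

    parser = argparse.ArgumentParser(description="sketch of the fextractor option parsing")
    # After this commit the class label is stored as a plain string.
    parser.add_argument("--mclass", type=str,
                        help="Include class column, to use later in training mode",
                        action="store", default=None)

    # Hypothetical invocation: a non-numeric label is now accepted as-is.
    args = parser.parse_args(["--mclass", "stack-overflow"])
    print(args.mclass)   # -> stack-overflow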
vdiscover/Cluster.py: 10 changes (7 additions & 3 deletions)
@@ -311,9 +311,13 @@ def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples):
for prog,[x,y],cl in zip(train_programs, X_red, train_classes):
x = gauss(0,0.1) + x
y = gauss(0,0.1) + y
- plt.scatter(x, y, c=colors[cl])
- plt.text(x, y+0.02, prog.split("/")[-1])

+ try:
+   plt.scatter(x, y, c=colors[int(cl)])
+   plt.text(x, y+0.02, prog.split("/")[-1])
+ except ValueError:
+   plt.text(x, y+0.02, cl)



if valid_file is not None:
valid_programs, valid_features, valid_classes = read_traces(valid_file, None)
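ClusterScikit previously assumed the class label was an integer index into the colour list; with string labels now allowed by --mclass, that lookup raises ValueError. The new try/except keeps the old behaviour for numeric labels and falls back to writing the label itself next to the point. A reduced, self-contained sketch of that fallback (the sample points, labels and colour list are made up for illustration):

    import matplotlib.pyplot as plt
    from random import gauss

    colors = ['b', 'r', 'g', 'm', 'y', 'c', 'k']
    # Hypothetical 2D projections: one numeric label, one string label.
    samples = [("bin/ls", (0.1, 0.3), "1"), ("bin/cp", (0.5, 0.2), "httpd")]

    for prog, (x, y), cl in samples:
        # Small jitter so overlapping points stay visible, as in ClusterScikit.
        x = gauss(0, 0.1) + x
        y = gauss(0, 0.1) + y
        try:
            # Numeric labels still index the colour list.
            plt.scatter(x, y, c=colors[int(cl)])
            plt.text(x, y + 0.02, prog.split("/")[-1])
        except ValueError:
            # Non-numeric labels are written verbatim next to the point.
            plt.text(x, y + 0.02, cl)

    plt.show()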
vdiscover/Pipeline.py: 12 changes (9 additions & 3 deletions)
@@ -24,8 +24,8 @@
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
- from sklearn.decomposition import TruncatedSVD, PCA
- from sklearn.manifold import MDS
+ from sklearn.decomposition import PCA
+ #from sklearn.manifold import MDS

from random import random, randint, sample, gauss

@@ -126,7 +126,13 @@ def make_cluster_pipeline_bow(ftype):

])
elif ftype is "static":
- raise NotImplemented
+ return Pipeline(steps=[
+   ('selector', ItemSelector(key='static')),
+   ('dvectorizer', TfidfVectorizer(tokenizer=dynamic_tokenizer, use_idf=False, norm=None, ngram_range=(1,1), lowercase=False)),
+   ('todense', DenseTransformer()),
+   ('cutfoff', CutoffMax(16)),
+   ('reducer', PCA(n_components=2)),
+ ])
else:
assert(0)

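The static branch of make_cluster_pipeline_bow now mirrors the dynamic one: select the 'static' column, build a bag-of-words with TfidfVectorizer (idf and normalisation disabled, so it degenerates to raw counts), densify, cap each count at 16 and project to two dimensions with PCA. ItemSelector, DenseTransformer and CutoffMax are project-local helpers, so the sketch below approximates them with plain scikit-learn FunctionTransformers; the tokenizer and the sample documents are also assumptions:

    import numpy as np
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import FunctionTransformer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import PCA

    def tokenizer(s):
        # Stand-in for the project's dynamic_tokenizer: whitespace split.
        return s.split()

    pipeline = Pipeline(steps=[
        # use_idf=False and norm=None turn tf-idf into plain term counts.
        ('dvectorizer', TfidfVectorizer(tokenizer=tokenizer, use_idf=False,
                                        norm=None, ngram_range=(1, 1), lowercase=False)),
        # DenseTransformer equivalent: sparse matrix -> dense array.
        ('todense', FunctionTransformer(lambda m: np.asarray(m.todense()),
                                        accept_sparse=True, validate=False)),
        # CutoffMax(16) equivalent: clip every count at 16.
        ('cutoff', FunctionTransformer(lambda m: np.minimum(m, 16), validate=False)),
        ('reducer', PCA(n_components=2)),
    ])

    docs = ["mov call ret", "push pop call call ret", "mov mov mov jmp"]
    print(pipeline.fit_transform(docs))   # three rows of 2D coordinates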
vdiscover/RandomWalk.py: 10 changes (5 additions & 5 deletions)
@@ -210,12 +210,12 @@ def RandomWalkElf(program, outfile, mclass, max_subtraces, max_explored_subtrace
x = hash(r)
size = len(r.split(" "))-1

- if x not in traces and size >= min_size:
+ #if x not in traces and size >= min_size:
  #print r+" .",
-   collected_traces = collected_traces + r + " ."
-   traces.add(x)
-   if len(traces) >= max_subtraces:
-     break
+ collected_traces = collected_traces + r + " ."
+ #traces.add(x)
+ #if len(traces) >= max_subtraces:
+ # break

row = [elf.path, collected_traces]
if mclass is not None:
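RandomWalkElf used to hash each disassembled fragment, skip repeats and stop once max_subtraces distinct fragments had been collected; all of that is now commented out, so every fragment is appended to collected_traces, duplicates included. A reduced sketch of the collection loop as it behaves after this commit (the fragment list is a stand-in; in the real code the fragments come from random walks over the binary):

    # Hypothetical fragments; duplicates are no longer filtered out.
    fragments = ["mov eax,1 ret", "call printf ret", "mov eax,1 ret"]

    collected_traces = ""
    for r in fragments:
        size = len(r.split(" ")) - 1   # still computed, but no longer used as a filter
        # The old `if x not in traces and size >= min_size` guard and the
        # max_subtraces cap are disabled, so every fragment is kept.
        collected_traces = collected_traces + r + " ."

    print(collected_traces)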
vdiscover/Utils.py: 4 changes (2 additions & 2 deletions)
@@ -122,13 +122,13 @@ def read_traces(train_file, nsamples, cut=None, maxsize=50):
for i,col in enumerate(csvreader):

if len(col) < 2:
print "Ignoring line", i, ":", col.join("\t")
print "Ignoring line", i, ":", "\t".join(col)
continue

program = col[0]
features = col[1]
if len(col) > 2:
- cl = int(col[2])
+ cl = str(col[2]) #int(col[2])
else:
cl = -1

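Two small fixes in read_traces: the warning about malformed rows now joins the row correctly ("\t".join(col) instead of col.join("\t"), which would raise TypeError on a list), and the optional class column is kept as a string so labels written by the new --mclass handling round-trip intact. A reduced, Python 2 sketch of the row handling, matching the codebase's print statements (the tab delimiter and the sample rows are assumptions):

    import csv
    from StringIO import StringIO   # Python 2, like the surrounding codebase

    # Hypothetical input: one well-formed row and one malformed row.
    data = "bin/ls\tmov call ret .\tstack-overflow\nbroken-row\n"

    for i, col in enumerate(csv.reader(StringIO(data), delimiter="\t")):
        if len(col) < 2:
            print "Ignoring line", i, ":", "\t".join(col)
            continue
        program = col[0]
        features = col[1]
        # The class column stays a string when present; -1 marks "no class".
        cl = str(col[2]) if len(col) > 2 else -1
        print program, repr(cl)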
