more cluster and sampling code

g committed Dec 17, 2015
1 parent cb03f03 commit 6f55fe1
Showing 4 changed files with 106 additions and 86 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -12,7 +12,7 @@
      url='http://vdiscover.org/',
      author='G.Grieco',
      author_email='[email protected]',
-     scripts=['fextractor', 'vpredictor', 'tcreator', 'vd'],
+     scripts=['fextractor', 'vpredictor', 'tcreator', 'tseeder', 'vd'],
      install_requires=[
        "python-ptrace",
        "scikit-learn"
80 changes: 38 additions & 42 deletions vdiscover/Cluster.py
@@ -26,14 +26,13 @@
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
-#import pylab as plb

from Utils import *
from Pipeline import *

"""
-def Cluster(X, labels)

+#def Cluster(X, labels)
"""
  assert(len(X_red) == len(labels))
  from sklearn.cluster import MeanShift, estimate_bandwidth
@@ -48,7 +47,7 @@ def Cluster(X, labels)
  n_clusters = len(cluster_centers)
  plt.figure()
  for ([x,y],label, cluster_label) in zip(X_red,labels, cluster_labels):
    x = gauss(0,0.1) + x
    y = gauss(0,0.1) + y
@@ -60,9 +59,20 @@ def Cluster(X, labels)
           markeredgecolor='k', markersize=7)
  plt.title('Estimated number of clusters: %d' % n_clusters)
-  return zip(labels, cluster_labels)
"""
+#return zip(labels, cluster_labels)
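Although the Cluster body above is now fenced off, it documents the intended flow: estimate a kernel bandwidth from the data, run MeanShift over the 2-D reduced points, and report the number of clusters found. A minimal, self-contained sketch of that flow on synthetic data (the quantile value and the random input are assumptions for illustration, not values from this file):

import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

X_red = np.random.rand(100, 2)                        # stand-in for the reduced traces
bandwidth = estimate_bandwidth(X_red, quantile=0.2)   # kernel width derived from the data
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X_red)
cluster_labels = ms.labels_                           # one cluster id per point
n_clusters = len(ms.cluster_centers_)
print "Estimated number of clusters:", n_clusters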



+batch_size = 25
+window_size = 32
+maxlen = window_size
+
+embedding_dims = 5
+nb_filters = 50
+filter_length = 3
+hidden_dims = 50
+nb_epoch = 3

def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):

@@ -79,18 +89,8 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):

  max_features = len(preprocessor.tokenizer.word_counts)

-  batch_size = 100
-  window_size = 300
-  maxlen = window_size
-
-  embedding_dims = 20
-  nb_filters = 50
-  filter_length = 3
-  hidden_dims = 250

  #csvreader = load_csv(train_file)
  print "Reading and sampling data to train.."
-  train_programs, train_features, train_classes = read_traces(train_file, nsamples, cut=1, maxsize=window_size)
+  train_programs, train_features, train_classes = read_traces(train_file, nsamples, cut=None)
  train_size = len(train_features)

  #y = train_programs
@@ -113,25 +113,25 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
  ncolors = len(colors)

  for prog,[x,y] in zip(labels, X_red):
-    x = gauss(0,0.1) + x
-    y = gauss(0,0.1) + y
-    color = 'r'
+    x = gauss(0,0.05) + x
+    y = gauss(0,0.05) + y
+    color = 'r'
    plt.scatter(x, y, c=color )

-  if valid_file is not None:
-    valid_programs, valid_features, valid_classes = read_traces(valid_file, None, cut=1, maxsize=window_size) #None)
+  if valid_file is not None:
+    valid_programs, valid_features, valid_classes = read_traces(valid_file, None, cut=None, maxsize=window_size) #None)
    valid_dict = dict()

    X_valid, _, valid_labels = preprocessor.preprocess_traces(valid_features, y_data=None, labels=valid_programs)
-    valid_dict[ftype] = new_model.predict(X_valid)
+    valid_dict[ftype] = new_model.predict(X_valid)
    X_red_valid_comp = model.transform(valid_dict)

    X_red_valid = X_red_valid_comp[:,0:2]
    X_red_valid_next = X_red_valid_comp[:,2:4]

    for prog,[x,y] in zip(valid_labels, X_red_valid):
-      x = gauss(0,0.1) + x
-      y = gauss(0,0.1) + y
+      x = gauss(0,0.05) + x
+      y = gauss(0,0.05) + y
      plt.scatter(x, y, c='b')
      plt.text(x, y+0.02, prog.split("/")[-1])

@@ -158,8 +158,8 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):

  plt.figure()
  for ([x,y],label, cluster_label) in zip(X_red,labels, cluster_labels):
-    x = gauss(0,0.1) + x
-    y = gauss(0,0.1) + y
+    #x = gauss(0,0.1) + x
+    #y = gauss(0,0.1) + y
    plt.scatter(x, y, c = colors[cluster_label % ncolors])
    #print label
    #if label in valid_labels:
@@ -181,6 +181,11 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
  #plt.savefig("clusters.png")
  plt.show()
  clustered_traces = zip(labels, cluster_labels)
+  writer = open_csv(train_file.replace(".gz","")+".clusters")
+  for label, cluster in clustered_traces:
+    writer.writerow([label, cluster])
+
+"""
  clusters = dict()
  for label, cluster in clustered_traces:
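The three added lines above are the commit's main behavioral change to ClusterCnn: cluster assignments now persist to a <train_file>.clusters CSV instead of only being plotted, presumably so a downstream tool such as the new tseeder script can read them back. A sketch of consuming that file, with the filename assumed for illustration:

import csv

# hypothetical filename, following the train_file.replace(".gz","")+".clusters" rule above
with open("train.csv.clusters") as f:
  clustered_traces = [(label, int(cluster)) for label, cluster in csv.reader(f)]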
@@ -190,7 +195,7 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
    plt.figure()
    plt.title('Cluster %d' % cluster)
    #X_clus = []
    #for prog in traces:
    #  i = labels.index(prog)
    #  X_clus.append(X_train[i])
@@ -218,8 +223,9 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
    plt.show()
    #plt.savefig('cluster-%d.png' % cluster)

-  return clustered_traces
+"""

+#return clustered_traces


def TrainCnn(model_file, train_file, valid_file, ftype, nsamples):
@@ -230,16 +236,6 @@ def TrainCnn(model_file, train_file, valid_file, ftype, nsamples):
  train_programs = []
  train_classes = []

-  batch_size = 100
-  window_size = 300
-  maxlen = window_size
-
-  embedding_dims = 20
-  nb_filters = 250
-  filter_length = 3
-  hidden_dims = 250
-  nb_epoch = 100

  train_programs, train_features, train_classes = read_traces(train_file, nsamples, cut=None)
  train_size = len(train_features)

@@ -251,11 +247,11 @@ def TrainCnn(model_file, train_file, valid_file, ftype, nsamples):
  max_features = len(tokenizer.word_counts)

  preprocessor = DeepReprPreprocessor(tokenizer, window_size, batch_size)
-  X_train,y_train = preprocessor.preprocess(train_features, 50000)
+  X_train,y_train = preprocessor.preprocess(train_features, 10000)
  nb_classes = len(preprocessor.classes)
  print preprocessor.classes

-  model = make_cluste_cnn("train", max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes)
+  model = make_cluster_cnn("train", max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes)
  model.fit(X_train, y_train, validation_split=0.1, batch_size=batch_size, nb_epoch=nb_epoch, show_accuracy=True)

  model.mypreprocessor = preprocessor
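make_cluster_cnn itself is defined elsewhere in vdiscover, so the layer stack is not visible in this diff. As a rough sketch only: a network consistent with these arguments, in the Keras 0.x style that the fit(..., show_accuracy=True) call implies, might look like the following. Every layer choice below is an assumption for illustration, not the project's actual definition.

from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D

def sketch_cluster_cnn(max_features, maxlen, embedding_dims, nb_filters,
                       filter_length, hidden_dims, nb_classes):
  # hypothetical stand-in for make_cluster_cnn("train", ...)
  model = Sequential()
  # event-token ids -> dense vectors, one row per trace position
  model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
  # slide nb_filters 1-D filters of width filter_length over the embedded trace
  model.add(Convolution1D(nb_filters, filter_length, activation="relu"))
  model.add(MaxPooling1D(pool_length=2))
  model.add(Flatten())
  model.add(Dense(hidden_dims))
  model.add(Activation("relu"))
  # one softmax output per next-event class found by the preprocessor
  model.add(Dense(nb_classes))
  model.add(Activation("softmax"))
  model.compile(loss="categorical_crossentropy", optimizer="rmsprop",
                class_mode="categorical")
  return model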
103 changes: 62 additions & 41 deletions vdiscover/Pipeline.py
@@ -236,6 +236,7 @@ def preprocess_traces(self, X_data, y_data=None, labels=None):
    cut_X_data = []
    cut_label_data = []
    cut_y_data = []
+    rep = 5

    X_size = len(X_data)

@@ -247,21 +248,27 @@ def preprocess_traces(self, X_data, y_data=None, labels=None):
      trace = raw_trace.split(" ")

      size = len(trace)
+      rep = 1 + int(float(size) / float(self.max_len))

-      start = size - (self.max_len)
-      start = randint(0, max(start,0))
-      new_trace = " ".join(trace[start:(start+size)])
-      cut_X_data.append(new_trace)
+      for _ in range(rep):

-      if labels is not None:
-        cut_label_data.append(labels[i])
-      else:
-        cut_label_data.append("+"+str(size))
+        start = size - (self.max_len)
+        start = randint(0, max(start,0))

-      if y_data is not None:
-        cut_y_data.append(y_data[i])
-      else:
-        cut_y_data.append(0)
+        new_trace = " ".join(trace[start:(start+self.max_len)])
+        #print "sizes:", size, len(trace[start:(start+self.max_len)])

+        cut_X_data.append(new_trace)

+        if labels is not None:
+          cut_label_data.append(labels[i])
+        else:
+          cut_label_data.append("+"+str(size))

+        if y_data is not None:
+          cut_y_data.append(y_data[i])
+        else:
+          cut_y_data.append(0)

    X_train = self.tokenizer.texts_to_sequences(cut_X_data)
    labels = cut_label_data
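The net effect of the new loop: instead of one random window per trace, a trace of length size now contributes roughly size/max_len windows, so long traces are no longer under-sampled relative to short ones. The policy in isolation, with toy event names rather than vdiscover data:

from random import randint

def sample_windows(trace, max_len):
  size = len(trace)
  rep = 1 + int(float(size) / float(max_len))   # windows per trace, proportional to length
  windows = []
  for _ in range(rep):
    start = randint(0, max(size - max_len, 0))  # random window start
    windows.append(trace[start:start + max_len])
  return windows

events = ["ev%d" % k for k in range(100)]
print len(sample_windows(events, 32))   # 4 windows for a 100-event trace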
@@ -290,60 +297,74 @@ def preprocess(self, X_data, cut_size=1):

      size = len(trace)

-      start = randint(0, size-2)
-      end = randint(start, size-2)
+      if size <= (self.max_len + 1):
+        start = 0
+        end = size - 2
+        new_trace = " ".join(trace[start:(end+1)])
+        last_event = trace[(end+1)].split(":")
+        cut_y_data.append(last_event[0])
+      else:
+        #print size
+        start = size - (self.max_len) - 2
+        start = randint(0, start)
+        end = start + self.max_len
+        #print len(trace[start:end])
+        #new_trace = " ".join(trace[start:end])

-      new_trace = " ".join(trace[start:(end+1)])
-      last_event = trace[end+1].split(":")
-      cut_y_data.append(last_event[0])
+        #start = randint(0, size-2)
+        #end = randint(start, size-2)

+        new_trace = " ".join(trace[start:(end+1)])
+        last_event = trace[end+1].split(":")
+        cut_y_data.append(last_event[0])


    for y in set(cut_y_data):
      stats[y] = float(cut_y_data.count(y)) / len(cut_y_data)

-    #print stats, sum(stats.values())

+    print stats, sum(stats.values())
    #assert(0)
    cut_y_data = []
    for _ in xrange(cut_size):

      i = randint(0, X_size-1)

      raw_trace = X_data[i][:-1]
      trace = raw_trace.split(" ")

      size = len(trace)

-      start = randint(0, size-4)
-      end = randint(start, size-4)#start + randint(0, self.max_len)

-      new_trace = " ".join(trace[start:(end+1)])
-      last_event = trace[end+3].split(":")
-      cl = last_event[0]
+      if size <= (self.max_len + 1):
+        start = 0
+        end = size - 2
+        new_trace = " ".join(trace[start:(end+1)])
+        last_event = trace[(end+1)].split(":")
+      else:
+        #print size
+        start = size - (self.max_len) - 2
+        start = randint(0, start)
+        end = start + self.max_len
+        #print len(trace[start:end])
+        #new_trace = " ".join(trace[start:end])

-      #print raw_trace
-      #print start,end
-      #print new_trace
-      #print cl
-      #assert(0)

-      #if len(last_event) > 1:
-      #  print cl, last_event[1]
-      if cl in stats:
-        if random() <= stats[cl]:
-          continue
+        #start = randint(0, size-2)
+        #end = randint(start, size-2)

-      cut_X_data.append(new_trace)
+        new_trace = " ".join(trace[start:(end+1)])
+        last_event = trace[end+1].split(":")

+      cl = last_event[0]

      if cl not in self.classes:
        self.classes.append(cl)
+        stats[cl] = 0.0
+      else:
+        if random() <= stats[cl]:
+          continue

+      cut_X_data.append(new_trace)
      cut_y_data.append(self.classes.index(cl))

      #if y_data is not None:
      #  y = y_data[i]
      #  cut_y_data.append(y)

    X_train = self.tokenizer.texts_to_sequences(cut_X_data)

    y_train = []
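The restructured sampling loop also changes how classes are balanced: stats holds the observed frequency of each next-event class from the first pass, and a window whose class cl has been seen before survives only with probability 1 - stats[cl]. That is frequency-proportional rejection sampling; the idea in isolation, with toy frequencies:

from random import random

def keep_sample(cl, stats):
  if cl not in stats:
    stats[cl] = 0.0              # unseen class: always keep it
    return True
  return random() > stats[cl]    # common classes are mostly rejected

stats = {"read": 0.9, "open": 0.1}
print [keep_sample("read", stats) for _ in range(5)]   # mostly False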
7 changes: 5 additions & 2 deletions vdiscover/Sampling.py
@@ -27,6 +27,9 @@ def cluster_sampler(clustered_traces, n_per_cluster):
  for label, cluster in clustered_traces:
    clusters[cluster] = clusters.get(cluster, []) + [label.split(":")[-1]]

+  print "Selecting", len(clusters), "seeds"
+  selected = set()
  for (cluster, seeds) in clusters.items():
-    print ",".join(random.sample(seeds, n_per_cluster))
+    n_sample = min(len(seeds), n_per_cluster)
+    selected.update(set(random.sample(seeds, n_sample)))

+  return selected
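With these changes cluster_sampler no longer crashes when a cluster holds fewer than n_per_cluster seeds (random.sample raises ValueError in that case), and it returns the merged selection instead of only printing it. A usage sketch, assuming the module is importable and with made-up trace labels:

from vdiscover.Sampling import cluster_sampler

clustered_traces = [("bin/cp:seed-001", 0), ("bin/cp:seed-002", 0),
                    ("bin/objdump:seed-003", 1)]
seeds = cluster_sampler(clustered_traces, n_per_cluster=2)
print seeds   # e.g. set(['seed-001', 'seed-002', 'seed-003'])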
