From 6f55fe13066bb9ddb99e63ced457a7a60dc04084 Mon Sep 17 00:00:00 2001
From: g
Date: Thu, 17 Dec 2015 13:55:21 -0300
Subject: [PATCH] more cluster and sampling code

---
 setup.py              |   2 +-
 vdiscover/Cluster.py  |  80 ++++++++++++++++----------------
 vdiscover/Pipeline.py | 103 +++++++++++++++++++++++++-----------------
 vdiscover/Sampling.py |   7 ++-
 4 files changed, 106 insertions(+), 86 deletions(-)

diff --git a/setup.py b/setup.py
index 1a24318..5690720 100755
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@
     url='http://vdiscover.org/',
     author='G.Grieco',
     author_email='gg@cifasis-conicet.gov.ar',
-    scripts=['fextractor', 'vpredictor', 'tcreator', 'vd'],
+    scripts=['fextractor', 'vpredictor', 'tcreator', 'tseeder', 'vd'],
     install_requires=[
       "python-ptrace",
       "scikit-learn"
diff --git a/vdiscover/Cluster.py b/vdiscover/Cluster.py
index 3739bfb..ec91943 100644
--- a/vdiscover/Cluster.py
+++ b/vdiscover/Cluster.py
@@ -26,14 +26,13 @@
 import numpy as np
 import matplotlib.pyplot as plt
 import matplotlib as mpl
 
-#import pylab as plb
 
 from Utils import *
 from Pipeline import *
 
-"""
-def Cluster(X, labels)
+#def Cluster(X, labels)
+"""
   assert(len(X_red) == len(labels))
 
   from sklearn.cluster import MeanShift, estimate_bandwidth
@@ -48,7 +47,7 @@ def Cluster(X, labels)
   n_clusters = len(cluster_centers)
 
   plt.figure()
-  
+
   for ([x,y],label, cluster_label) in zip(X_red,labels, cluster_labels):
     x = gauss(0,0.1) + x
     y = gauss(0,0.1) + y
@@ -60,9 +59,20 @@ def Cluster(X, labels)
             markeredgecolor='k', markersize=7)
 
   plt.title('Estimated number of clusters: %d' % n_clusters)
-
-  return zip(labels, cluster_labels)
 """
+#return zip(labels, cluster_labels)
+
+
+
+batch_size = 25
+window_size = 32
+maxlen = window_size
+
+embedding_dims = 5
+nb_filters = 50
+filter_length = 3
+hidden_dims = 50
+nb_epoch = 3
 
 def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
 
@@ -79,18 +89,8 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
 
   max_features = len(preprocessor.tokenizer.word_counts)
 
-  batch_size = 100
-  window_size = 300
-  maxlen = window_size
-
-  embedding_dims = 20
-  nb_filters = 50
-  filter_length = 3
-  hidden_dims = 250
-
-  #csvreader = load_csv(train_file)
   print "Reading and sampling data to train.."
-  train_programs, train_features, train_classes = read_traces(train_file, nsamples, cut=1, maxsize=window_size)
+  train_programs, train_features, train_classes = read_traces(train_file, nsamples, cut=None)
   train_size = len(train_features)
 
   #y = train_programs
@@ -113,25 +113,25 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
   ncolors = len(colors)
 
   for prog,[x,y] in zip(labels, X_red):
-    x = gauss(0,0.1) + x
-    y = gauss(0,0.1) + y
-    color = 'r'
+    x = gauss(0,0.05) + x
+    y = gauss(0,0.05) + y
+    color = 'r'
     plt.scatter(x, y, c=color )
 
-  if valid_file is not None:
-    valid_programs, valid_features, valid_classes = read_traces(valid_file, None, cut=1, maxsize=window_size) #None)
+  if valid_file is not None:
+    valid_programs, valid_features, valid_classes = read_traces(valid_file, None, cut=None, maxsize=window_size) #None)
     valid_dict = dict()
 
     X_valid, _, valid_labels = preprocessor.preprocess_traces(valid_features, y_data=None, labels=valid_programs)
-    valid_dict[ftype] = new_model.predict(X_valid)
+    valid_dict[ftype] = new_model.predict(X_valid)
 
     X_red_valid_comp = model.transform(valid_dict)
 
     X_red_valid = X_red_valid_comp[:,0:2]
     X_red_valid_next = X_red_valid_comp[:,2:4]
 
     for prog,[x,y] in zip(valid_labels, X_red_valid):
-      x = gauss(0,0.1) + x
-      y = gauss(0,0.1) + y
+      x = gauss(0,0.05) + x
+      y = gauss(0,0.05) + y
       plt.scatter(x, y, c='b')
       plt.text(x, y+0.02, prog.split("/")[-1])
@@ -158,8 +158,8 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
   plt.figure()
 
   for ([x,y],label, cluster_label) in zip(X_red,labels, cluster_labels):
-    x = gauss(0,0.1) + x
-    y = gauss(0,0.1) + y
+    #x = gauss(0,0.1) + x
+    #y = gauss(0,0.1) + y
     plt.scatter(x, y, c = colors[cluster_label % ncolors])
     #print label
     #if label in valid_labels:
@@ -181,6 +181,11 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
   #plt.savefig("clusters.png")
   plt.show()
   clustered_traces = zip(labels, cluster_labels)
+  writer = open_csv(train_file.replace(".gz","")+".clusters")
+  for label, cluster in clustered_traces:
+    writer.writerow([label, cluster])
+
+
   """
   clusters = dict()
   for label, cluster in clustered_traces:
@@ -190,7 +195,7 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
     plt.figure()
     plt.title('Cluster %d' % cluster)
     #X_clus = []
-    
+
     #for prog in traces:
     #  i = labels.index(prog)
     #  X_clus.append(X_train[i])
@@ -218,8 +223,9 @@ def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):
     plt.show()
     #plt.savefig('cluster-%d.png' % cluster)
 
-
-  return clustered_traces
+  """
+
+  #return clustered_traces
 
 def TrainCnn(model_file, train_file, valid_file, ftype, nsamples):
 
@@ -230,16 +236,6 @@ def TrainCnn(model_file, train_file, valid_file, ftype, nsamples):
   train_programs = []
   train_classes = []
 
-  batch_size = 100
-  window_size = 300
-  maxlen = window_size
-
-  embedding_dims = 20
-  nb_filters = 250
-  filter_length = 3
-  hidden_dims = 250
-  nb_epoch = 100
-
   train_programs, train_features, train_classes = read_traces(train_file, nsamples, cut=None)
   train_size = len(train_features)
 
@@ -251,11 +247,11 @@ def TrainCnn(model_file, train_file, valid_file, ftype, nsamples):
   max_features = len(tokenizer.word_counts)
 
   preprocessor = DeepReprPreprocessor(tokenizer, window_size, batch_size)
-  X_train,y_train = preprocessor.preprocess(train_features, 50000)
+  X_train,y_train = preprocessor.preprocess(train_features, 10000)
   nb_classes = len(preprocessor.classes)
   print preprocessor.classes
 
-  model = make_cluste_cnn("train", max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes)
+  model = make_cluster_cnn("train", max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, nb_classes)
   model.fit(X_train, y_train, validation_split=0.1, batch_size=batch_size, nb_epoch=nb_epoch, show_accuracy=True)
 
   model.mypreprocessor = preprocessor
diff --git a/vdiscover/Pipeline.py b/vdiscover/Pipeline.py
index 516ed0e..3730609 100644
--- a/vdiscover/Pipeline.py
+++ b/vdiscover/Pipeline.py
@@ -236,6 +236,7 @@ def preprocess_traces(self, X_data, y_data=None, labels=None):
     cut_X_data = []
     cut_label_data = []
     cut_y_data = []
+    rep = 5
 
     X_size = len(X_data)
 
@@ -247,21 +248,27 @@ def preprocess_traces(self, X_data, y_data=None, labels=None):
       trace = raw_trace.split(" ")
       size = len(trace)
+      rep = 1 + int(float(size) / float(self.max_len))
 
-      start = size - (self.max_len)
-      start = randint(0, max(start,0))
-      new_trace = " ".join(trace[start:(start+size)])
-      cut_X_data.append(new_trace)
+      for _ in range(rep):
 
-      if labels is not None:
-        cut_label_data.append(labels[i])
-      else:
-        cut_label_data.append("+"+str(size))
+        start = size - (self.max_len)
+        start = randint(0, max(start,0))
 
-      if y_data is not None:
-        cut_y_data.append(y_data[i])
-      else:
-        cut_y_data.append(0)
+        new_trace = " ".join(trace[start:(start+self.max_len)])
+        #print "sizes:", size, len(trace[start:(start+self.max_len)])
+
+        cut_X_data.append(new_trace)
+
+        if labels is not None:
+          cut_label_data.append(labels[i])
+        else:
+          cut_label_data.append("+"+str(size))
+
+        if y_data is not None:
+          cut_y_data.append(y_data[i])
+        else:
+          cut_y_data.append(0)
 
     X_train = self.tokenizer.texts_to_sequences(cut_X_data)
     labels = cut_label_data
@@ -290,19 +297,33 @@ def preprocess(self, X_data, cut_size=1):
 
       size = len(trace)
 
-      start = randint(0, size-2)
-      end = randint(start, size-2)
+      if size <= (self.max_len + 1):
+        start = 0
+        end = size - 2
+        new_trace = " ".join(trace[start:(end+1)])
+        last_event = trace[(end+1)].split(":")
+        cut_y_data.append(last_event[0])
+      else:
+        #print size
+        start = size - (self.max_len) - 2
+        start = randint(0, start)
+        end = start + self.max_len
+        #print len(trace[start:end])
+        #new_trace = " ".join(trace[start:end])
 
-      new_trace = " ".join(trace[start:(end+1)])
-      last_event = trace[end+1].split(":")
-      cut_y_data.append(last_event[0])
+        #start = randint(0, size-2)
+        #end = randint(start, size-2)
+
+        new_trace = " ".join(trace[start:(end+1)])
+        last_event = trace[end+1].split(":")
+        cut_y_data.append(last_event[0])
 
     for y in set(cut_y_data):
       stats[y] = float(cut_y_data.count(y)) / len(cut_y_data)
 
-    #print stats, sum(stats.values())
-
+    print stats, sum(stats.values())
+    #assert(0)
     cut_y_data = []
 
     for _ in xrange(cut_size):
@@ -310,40 +331,40 @@ def preprocess(self, X_data, cut_size=1):
 
       raw_trace = X_data[i][:-1]
       trace = raw_trace.split(" ")
-      size = len(trace)
-      start = randint(0, size-4)
-      end = randint(start, size-4)#start + randint(0, self.max_len)
-      new_trace = " ".join(trace[start:(end+1)])
-      last_event = trace[end+3].split(":")
-      cl = last_event[0]
+      size = len(trace)
+      if size <= (self.max_len + 1):
+        start = 0
+        end = size - 2
+        new_trace = " ".join(trace[start:(end+1)])
+        last_event = trace[(end+1)].split(":")
+      else:
+        #print size
+        start = size - (self.max_len) - 2
+        start = randint(0, start)
+        end = start + self.max_len
+        #print len(trace[start:end])
+        #new_trace = " ".join(trace[start:end])
 
-      #print raw_trace
-      #print start,end
-      #print new_trace
-      #print cl
-      #assert(0)
-
-      #if len(last_event) > 1:
-      #  print cl, last_event[1]
-      if cl in stats:
-        if random() <= stats[cl]:
-          continue
+        #start = randint(0, size-2)
+        #end = randint(start, size-2)
+        new_trace = " ".join(trace[start:(end+1)])
+        last_event = trace[end+1].split(":")
 
-      cut_X_data.append(new_trace)
+      cl = last_event[0]
 
       if cl not in self.classes:
         self.classes.append(cl)
+        stats[cl] = 0.0
+      else:
+        if random() <= stats[cl]:
+          continue
 
+      cut_X_data.append(new_trace)
       cut_y_data.append(self.classes.index(cl))
 
-      #if y_data is not None:
-      #  y = y_data[i]
-      #  cut_y_data.append(y)
-
     X_train = self.tokenizer.texts_to_sequences(cut_X_data)
 
     y_train = []
diff --git a/vdiscover/Sampling.py b/vdiscover/Sampling.py
index 06e5f58..6d26ac2 100644
--- a/vdiscover/Sampling.py
+++ b/vdiscover/Sampling.py
@@ -27,6 +27,9 @@ def cluster_sampler(clustered_traces, n_per_cluster):
   for label, cluster in clustered_traces:
     clusters[cluster] = clusters.get(cluster, []) + [label.split(":")[-1]]
 
-  print "Selecting", len(clusters), "seeds"
+  selected = set()
   for (cluster, seeds) in clusters.items():
-    print ",".join(random.sample(seeds, n_per_cluster))
+    n_sample = min(len(seeds), n_per_cluster)
+    selected.update(set(random.sample(seeds, n_sample)))
+
+  return selected
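
The seed-selection flow this patch introduces can be exercised end to end: ClusterCnn now writes one "label, cluster" row per trace into a ".clusters" CSV, and cluster_sampler picks at most n_per_cluster seed files from each cluster, so every behavioral cluster stays represented in the selection. Below is a minimal sketch of that flow, not part of the patch: the input name "train.csv.clusters" is an assumption (the name ClusterCnn would derive from an input called "train.csv.gz"), labels are assumed to follow the "program:seedfile" convention implied by label.split(":")[-1], and the helper read_clustered_traces is hypothetical, mirroring the writer side added above.

import csv
import random

def read_clustered_traces(path):
  # Hypothetical reader: each row is [label, cluster], as written by
  # the open_csv writer added to ClusterCnn in this patch.
  clustered = []
  with open(path, "rb") as f:
    for label, cluster in csv.reader(f):
      clustered.append((label, int(cluster)))
  return clustered

def cluster_sampler(clustered_traces, n_per_cluster):
  # Same logic as the patched vdiscover/Sampling.py: group seed file
  # names by cluster, then sample at most n_per_cluster from each
  # cluster, returning the union as a set.
  clusters = dict()
  for label, cluster in clustered_traces:
    clusters[cluster] = clusters.get(cluster, []) + [label.split(":")[-1]]
  selected = set()
  for cluster, seeds in clusters.items():
    n_sample = min(len(seeds), n_per_cluster)
    selected.update(set(random.sample(seeds, n_sample)))
  return selected

if __name__ == "__main__":
  # Hypothetical usage, e.g. from the new tseeder script added to setup.py:
  traces = read_clustered_traces("train.csv.clusters")
  print cluster_sampler(traces, 3)

Returning a set instead of printing (the behavior this patch replaces) lets callers such as tseeder decide how to consume the selected seeds, and the min(len(seeds), n_per_cluster) guard avoids the ValueError that random.sample raises when a cluster holds fewer seeds than requested.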