From 8c947b7a44c9bf049d1b37f1e1cb553554a9b232 Mon Sep 17 00:00:00 2001 From: gaa-cifasis Date: Thu, 29 Sep 2016 12:34:57 +0000 Subject: [PATCH] added doc2vec support in clustering + fixes --- vd | 2 +- vdiscover/Cluster.py | 191 ++++++++++++++++++++++++++++++++++-------- vdiscover/Pipeline.py | 56 +++++++++++-- vpredictor | 40 ++++++--- 4 files changed, 233 insertions(+), 56 deletions(-) diff --git a/vd b/vd index a465371..26d7cae 100755 --- a/vd +++ b/vd @@ -94,7 +94,7 @@ if __name__ == "__main__": nfiles = len(files) #print "Processing directory ","./"++("/".join(y)), "with", nfiles, "seeds" for f in files: - all_files.append(x+"/".join(y)+"/"+f) + all_files.append(x+"/".join(y)+f) random.shuffle(all_files) nfiles = len(all_files) diff --git a/vdiscover/Cluster.py b/vdiscover/Cluster.py index eb19219..a1013b0 100644 --- a/vdiscover/Cluster.py +++ b/vdiscover/Cluster.py @@ -281,21 +281,42 @@ def TrainCnn(model_file, train_file, valid_file, ftype, nsamples): #model.save_weights(model_file) modelfile.write(pickle.dumps(preprocessor, protocol=2)) - -def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples): +""" +def ClusterDoc2Vec(model_file, train_file, valid_file, ftype, nsamples, param): train_programs, train_features, train_classes = read_traces(train_file, nsamples) train_size = len(train_programs) print "using", train_size,"examples to train." + from gensim.models.doc2vec import TaggedDocument + from gensim.models import Doc2Vec + + print "Vectorizing traces.." + sentences = [] + + for (prog,trace) in zip(train_programs,train_features): + sentences.append(TaggedDocument(trace.split(" "), [prog])) + + model = Doc2Vec(dm=2, min_count=1, window=5, size=100, sample=1e-4, negative=5, workers=8, iter=1) + model.build_vocab(sentences) + + for epoch in range(20): + #print model + model.train(sentences) + shuffle(sentences) + train_dict = dict() - train_dict[ftype] = train_features - #batch_size = 16 - #window_size = 20 + + vec_train_features = [] + for prog in train_programs: + #print prog, model.docvecs[prog] + vec_train_features.append(model.docvecs[prog]) + + train_dict[ftype] = vec_train_features print "Transforming data and fitting model.." 
- model = make_cluster_pipeline_bow(ftype) + model = make_cluster_pipeline_doc2vec(ftype) X_red = model.fit_transform(train_dict) #mpl.rcParams.update({'font.size': 10}) @@ -311,28 +332,16 @@ def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples): plt.text(x, y+0.02, prog.split("/")[-1]) except ValueError: plt.text(x, y+0.02, cl) - - - if valid_file is not None: - valid_programs, valid_features, valid_classes = read_traces(valid_file, None) - valid_dict = dict() - valid_dict[ftype] = valid_features - - X_red = model.transform(valid_dict) - for prog,[x,y],cl in zip(valid_programs, X_red, valid_classes): - x = gauss(0,0.1) + x - y = gauss(0,0.1) + y - plt.scatter(x, y, c=colors[cl+1]) - plt.text(x, y+0.02, prog.split("/")[-1]) + #plt.show() + plt.savefig(train_file.replace(".gz","")+".png") - plt.show() from sklearn.cluster import MeanShift, estimate_bandwidth bandwidth = estimate_bandwidth(X_red, quantile=0.2) print "Clustering with bandwidth:", bandwidth - af = MeanShift(bandwidth=bandwidth/5).fit(X_red) + af = MeanShift(bandwidth=bandwidth*param).fit(X_red) cluster_centers = af.cluster_centers_ labels = af.labels_ @@ -352,27 +361,137 @@ def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples): markeredgecolor='k', markersize=7) plt.title('Estimated number of clusters: %d' % n_clusters_) - plt.show() + plt.savefig(train_file.replace(".gz","")+".clusters.png") + + #plt.show() clustered_traces = zip(train_programs, labels) writer = write_csv(train_file.replace(".gz","")+".clusters") for label, cluster in clustered_traces: - writer.writerow([label, cluster]) + writer.writerow([label.split("/")[-1], cluster]) + +""" + +def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples, vectorizer, reducer, param): + + train_programs, train_features, train_classes = read_traces(train_file, nsamples) + train_size = len(train_programs) + print "using", train_size,"examples to train." + + if vectorizer == "bow": + + train_dict = dict() + train_dict[ftype] = train_features + #batch_size = 16 + #window_size = 20 + + print "Transforming data and fitting model.." + model = make_cluster_pipeline_bow(ftype, reducer) + X_red = model.fit_transform(train_dict) + + elif vectorizer == "doc2vec": + + from gensim.models.doc2vec import TaggedDocument + from gensim.models import Doc2Vec + + print "Vectorizing traces.." + sentences = [] + + for (prog,trace) in zip(train_programs,train_features): + sentences.append(TaggedDocument(trace.split(" "), [prog])) + + model = Doc2Vec(dm=2, min_count=1, window=5, size=100, sample=1e-4, negative=5, workers=8, iter=1) + model.build_vocab(sentences) -def Cluster(train_file, valid_file, ftype, nsamples): + for epoch in range(20): + #print model + model.train(sentences) + shuffle(sentences) - ClusterScikit(None, train_file, valid_file, ftype, nsamples) + train_dict = dict() - #if ttype == "cluster": - #ClusterScikit(out_file, train_file, valid_file, ftype, nsamples) + vec_train_features = [] + for prog in train_programs: + #print prog, model.docvecs[prog] + vec_train_features.append(model.docvecs[prog]) - #try: - # import keras - #except: - # print "Failed to import keras modules to perform LSTM training" - # return + train_dict[ftype] = vec_train_features - #if model_file is None: - # TrainDeepRepr(out_file, train_file, valid_file, ftype, nsamples) - #else: - # PlotDeepRepr(model_file, train_file, valid_file, ftype, nsamples, outfile) + print "Transforming data and fitting model.." 
+ model = make_cluster_pipeline_doc2vec(ftype, reducer) + X_red = model.fit_transform(train_dict) + + + #pl.rcParams.update({'font.size': 10}) + if type(X_red) == list: + X_red = np.vstack(X_red) + print X_red.shape + + if X_red.shape[1] == 2: + + plt.figure() + colors = 'brgcmykbgrcmykbgrcmykbgrcmyk' + ncolors = len(colors) + + for prog,[x,y],cl in zip(train_programs, X_red, train_classes): + x = gauss(0,0.1) + x + y = gauss(0,0.1) + y + try: + plt.scatter(x, y, c=colors[int(cl)]) + plt.text(x, y+0.02, prog.split("/")[-1]) + except ValueError: + plt.text(x, y+0.02, cl) + + + + if valid_file is not None: + valid_programs, valid_features, valid_classes = read_traces(valid_file, None) + valid_dict = dict() + valid_dict[ftype] = valid_features + + X_red = model.transform(valid_dict) + for prog,[x,y],cl in zip(valid_programs, X_red, valid_classes): + x = gauss(0,0.1) + x + y = gauss(0,0.1) + y + plt.scatter(x, y, c=colors[cl+1]) + plt.text(x, y+0.02, prog.split("/")[-1]) + + #plt.show() + plt.savefig(train_file.replace(".gz","")+".png") + + + from sklearn.cluster import MeanShift, estimate_bandwidth + + bandwidth = estimate_bandwidth(X_red, quantile=0.2) + print "Clustering with bandwidth:", bandwidth + + af = MeanShift(bandwidth=bandwidth*param).fit(X_red) + + cluster_centers = af.cluster_centers_ + labels = af.labels_ + n_clusters_ = len(cluster_centers) + + if X_red.shape[1] == 2: + + plt.close('all') + plt.figure(1) + plt.clf() + + for ([x,y],label, cluster_label) in zip(X_red,train_programs, labels): + x = gauss(0,0.1) + x + y = gauss(0,0.1) + y + plt.scatter(x, y, c = colors[cluster_label % ncolors]) + + for i,[x,y] in enumerate(cluster_centers): + plt.plot(x, y, 'o', markerfacecolor=colors[i % ncolors], + markeredgecolor='k', markersize=7) + + plt.title('Estimated number of clusters: %d' % n_clusters_) + plt.savefig(train_file.replace(".gz","")+".clusters.png") + + #plt.show() + + clustered_traces = zip(train_programs, labels) + writer = write_csv(train_file.replace(".gz","")+".clusters") + for label, cluster in clustered_traces: + writer.writerow([label.split("/")[-1], cluster]) diff --git a/vdiscover/Pipeline.py b/vdiscover/Pipeline.py index 384f8ce..cc7fc96 100644 --- a/vdiscover/Pipeline.py +++ b/vdiscover/Pipeline.py @@ -24,10 +24,10 @@ from sklearn.naive_bayes import GaussianNB, MultinomialNB from sklearn.linear_model import LogisticRegression from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer -from sklearn.decomposition import PCA +from sklearn.decomposition import PCA, TruncatedSVD #from sklearn.manifold import MDS -from random import random, randint, sample, gauss +from random import random, randint, sample, gauss, shuffle def static_tokenizer(s): return filter(lambda x: x<>'', s.split(" ")) @@ -115,16 +115,38 @@ def make_train_pipeline(ftype): else: assert(0) -def make_cluster_pipeline_bow(ftype): - if ftype is "dynamic": +def make_cluster_pipeline_bow(ftype, rdim): + if ftype is "dynamic" and rdim == "pca": + return Pipeline(steps=[ ('selector', ItemSelector(key='dynamic')), ('dvectorizer', TfidfVectorizer(tokenizer=dynamic_tokenizer, use_idf=False, norm=None, ngram_range=(1,1), lowercase=False)), ('todense', DenseTransformer()), - ('cutfoff', CutoffMax(16)), + #('cutfoff', CutoffMax(16)), ('reducer', PCA(n_components=2)), ]) + + elif ftype is "dynamic" and rdim == "svd": + + return Pipeline(steps=[ + ('selector', ItemSelector(key='dynamic')), + ('dvectorizer', TfidfVectorizer(tokenizer=dynamic_tokenizer, use_idf=False, norm=None, ngram_range=(1,1), 
lowercase=False)), + ('todense', DenseTransformer()), + #('cutfoff', CutoffMax(16)), + ('reducer', TruncatedSVD(n_components=2)), + + ]) + + elif ftype is "dynamic" and rdim == "none": + + return Pipeline(steps=[ + ('selector', ItemSelector(key='dynamic')), + ('dvectorizer', TfidfVectorizer(tokenizer=dynamic_tokenizer, use_idf=False, norm=None, ngram_range=(1,1), lowercase=False)), + ('todense', DenseTransformer()), + #('cutfoff', CutoffMax(16)), + ]) + elif ftype is "static": return Pipeline(steps=[ ('selector', ItemSelector(key='static')), @@ -136,11 +158,33 @@ def make_cluster_pipeline_bow(ftype): else: assert(0) + +def make_cluster_pipeline_doc2vec(ftype, rdim): + if ftype is "dynamic" and rdim == "pca": + return Pipeline(steps=[ + ('selector', ItemSelector(key='dynamic')), + ('reducer', PCA(n_components=2)), + ]) + elif ftype is "dynamic" and rdim == "svd": + return Pipeline(steps=[ + ('selector', ItemSelector(key='dynamic')), + ('reducer', TruncatedSVD(n_components=2)), + ]) + elif ftype is "dynamic" and rdim == "none": + return Pipeline(steps=[ + ('selector', ItemSelector(key='dynamic')) + ]) + elif ftype is "static": + raise NotImplemented + else: + assert(0) + + + def make_cluster_pipeline_subtraces(ftype): if ftype is "dynamic": return Pipeline(steps=[ ('selector', ItemSelector(key='dynamic')), - #('todense', DenseTransformer()), ('reducer', PCA(n_components=12)), ]) elif ftype is "static": diff --git a/vpredictor b/vpredictor index 3b51644..c3dd938 100755 --- a/vpredictor +++ b/vpredictor @@ -66,9 +66,21 @@ if __name__ == "__main__": help="Valid a model using infile", action="store", default=None) - parser.add_argument("--cluster-bow", - help="Cluster input traces using BOW", - action="store_true", default=False) + parser.add_argument("--cluster-with-repr", + help="Cluster input traces using some representation (bow, doc2vec)", + action="store", default=None) + + parser.add_argument("--cluster-with-rdim", + help="Cluster input traces reducing dimensionality (pca, svd, none)", + action="store", default="pca") + + #parser.add_argument("--cluster-doc2vec", + # help="Cluster input traces using doc2vec", + # action="store_true", default=False) + + parser.add_argument("--cluster-param", type=float, + help="Cluster parameter", + action="store", default=0.1) parser.add_argument("--cluster-cnn", help="Cluster input traces using a convolutional model", @@ -105,10 +117,12 @@ if __name__ == "__main__": #training_mode_lstm = options.train_lstm #training_mode_cnn = options.train_cnn - training_mode_cluster_bow = options.cluster_bow - training_mode_cluster_cnn = options.cluster_cnn + training_mode_cluster_repr = options.cluster_with_repr + + cluster_rdim = options.cluster_with_rdim + cluster_param = options.cluster_param - training_mode = training_mode_rf or training_mode_cluster_bow or training_mode_cluster_cnn + training_mode = training_mode_rf or training_mode_cluster_repr #training_mode_cluster_bow or training_mode_cluster_cnn or training_mode_cluster_doc2vec probability_mode = options.prob nsamples = options.n_samples @@ -130,25 +144,25 @@ if __name__ == "__main__": if training_mode: if training_mode_rf: Train(out_file, in_file, valid_file, "rf", ftype, nsamples) - #elif training_mode_cnn: - elif training_mode_cluster_bow: + + elif training_mode_cluster_repr: + cluster_repr = training_mode_cluster_repr from vdiscover.Cluster import ClusterScikit - #Cluster(in_file, valid_file, ftype, nsamples) - ClusterScikit(None, in_file, valid_file, ftype, nsamples) + ClusterScikit(None, in_file, 
valid_file, ftype, nsamples, cluster_repr, cluster_rdim, cluster_param) + + """ elif training_mode_cluster_cnn: - #Cluster(in_file, valid_file, ftype, nsamples) if (model_file is None): from vdiscover.Cluster import TrainCnn TrainCnn(out_file, in_file, valid_file, ftype, nsamples) - #print "Clustering using a convolutional model requires a pre-trained model" exit(0) from vdiscover.Cluster import ClusterCnn ClusterCnn(model_file, in_file, valid_file, ftype, nsamples, None) - + """ else: if model_file is None:
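
The doc2vec vectorization added to Cluster.py above can be summarized standalone as follows. This is a minimal sketch, not the patch code itself: it assumes gensim 4.x, where size/iter were renamed to vector_size/epochs, docvecs became dv, and dm only takes 0 (PV-DBOW) or 1 (PV-DM), so dm=1 stands in for the patch's dm=2. The helper name vectorize_traces and its arguments mirror the variables used in ClusterScikit but are illustrative only.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def vectorize_traces(train_programs, train_features):
    # One tagged document per program; a trace is a space-separated event
    # sequence, so splitting on spaces recovers the tokens.
    documents = [TaggedDocument(trace.split(" "), [prog])
                 for prog, trace in zip(train_programs, train_features)]

    model = Doc2Vec(dm=1, vector_size=100, window=5, min_count=1,
                    sample=1e-4, negative=5, workers=8, epochs=20)
    model.build_vocab(documents)
    # A single train() call with epochs set replaces the patch's manual
    # 20-iteration loop (the patch also reshuffles the documents each pass).
    model.train(documents, total_examples=model.corpus_count,
                epochs=model.epochs)

    # Look up the learned vector for each program tag.
    return [model.dv[prog] for prog in train_programs]

The resulting per-program vectors are what the new doc2vec cluster pipeline then reduces to two dimensions before clustering.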
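
The Pipeline.py changes let the caller pick the dimensionality reduction (pca, svd or none) for both the bag-of-words and doc2vec cluster pipelines. Below is a minimal sketch of that choice using plain scikit-learn pieces, leaving out the repository's ItemSelector and DenseTransformer plumbing and its dynamic_tokenizer; make_bow_pipeline is an illustrative name, not the repo's.

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import FunctionTransformer

def make_bow_pipeline(rdim):
    # use_idf=False with norm=None turns TfidfVectorizer into a plain
    # term-count vectorizer, as in the patch (default token pattern here,
    # the repo's dynamic_tokenizer is omitted).
    vec = ('vectorizer', TfidfVectorizer(use_idf=False, norm=None,
                                         ngram_range=(1, 1), lowercase=False))
    if rdim == "pca":
        # PCA needs a dense matrix, hence the densifying step.
        todense = ('todense', FunctionTransformer(lambda x: x.toarray()))
        return Pipeline([vec, todense, ('reducer', PCA(n_components=2))])
    elif rdim == "svd":
        # TruncatedSVD accepts the sparse term matrix directly.
        return Pipeline([vec, ('reducer', TruncatedSVD(n_components=2))])
    elif rdim == "none":
        return Pipeline([vec])
    raise ValueError("unknown reducer: %s" % rdim)

The doc2vec variant is the same minus the vectorizer step, since the traces arrive as dense 100-dimensional vectors; TruncatedSVD also accepting sparse input is why the densifying step is only needed on the PCA path. Note that the ftype is "dynamic" branches in the patch compare strings with is, which relies on interning; == is the safer test.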
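
Downstream of either pipeline, the clustering step now scales the estimated MeanShift bandwidth by the new cluster parameter (default 0.1) instead of the hard-coded division by 5, and saves the scatter and cluster plots to PNG files rather than calling plt.show(). The sketch below assumes X_red is the 2-D reduced matrix from the pipeline and train_programs the matching program paths; cluster_traces is an illustrative helper and plain csv.writer stands in for the repo's write_csv.

import csv
import matplotlib
matplotlib.use('Agg')   # figures are written to files; no display needed
import matplotlib.pyplot as plt
from sklearn.cluster import MeanShift, estimate_bandwidth

def cluster_traces(X_red, train_programs, train_file, param=0.1):
    # Scale the estimated bandwidth by the --cluster-param value, as the
    # patch does with bandwidth*param.
    bandwidth = estimate_bandwidth(X_red, quantile=0.2)
    af = MeanShift(bandwidth=bandwidth * param).fit(X_red)
    labels = af.labels_
    n_clusters = len(af.cluster_centers_)

    plt.figure()
    plt.scatter(X_red[:, 0], X_red[:, 1], c=labels)
    plt.title('Estimated number of clusters: %d' % n_clusters)
    plt.savefig(train_file.replace(".gz", "") + ".clusters.png")

    # One CSV row per program: trace basename and its cluster id.
    with open(train_file.replace(".gz", "") + ".clusters", "w") as out:
        writer = csv.writer(out)
        for prog, cluster in zip(train_programs, labels):
            writer.writerow([prog.split("/")[-1], cluster])
    return labels

On the command line, --cluster-with-repr selects bow or doc2vec, --cluster-with-rdim selects pca, svd or none, and --cluster-param sets the bandwidth multiplier; the rest of the vpredictor invocation is unchanged from the existing interface.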