From 8c947b7a44c9bf049d1b37f1e1cb553554a9b232 Mon Sep 17 00:00:00 2001 From: gaa-cifasis Date: Thu, 29 Sep 2016 12:34:57 +0000 Subject: [PATCH] added doc2vec support in clustering + fixes --- vd | 2 +- vdiscover/Cluster.py | 191 ++++++++++++++++++++++++++++++++++-------- vdiscover/Pipeline.py | 56 +++++++++++-- vpredictor | 40 ++++++--- 4 files changed, 233 insertions(+), 56 deletions(-) diff --git a/vd b/vd index a465371..26d7cae 100755 --- a/vd +++ b/vd @@ -94,7 +94,7 @@ if __name__ == "__main__": nfiles = len(files) #print "Processing directory ","./"++("/".join(y)), "with", nfiles, "seeds" for f in files: - all_files.append(x+"/".join(y)+"/"+f) + all_files.append(x+"/".join(y)+f) random.shuffle(all_files) nfiles = len(all_files) diff --git a/vdiscover/Cluster.py b/vdiscover/Cluster.py index eb19219..a1013b0 100644 --- a/vdiscover/Cluster.py +++ b/vdiscover/Cluster.py @@ -281,21 +281,42 @@ def TrainCnn(model_file, train_file, valid_file, ftype, nsamples): #model.save_weights(model_file) modelfile.write(pickle.dumps(preprocessor, protocol=2)) - -def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples): +""" +def ClusterDoc2Vec(model_file, train_file, valid_file, ftype, nsamples, param): train_programs, train_features, train_classes = read_traces(train_file, nsamples) train_size = len(train_programs) print "using", train_size,"examples to train." + from gensim.models.doc2vec import TaggedDocument + from gensim.models import Doc2Vec + + print "Vectorizing traces.." + sentences = [] + + for (prog,trace) in zip(train_programs,train_features): + sentences.append(TaggedDocument(trace.split(" "), [prog])) + + model = Doc2Vec(dm=2, min_count=1, window=5, size=100, sample=1e-4, negative=5, workers=8, iter=1) + model.build_vocab(sentences) + + for epoch in range(20): + #print model + model.train(sentences) + shuffle(sentences) + train_dict = dict() - train_dict[ftype] = train_features - #batch_size = 16 - #window_size = 20 + + vec_train_features = [] + for prog in train_programs: + #print prog, model.docvecs[prog] + vec_train_features.append(model.docvecs[prog]) + + train_dict[ftype] = vec_train_features print "Transforming data and fitting model.." 
- model = make_cluster_pipeline_bow(ftype) + model = make_cluster_pipeline_doc2vec(ftype) X_red = model.fit_transform(train_dict) #mpl.rcParams.update({'font.size': 10}) @@ -311,28 +332,16 @@ def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples): plt.text(x, y+0.02, prog.split("/")[-1]) except ValueError: plt.text(x, y+0.02, cl) - - - if valid_file is not None: - valid_programs, valid_features, valid_classes = read_traces(valid_file, None) - valid_dict = dict() - valid_dict[ftype] = valid_features - - X_red = model.transform(valid_dict) - for prog,[x,y],cl in zip(valid_programs, X_red, valid_classes): - x = gauss(0,0.1) + x - y = gauss(0,0.1) + y - plt.scatter(x, y, c=colors[cl+1]) - plt.text(x, y+0.02, prog.split("/")[-1]) + #plt.show() + plt.savefig(train_file.replace(".gz","")+".png") - plt.show() from sklearn.cluster import MeanShift, estimate_bandwidth bandwidth = estimate_bandwidth(X_red, quantile=0.2) print "Clustering with bandwidth:", bandwidth - af = MeanShift(bandwidth=bandwidth/5).fit(X_red) + af = MeanShift(bandwidth=bandwidth*param).fit(X_red) cluster_centers = af.cluster_centers_ labels = af.labels_ @@ -352,27 +361,137 @@ def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples): markeredgecolor='k', markersize=7) plt.title('Estimated number of clusters: %d' % n_clusters_) - plt.show() + plt.savefig(train_file.replace(".gz","")+".clusters.png") + + #plt.show() clustered_traces = zip(train_programs, labels) writer = write_csv(train_file.replace(".gz","")+".clusters") for label, cluster in clustered_traces: - writer.writerow([label, cluster]) + writer.writerow([label.split("/")[-1], cluster]) + +""" + +def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples, vectorizer, reducer, param): + + train_programs, train_features, train_classes = read_traces(train_file, nsamples) + train_size = len(train_programs) + print "using", train_size,"examples to train." + + if vectorizer == "bow": + + train_dict = dict() + train_dict[ftype] = train_features + #batch_size = 16 + #window_size = 20 + + print "Transforming data and fitting model.." + model = make_cluster_pipeline_bow(ftype, reducer) + X_red = model.fit_transform(train_dict) + + elif vectorizer == "doc2vec": + + from gensim.models.doc2vec import TaggedDocument + from gensim.models import Doc2Vec + + print "Vectorizing traces.." + sentences = [] + + for (prog,trace) in zip(train_programs,train_features): + sentences.append(TaggedDocument(trace.split(" "), [prog])) + + model = Doc2Vec(dm=2, min_count=1, window=5, size=100, sample=1e-4, negative=5, workers=8, iter=1) + model.build_vocab(sentences) -def Cluster(train_file, valid_file, ftype, nsamples): + for epoch in range(20): + #print model + model.train(sentences) + shuffle(sentences) - ClusterScikit(None, train_file, valid_file, ftype, nsamples) + train_dict = dict() - #if ttype == "cluster": - #ClusterScikit(out_file, train_file, valid_file, ftype, nsamples) + vec_train_features = [] + for prog in train_programs: + #print prog, model.docvecs[prog] + vec_train_features.append(model.docvecs[prog]) - #try: - # import keras - #except: - # print "Failed to import keras modules to perform LSTM training" - # return + train_dict[ftype] = vec_train_features - #if model_file is None: - # TrainDeepRepr(out_file, train_file, valid_file, ftype, nsamples) - #else: - # PlotDeepRepr(model_file, train_file, valid_file, ftype, nsamples, outfile) + print "Transforming data and fitting model.." 
+ model = make_cluster_pipeline_doc2vec(ftype, reducer) + X_red = model.fit_transform(train_dict) + + + #pl.rcParams.update({'font.size': 10}) + if type(X_red) == list: + X_red = np.vstack(X_red) + print X_red.shape + + if X_red.shape[1] == 2: + + plt.figure() + colors = 'brgcmykbgrcmykbgrcmykbgrcmyk' + ncolors = len(colors) + + for prog,[x,y],cl in zip(train_programs, X_red, train_classes): + x = gauss(0,0.1) + x + y = gauss(0,0.1) + y + try: + plt.scatter(x, y, c=colors[int(cl)]) + plt.text(x, y+0.02, prog.split("/")[-1]) + except ValueError: + plt.text(x, y+0.02, cl) + + + + if valid_file is not None: + valid_programs, valid_features, valid_classes = read_traces(valid_file, None) + valid_dict = dict() + valid_dict[ftype] = valid_features + + X_red = model.transform(valid_dict) + for prog,[x,y],cl in zip(valid_programs, X_red, valid_classes): + x = gauss(0,0.1) + x + y = gauss(0,0.1) + y + plt.scatter(x, y, c=colors[cl+1]) + plt.text(x, y+0.02, prog.split("/")[-1]) + + #plt.show() + plt.savefig(train_file.replace(".gz","")+".png") + + + from sklearn.cluster import MeanShift, estimate_bandwidth + + bandwidth = estimate_bandwidth(X_red, quantile=0.2) + print "Clustering with bandwidth:", bandwidth + + af = MeanShift(bandwidth=bandwidth*param).fit(X_red) + + cluster_centers = af.cluster_centers_ + labels = af.labels_ + n_clusters_ = len(cluster_centers) + + if X_red.shape[1] == 2: + + plt.close('all') + plt.figure(1) + plt.clf() + + for ([x,y],label, cluster_label) in zip(X_red,train_programs, labels): + x = gauss(0,0.1) + x + y = gauss(0,0.1) + y + plt.scatter(x, y, c = colors[cluster_label % ncolors]) + + for i,[x,y] in enumerate(cluster_centers): + plt.plot(x, y, 'o', markerfacecolor=colors[i % ncolors], + markeredgecolor='k', markersize=7) + + plt.title('Estimated number of clusters: %d' % n_clusters_) + plt.savefig(train_file.replace(".gz","")+".clusters.png") + + #plt.show() + + clustered_traces = zip(train_programs, labels) + writer = write_csv(train_file.replace(".gz","")+".clusters") + for label, cluster in clustered_traces: + writer.writerow([label.split("/")[-1], cluster]) diff --git a/vdiscover/Pipeline.py b/vdiscover/Pipeline.py index 384f8ce..cc7fc96 100644 --- a/vdiscover/Pipeline.py +++ b/vdiscover/Pipeline.py @@ -24,10 +24,10 @@ from sklearn.naive_bayes import GaussianNB, MultinomialNB from sklearn.linear_model import LogisticRegression from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer -from sklearn.decomposition import PCA +from sklearn.decomposition import PCA, TruncatedSVD #from sklearn.manifold import MDS -from random import random, randint, sample, gauss +from random import random, randint, sample, gauss, shuffle def static_tokenizer(s): return filter(lambda x: x<>'', s.split(" ")) @@ -115,16 +115,38 @@ def make_train_pipeline(ftype): else: assert(0) -def make_cluster_pipeline_bow(ftype): - if ftype is "dynamic": +def make_cluster_pipeline_bow(ftype, rdim): + if ftype is "dynamic" and rdim == "pca": + return Pipeline(steps=[ ('selector', ItemSelector(key='dynamic')), ('dvectorizer', TfidfVectorizer(tokenizer=dynamic_tokenizer, use_idf=False, norm=None, ngram_range=(1,1), lowercase=False)), ('todense', DenseTransformer()), - ('cutfoff', CutoffMax(16)), + #('cutfoff', CutoffMax(16)), ('reducer', PCA(n_components=2)), ]) + + elif ftype is "dynamic" and rdim == "svd": + + return Pipeline(steps=[ + ('selector', ItemSelector(key='dynamic')), + ('dvectorizer', TfidfVectorizer(tokenizer=dynamic_tokenizer, use_idf=False, norm=None, ngram_range=(1,1), 
lowercase=False)), + ('todense', DenseTransformer()), + #('cutfoff', CutoffMax(16)), + ('reducer', TruncatedSVD(n_components=2)), + + ]) + + elif ftype is "dynamic" and rdim == "none": + + return Pipeline(steps=[ + ('selector', ItemSelector(key='dynamic')), + ('dvectorizer', TfidfVectorizer(tokenizer=dynamic_tokenizer, use_idf=False, norm=None, ngram_range=(1,1), lowercase=False)), + ('todense', DenseTransformer()), + #('cutfoff', CutoffMax(16)), + ]) + elif ftype is "static": return Pipeline(steps=[ ('selector', ItemSelector(key='static')), @@ -136,11 +158,33 @@ def make_cluster_pipeline_bow(ftype): else: assert(0) + +def make_cluster_pipeline_doc2vec(ftype, rdim): + if ftype is "dynamic" and rdim == "pca": + return Pipeline(steps=[ + ('selector', ItemSelector(key='dynamic')), + ('reducer', PCA(n_components=2)), + ]) + elif ftype is "dynamic" and rdim == "svd": + return Pipeline(steps=[ + ('selector', ItemSelector(key='dynamic')), + ('reducer', TruncatedSVD(n_components=2)), + ]) + elif ftype is "dynamic" and rdim == "none": + return Pipeline(steps=[ + ('selector', ItemSelector(key='dynamic')) + ]) + elif ftype is "static": + raise NotImplemented + else: + assert(0) + + + def make_cluster_pipeline_subtraces(ftype): if ftype is "dynamic": return Pipeline(steps=[ ('selector', ItemSelector(key='dynamic')), - #('todense', DenseTransformer()), ('reducer', PCA(n_components=12)), ]) elif ftype is "static": diff --git a/vpredictor b/vpredictor index 3b51644..c3dd938 100755 --- a/vpredictor +++ b/vpredictor @@ -66,9 +66,21 @@ if __name__ == "__main__": help="Valid a model using infile", action="store", default=None) - parser.add_argument("--cluster-bow", - help="Cluster input traces using BOW", - action="store_true", default=False) + parser.add_argument("--cluster-with-repr", + help="Cluster input traces using some representation (bow, doc2vec)", + action="store", default=None) + + parser.add_argument("--cluster-with-rdim", + help="Cluster input traces reducing dimensionality (pca, svd, none)", + action="store", default="pca") + + #parser.add_argument("--cluster-doc2vec", + # help="Cluster input traces using doc2vec", + # action="store_true", default=False) + + parser.add_argument("--cluster-param", type=float, + help="Cluster parameter", + action="store", default=0.1) parser.add_argument("--cluster-cnn", help="Cluster input traces using a convolutional model", @@ -105,10 +117,12 @@ if __name__ == "__main__": #training_mode_lstm = options.train_lstm #training_mode_cnn = options.train_cnn - training_mode_cluster_bow = options.cluster_bow - training_mode_cluster_cnn = options.cluster_cnn + training_mode_cluster_repr = options.cluster_with_repr + + cluster_rdim = options.cluster_with_rdim + cluster_param = options.cluster_param - training_mode = training_mode_rf or training_mode_cluster_bow or training_mode_cluster_cnn + training_mode = training_mode_rf or training_mode_cluster_repr #training_mode_cluster_bow or training_mode_cluster_cnn or training_mode_cluster_doc2vec probability_mode = options.prob nsamples = options.n_samples @@ -130,25 +144,25 @@ if __name__ == "__main__": if training_mode: if training_mode_rf: Train(out_file, in_file, valid_file, "rf", ftype, nsamples) - #elif training_mode_cnn: - elif training_mode_cluster_bow: + + elif training_mode_cluster_repr: + cluster_repr = training_mode_cluster_repr from vdiscover.Cluster import ClusterScikit - #Cluster(in_file, valid_file, ftype, nsamples) - ClusterScikit(None, in_file, valid_file, ftype, nsamples) + ClusterScikit(None, in_file, 
valid_file, ftype, nsamples, cluster_repr, cluster_rdim, cluster_param) + + """ elif training_mode_cluster_cnn: - #Cluster(in_file, valid_file, ftype, nsamples) if (model_file is None): from vdiscover.Cluster import TrainCnn TrainCnn(out_file, in_file, valid_file, ftype, nsamples) - #print "Clustering using a convolutional model requires a pre-trained model" exit(0) from vdiscover.Cluster import ClusterCnn ClusterCnn(model_file, in_file, valid_file, ftype, nsamples, None) - + """ else: if model_file is None:
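
The doc2vec vectorization added to Cluster.py above can be summarized standalone as follows. This is a minimal sketch, not the patch code itself: it assumes gensim 4.x, where size/iter were renamed to vector_size/epochs, docvecs became dv, and dm only takes 0 (PV-DBOW) or 1 (PV-DM), so dm=1 stands in for the patch's dm=2. The helper name vectorize_traces and its arguments mirror the variables used in ClusterScikit but are illustrative only.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def vectorize_traces(train_programs, train_features):
    # One tagged document per program; a trace is a space-separated event
    # sequence, so splitting on spaces recovers the tokens.
    documents = [TaggedDocument(trace.split(" "), [prog])
                 for prog, trace in zip(train_programs, train_features)]

    model = Doc2Vec(dm=1, vector_size=100, window=5, min_count=1,
                    sample=1e-4, negative=5, workers=8, epochs=20)
    model.build_vocab(documents)
    # A single train() call with epochs set replaces the patch's manual
    # 20-iteration loop (the patch also reshuffles the documents each pass).
    model.train(documents, total_examples=model.corpus_count,
                epochs=model.epochs)

    # Look up the learned vector for each program tag.
    return [model.dv[prog] for prog in train_programs]

The resulting per-program vectors are what the new doc2vec cluster pipeline then reduces to two dimensions before clustering.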
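
The Pipeline.py changes let the caller pick the dimensionality reduction (pca, svd or none) for both the bag-of-words and doc2vec cluster pipelines. Below is a minimal sketch of that choice using plain scikit-learn pieces, leaving out the repository's ItemSelector and DenseTransformer plumbing and its dynamic_tokenizer; make_bow_pipeline is an illustrative name, not the repo's.

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import FunctionTransformer

def make_bow_pipeline(rdim):
    # use_idf=False with norm=None turns TfidfVectorizer into a plain
    # term-count vectorizer, as in the patch (default token pattern here,
    # the repo's dynamic_tokenizer is omitted).
    vec = ('vectorizer', TfidfVectorizer(use_idf=False, norm=None,
                                         ngram_range=(1, 1), lowercase=False))
    if rdim == "pca":
        # PCA needs a dense matrix, hence the densifying step.
        todense = ('todense', FunctionTransformer(lambda x: x.toarray()))
        return Pipeline([vec, todense, ('reducer', PCA(n_components=2))])
    elif rdim == "svd":
        # TruncatedSVD accepts the sparse term matrix directly.
        return Pipeline([vec, ('reducer', TruncatedSVD(n_components=2))])
    elif rdim == "none":
        return Pipeline([vec])
    raise ValueError("unknown reducer: %s" % rdim)

The doc2vec variant is the same minus the vectorizer step, since the traces arrive as dense 100-dimensional vectors; TruncatedSVD also accepting sparse input is why the densifying step is only needed on the PCA path. Note that the ftype is "dynamic" branches in the patch compare strings with is, which relies on interning; == is the safer test.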
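
Downstream of either pipeline, the clustering step now scales the estimated MeanShift bandwidth by the new cluster parameter (default 0.1) instead of the hard-coded division by 5, and saves the scatter and cluster plots to PNG files rather than calling plt.show(). The sketch below assumes X_red is the 2-D reduced matrix from the pipeline and train_programs the matching program paths; cluster_traces is an illustrative helper and plain csv.writer stands in for the repo's write_csv.

import csv
import matplotlib
matplotlib.use('Agg')   # figures are written to files; no display needed
import matplotlib.pyplot as plt
from sklearn.cluster import MeanShift, estimate_bandwidth

def cluster_traces(X_red, train_programs, train_file, param=0.1):
    # Scale the estimated bandwidth by the --cluster-param value, as the
    # patch does with bandwidth*param.
    bandwidth = estimate_bandwidth(X_red, quantile=0.2)
    af = MeanShift(bandwidth=bandwidth * param).fit(X_red)
    labels = af.labels_
    n_clusters = len(af.cluster_centers_)

    plt.figure()
    plt.scatter(X_red[:, 0], X_red[:, 1], c=labels)
    plt.title('Estimated number of clusters: %d' % n_clusters)
    plt.savefig(train_file.replace(".gz", "") + ".clusters.png")

    # One CSV row per program: trace basename and its cluster id.
    with open(train_file.replace(".gz", "") + ".clusters", "w") as out:
        writer = csv.writer(out)
        for prog, cluster in zip(train_programs, labels):
            writer.writerow([prog.split("/")[-1], cluster])
    return labels

On the command line, --cluster-with-repr selects bow or doc2vec, --cluster-with-rdim selects pca, svd or none, and --cluster-param sets the bandwidth multiplier; the rest of the vpredictor invocation is unchanged from the existing interface.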