27
27
import matplotlib as mpl
28
28
29
29
# Hack from https://stackoverflow.com/questions/2801882/generating-a-png-with-matplotlib-when-display-is-undefined: pick a non-interactive backend so matplotlib does not require an X display
30
- mpl .use ('Agg' )
30
+ # mpl.use('Agg')
31
31
import matplotlib .pyplot as plt
32
32
33
33
from Utils import *
@@ -284,10 +284,6 @@ def TrainCnn(model_file, train_file, valid_file, ftype, nsamples):
284
284
285
285
def ClusterScikit (model_file , train_file , valid_file , ftype , nsamples ):
286
286
287
- #import matplotlib.pyplot as plt
288
- #import matplotlib as mpl
289
-
290
- #csvreader = open_csv(train_file)
291
287
train_programs , train_features , train_classes = read_traces (train_file , nsamples )
292
288
train_size = len (train_programs )
293
289
@@ -298,15 +294,14 @@ def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples):
298
294
#batch_size = 16
299
295
#window_size = 20
300
296
301
- #from sklearn.cluster import MeanShift
302
-
303
297
print "Transforming data and fitting model.."
304
298
model = make_cluster_pipeline_bow (ftype )
305
299
X_red = model .fit_transform (train_dict )
306
300
307
301
#mpl.rcParams.update({'font.size': 10})
308
302
plt .figure ()
309
303
colors = 'brgcmykbgrcmykbgrcmykbgrcmyk'
304
+ ncolors = len (colors )
310
305
311
306
for prog ,[x ,y ],cl in zip (train_programs , X_red , train_classes ):
312
307
x = gauss (0 ,0.1 ) + x
@@ -332,26 +327,37 @@ def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples):
332
327
plt .text (x , y + 0.02 , prog .split ("/" )[- 1 ])
333
328
334
329
plt .show ()
335
- #af = MeanShift().fit(X_red)
330
+ from sklearn .cluster import MeanShift , estimate_bandwidth
331
+
332
+ bandwidth = estimate_bandwidth (X_red , quantile = 0.2 )
333
+ print "Clustering with bandwidth:" , bandwidth
334
+
335
+ af = MeanShift (bandwidth = bandwidth / 5 ).fit (X_red )
336
+
337
+ cluster_centers = af .cluster_centers_
338
+ labels = af .labels_
339
+ n_clusters_ = len (cluster_centers )
336
340
337
- #cluster_centers = af.cluster_centers_
338
- #labels = af.labels_
339
- #n_clusters_ = len(cluster_centers )
341
+ plt . close ( 'all' )
342
+ plt . figure ( 1 )
343
+ plt . clf ( )
340
344
341
- #plt.close('all')
342
- #plt.figure(1)
343
- #plt.clf()
345
+ for ([x ,y ],label , cluster_label ) in zip (X_red ,train_programs , labels ):
346
+ x = gauss (0 ,0.1 ) + x
347
+ y = gauss (0 ,0.1 ) + y
348
+ plt .scatter (x , y , c = colors [cluster_label % ncolors ])
344
349
345
- #for k, col in zip(range(n_clusters_), colors):
346
- # my_members = labels == k
347
- # cluster_center = cluster_centers[k]
348
- # plt.plot(X_red[my_members, 0], X_red[my_members, 1], col + '.')
349
- # plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
350
- # markeredgecolor='k', markersize=14)
350
+ for i ,[x ,y ] in enumerate (cluster_centers ):
351
+ plt .plot (x , y , 'o' , markerfacecolor = colors [i % ncolors ],
352
+ markeredgecolor = 'k' , markersize = 7 )
351
353
354
+ plt .title ('Estimated number of clusters: %d' % n_clusters_ )
355
+ plt .show ()
352
356
353
- #plt.title('Estimated number of clusters: %d' % n_clusters_)
354
- #plt.show()
357
+ clustered_traces = zip (train_programs , labels )
358
+ writer = write_csv (train_file .replace (".gz" ,"" )+ ".clusters" )
359
+ for label , cluster in clustered_traces :
360
+ writer .writerow ([label , cluster ])
355
361
356
362
def Cluster (train_file , valid_file , ftype , nsamples ):
357
363
0 commit comments