Skip to content

Commit f43b930

Browse files
committed
Added notebooks for python clustering & assessment of clustering solutions
Added notebooks used for python clustering, comparison of clustering solutions, and forward/reverse inference for manual decoding
1 parent d814c87 commit f43b930

13 files changed

+1305
-0
lines changed
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

evaluating_clustering_solutions.ipynb

+266
Large diffs are not rendered by default.

fwd_rev_decoding_annotations.ipynb

+695
Large diffs are not rendered by default.

re-clustering.ipynb

+344
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,344 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 42,
6+
"metadata": {
7+
"collapsed": true
8+
},
9+
"outputs": [],
10+
"source": [
11+
"import time\n",
12+
"from os.path import join, basename\n",
13+
"import re\n",
14+
"import matplotlib.pyplot as plt\n",
15+
"import pylab\n",
16+
"import pyale\n",
17+
"import nibabel as nib\n",
18+
"import numpy as np\n",
19+
"import pandas as pd\n",
20+
"from sklearn.cluster import k_means, dbscan, SpectralClustering, FeatureAgglomeration\n",
21+
"from sklearn.metrics import silhouette_score, normalized_mutual_info_score\n",
22+
"from scipy.cluster import hierarchy\n",
23+
"import scipy.io as sio\n",
24+
"import seaborn as sns\n",
25+
"%matplotlib inline"
26+
]
27+
},
28+
{
29+
"cell_type": "code",
30+
"execution_count": 7,
31+
"metadata": {
32+
"collapsed": true
33+
},
34+
"outputs": [],
35+
"source": [
36+
"from math import log\n",
37+
"\n",
38+
"def variation_of_information(X, Y):\n",
39+
" #from https://gist.github.com/jwcarr/626cbc80e0006b526688\n",
40+
" n = float(sum([len(x) for x in X]))\n",
41+
" sigma = 0.0\n",
42+
" for x in X:\n",
43+
" p = len(x) / n\n",
44+
" for y in Y:\n",
45+
" q = len(y) / n\n",
46+
" r = len(set(x) & set(y)) / n\n",
47+
" if r > 0.0:\n",
48+
" sigma += r * (log(r / p, 2) + log(r / q, 2))\n",
49+
" return abs(sigma)"
50+
]
51+
},
52+
{
53+
"cell_type": "code",
54+
"execution_count": 15,
55+
"metadata": {
56+
"collapsed": true
57+
},
58+
"outputs": [],
59+
"source": [
60+
"out_dir = '/Users/Katie/Dropbox/Data/Naturalistic/NewSolution/python_output'"
61+
]
62+
},
63+
{
64+
"cell_type": "code",
65+
"execution_count": 2,
66+
"metadata": {},
67+
"outputs": [],
68+
"source": [
69+
"matlab_nat = sio.loadmat('/Users/Katie/Dropbox/Data/Naturalistic/NewSolution/Naturalistic_11.25.17.mat')"
70+
]
71+
},
72+
{
73+
"cell_type": "code",
74+
"execution_count": 10,
75+
"metadata": {},
76+
"outputs": [],
77+
"source": [
78+
"corrmat = matlab_nat['corrmat']\n",
79+
"corrmat2 = np.corrcoef(corrmat)\n",
80+
"minus_r = 1 - corrmat2"
81+
]
82+
},
83+
{
84+
"cell_type": "code",
85+
"execution_count": 11,
86+
"metadata": {},
87+
"outputs": [
88+
{
89+
"name": "stdout",
90+
"output_type": "stream",
91+
"text": [
92+
"Kernel k-means clustering with k = 2: 1.20s\n",
93+
"Ward's clustering with k = 2: 0.01s\n",
94+
"K-means clustering with k = 2: 5.59s\n",
95+
"Kernel k-means clustering with k = 3: 1.96s\n",
96+
"Ward's clustering with k = 3: 0.01s\n",
97+
"K-means clustering with k = 3: 7.44s\n",
98+
"Kernel k-means clustering with k = 4: 2.74s\n",
99+
"Ward's clustering with k = 4: 0.01s\n",
100+
"K-means clustering with k = 4: 7.83s\n",
101+
"Kernel k-means clustering with k = 5: 2.71s\n",
102+
"Ward's clustering with k = 5: 0.01s\n",
103+
"K-means clustering with k = 5: 8.55s\n",
104+
"Kernel k-means clustering with k = 6: 2.58s\n",
105+
"Ward's clustering with k = 6: 0.01s\n",
106+
"K-means clustering with k = 6: 8.52s\n",
107+
"Kernel k-means clustering with k = 7: 2.93s\n",
108+
"Ward's clustering with k = 7: 0.01s\n",
109+
"K-means clustering with k = 7: 9.48s\n",
110+
"Kernel k-means clustering with k = 8: 3.19s\n",
111+
"Ward's clustering with k = 8: 0.01s\n",
112+
"K-means clustering with k = 8: 9.67s\n",
113+
"Kernel k-means clustering with k = 9: 3.56s\n",
114+
"Ward's clustering with k = 9: 0.01s\n",
115+
"K-means clustering with k = 9: 10.26s\n",
116+
"Kernel k-means clustering with k = 10: 3.58s\n",
117+
"Ward's clustering with k = 10: 0.01s\n",
118+
"K-means clustering with k = 10: 10.78s\n"
119+
]
120+
}
121+
],
122+
"source": [
123+
"k = np.arange(2, 11)\n",
124+
"k_labels = {}\n",
125+
"kk_labels = {}\n",
126+
"w_labels = {}\n",
127+
"\n",
128+
"for i in k:\n",
129+
" #Kernel Kmeans nonlinear\n",
130+
" start = time.time()\n",
131+
" kernelk = SpectralClustering(n_clusters=i, eigen_solver=None, n_init=1000, gamma=1.0, affinity='rbf', assign_labels='kmeans', degree=3, coef0=1, n_jobs=1)\n",
132+
" labels = kernelk.fit_predict(minus_r)\n",
133+
" print \"Kernel k-means clustering with k = {0}: %.2fs\".format(i) % (time.time() - start) \n",
134+
" #np.savetxt('/Users/Katie/Dropbox/Data/Naturalistic/NewSolution/kernelk_labels_{0}.txt'.format(i), labels, delimiter='\\t')\n",
135+
" kk_labels[i] = labels\n",
136+
"\n",
137+
" #Ward's\n",
138+
" #calculate pdist (1 - r) \n",
139+
" start = time.time()\n",
140+
" ward = FeatureAgglomeration(n_clusters=i, linkage='ward', memory='nilearn_cache')\n",
141+
" ward.fit(minus_r)\n",
142+
" #np.savetxt('/Users/Katie/Dropbox/Data/Naturalistic/NewSolution/ward_labels_{0}.txt'.format(i), ward.labels_, delimiter='\\t')\n",
143+
" w_labels[i] = ward.labels_\n",
144+
" print \"Ward's clustering with k = {0}: %.2fs\".format(i) % (time.time() - start)\n",
145+
"\n",
146+
"\n",
147+
" #KMeans clustering\n",
148+
" start = time.time()\n",
149+
" [centroid, label, inertia] = k_means(minus_r, i, init='k-means++', precompute_distances='auto',\n",
150+
" n_init=1000, max_iter=1023, verbose=False, tol=0.0001,\n",
151+
" random_state=None, copy_x=True, n_jobs=1, algorithm='auto',\n",
152+
" return_n_iter=False)\n",
153+
" k_labels[i] = label\n",
154+
" #np.savetxt('/Users/Katie/Dropbox/Data/Naturalistic/NewSolution/kmeans_labels_{0}.txt'.format(i), label, delimiter='\\t')\n",
155+
" print \"K-means clustering with k = {0}: %.2fs\".format(i) % (time.time() - start)\n",
156+
" "
157+
]
158+
},
159+
{
160+
"cell_type": "code",
161+
"execution_count": 5,
162+
"metadata": {},
163+
"outputs": [],
164+
"source": [
165+
"from sklearn import cluster\n",
166+
"\n",
167+
"eps = np.arange(0.01, 2, 0.01)\n",
168+
"db_labels = {}\n",
169+
"\n",
170+
"for i in eps:\n",
171+
" [core, labels] = cluster.dbscan(minus_r, eps=i, metric='precomputed')\n",
172+
" db_labels['eps = {0}'.format(i)] = labels\n",
173+
"\n",
174+
"dbscan_out = pd.Series(db_labels)\n",
175+
"dbscan_out.to_csv('{0}/dbscan.txt'.format(out_dir), sep='\\t')"
176+
]
177+
},
178+
{
179+
"cell_type": "code",
180+
"execution_count": 43,
181+
"metadata": {},
182+
"outputs": [],
183+
"source": [
184+
"k_sils = []\n",
185+
"kk_sils = []\n",
186+
"w_sils = []\n",
187+
"matlab_sils = []\n",
188+
"mk_kk_mutual_info = []\n",
189+
"mk_w_mutual_info = []\n",
190+
"mk_pk_mutual_info = []\n",
191+
"\n",
192+
"\n",
193+
"k = np.arange(2, 11)\n",
194+
"\n",
195+
"for i in k:\n",
196+
" j = i-2\n",
197+
" \n",
198+
" matlab_soln = matlab_nat['IDX'][0, j].flatten()\n",
199+
" matlab_silhouette = silhouette_score(minus_r, matlab_soln, metric='precomputed', random_state=None)\n",
200+
" matlab_sils.append(matlab_silhouette)\n",
201+
" \n",
202+
" \n",
203+
" k_silhouette = silhouette_score(minus_r, k_labels[i], metric='precomputed', random_state=None)\n",
204+
" k_sils.append(k_silhouette)\n",
205+
"\n",
206+
" kk_silhouette = silhouette_score(minus_r, kk_labels[i], metric='precomputed', random_state=None)\n",
207+
" kk_sils.append(kk_silhouette)\n",
208+
"\n",
209+
" w_silhouette = silhouette_score(minus_r, w_labels[i], metric='precomputed', random_state=None)\n",
210+
" w_sils.append(w_silhouette)\n",
211+
" \n",
212+
" k_labels[i].reshape(-1, 1)\n",
213+
" kk_labels[i].reshape(-1, 1)\n",
214+
" w_labels[i].reshape(-1, 1)\n",
215+
"\n",
216+
" mk_kk_nmi = normalized_mutual_info_score(matlab_soln, kk_labels[i])\n",
217+
" mk_kk_mutual_info.append(mk_kk_nmi)\n",
218+
" \n",
219+
" mk_pk_nmi = normalized_mutual_info_score(k_labels[i], matlab_soln)\n",
220+
" mk_pk_mutual_info.append(mk_pk_nmi)\n",
221+
"\n",
222+
" mk_w_nmi = normalized_mutual_info_score(matlab_soln, w_labels[i])\n",
223+
" mk_w_mutual_info.append(mk_w_nmi)\n",
224+
"\n",
225+
"np.savetxt('{0}/wards_silhouettes.txt'.format(out_dir), w_sils, delimiter='\\t')\n",
226+
"np.savetxt('{0}/kernel_kmean_silhouettes.txt'.format(out_dir), kk_sils, delimiter='\\t')\n",
227+
"np.savetxt('{0}/python_kmean_silhouettes.txt'.format(out_dir), k_sils, delimiter='\\t')\n",
228+
"np.savetxt('{0}/matlab_kmean_silhouettes.txt'.format(out_dir), matlab_sils, delimiter='\\t')\n",
229+
"\n",
230+
"np.savetxt('{0}/mk_kk_nmi.txt'.format(out_dir), mk_kk_mutual_info, delimiter='\\t')\n",
231+
"np.savetxt('{0}/mk_w_nmi.txt'.format(out_dir), mk_w_mutual_info, delimiter='\\t')\n",
232+
"np.savetxt('{0}/mk_pk_nmi.txt'.format(out_dir), mk_pk_mutual_info, delimiter='\\t')"
233+
]
234+
},
235+
{
236+
"cell_type": "code",
237+
"execution_count": 8,
238+
"metadata": {
239+
"collapsed": true
240+
},
241+
"outputs": [],
242+
"source": [
243+
"k_clusters_idx = {}\n",
244+
"kk_clusters_idx = {}\n",
245+
"w_clusters_idx = {}\n",
246+
"mk_clusters_idx = \n",
247+
"\n",
248+
"for i in k:\n",
249+
" k_clusters = []\n",
250+
" kk_clusters = []\n",
251+
" w_clusters = []\n",
252+
" for j in range(i):\n",
253+
" k_clusters.append(list(np.where(k_labels[i-2] == j)[0]))\n",
254+
" kk_clusters.append(list(np.where(kk_labels[i-2] == j)[0]))\n",
255+
" w_clusters.append(list(np.where(w_labels[i-2] == j)[0]))\n",
256+
" k_clusters_idx['Solution {0}'.format(i)] = k_clusters\n",
257+
" kk_clusters_idx['Solution {0}'.format(i)] = kk_clusters\n",
258+
" w_clusters_idx['Solution {0}'.format(i)] = w_clusters"
259+
]
260+
},
261+
{
262+
"cell_type": "code",
263+
"execution_count": 113,
264+
"metadata": {},
265+
"outputs": [],
266+
"source": [
267+
"vi_k = {}\n",
268+
"vi_kk = {}\n",
269+
"vi_w = {}\n",
270+
"vi_mk = {}\n",
271+
"\n",
272+
"\n",
273+
"for i in k[:-1]:\n",
274+
" j = k_clusters_idx['Solution {0}'.format(i)]\n",
275+
" z = k_clusters_idx['Solution {0}'.format(i+1)]\n",
276+
" k_vi = variation_of_information(j, z)\n",
277+
" vi_k[i+1] = k_vi\n",
278+
" \n",
279+
" j = kk_clusters_idx['Solution {0}'.format(i)]\n",
280+
" z = kk_clusters_idx['Solution {0}'.format(i+1)]\n",
281+
" kk_vi = variation_of_information(j, z)\n",
282+
" vi_kk[i+1] = kk_vi\n",
283+
" \n",
284+
" j = w_clusters_idx['Solution {0}'.format(i)]\n",
285+
" z = w_clusters_idx['Solution {0}'.format(i+1)]\n",
286+
" w_vi = variation_of_information(j, z)\n",
287+
" vi_w[i+1] = w_vi\n",
288+
" \n",
289+
"\n",
290+
"wards_vi = pd.Series(vi_w)\n",
291+
"#wards_vi.to_csv('{0}/wards_vi.csv'.format(out_dir), sep=',')\n",
292+
"\n",
293+
"kmeans_vi = pd.Series(vi_k)\n",
294+
"#kmeans_vi.to_csv('{0}/pythonkmean_vi.csv'.format(out_dir), sep=',')\n",
295+
"\n",
296+
"kernelk_vi = pd.Series(vi_kk)\n",
297+
"#kernelk_vi.to_csv('{0}/kernelkmean_vi.csv'.format(out_dir), sep=',')\n",
298+
"\n",
299+
"vi_mk = [0.831299855, 0.894586133, 1.064756853, 0.575203598, 0.854956836, 0.809800235, 0.827846322, 1.277838909]\n",
300+
"matlabk_vi = pd.Series(vi_mk, index=[3, 4, 5, 6, 7, 8, 9, 10])\n",
301+
"#matlabk_vi.to_csv('{0}/matlabkmean_vi.csv'.format(out_dir), sep=',')"
302+
]
303+
},
304+
{
305+
"cell_type": "code",
306+
"execution_count": 132,
307+
"metadata": {},
308+
"outputs": [],
309+
"source": [
310+
"variation_info = pd.DataFrame([kernelk_vi, matlabk_vi, kmeans_vi, wards_vi]).transpose()"
311+
]
312+
},
313+
{
314+
"cell_type": "code",
315+
"execution_count": 133,
316+
"metadata": {},
317+
"outputs": [],
318+
"source": [
319+
"variation_info.to_csv('{0}/variation_of_information.csv'.format(out_dir), sep=',')"
320+
]
321+
}
322+
],
323+
"metadata": {
324+
"kernelspec": {
325+
"display_name": "Python 2",
326+
"language": "python",
327+
"name": "python2"
328+
},
329+
"language_info": {
330+
"codemirror_mode": {
331+
"name": "ipython",
332+
"version": 2
333+
},
334+
"file_extension": ".py",
335+
"mimetype": "text/x-python",
336+
"name": "python",
337+
"nbconvert_exporter": "python",
338+
"pygments_lexer": "ipython2",
339+
"version": "2.7.13"
340+
}
341+
},
342+
"nbformat": 4,
343+
"nbformat_minor": 2
344+
}

0 commit comments

Comments
 (0)