|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": 42, |
| 6 | + "metadata": { |
| 7 | + "collapsed": true |
| 8 | + }, |
| 9 | + "outputs": [], |
| 10 | + "source": [ |
| 11 | + "import time\n", |
| 12 | + "from os.path import join, basename\n", |
| 13 | + "import re\n", |
| 14 | + "import matplotlib.pyplot as plt\n", |
| 15 | + "import pylab\n", |
| 16 | + "import pyale\n", |
| 17 | + "import nibabel as nib\n", |
| 18 | + "import numpy as np\n", |
| 19 | + "import pandas as pd\n", |
| 20 | + "from sklearn.cluster import k_means, dbscan, SpectralClustering, FeatureAgglomeration\n", |
| 21 | + "from sklearn.metrics import silhouette_score, normalized_mutual_info_score\n", |
| 22 | + "from scipy.cluster import hierarchy\n", |
| 23 | + "import scipy.io as sio\n", |
| 24 | + "import seaborn as sns\n", |
| 25 | + "%matplotlib inline" |
| 26 | + ] |
| 27 | + }, |
| 28 | + { |
| 29 | + "cell_type": "code", |
| 30 | + "execution_count": 7, |
| 31 | + "metadata": { |
| 32 | + "collapsed": true |
| 33 | + }, |
| 34 | + "outputs": [], |
| 35 | + "source": [ |
| 36 | + "from math import log\n", |
| 37 | + "\n", |
| 38 | + "def variation_of_information(X, Y):\n", |
| 39 | + " #from https://gist.github.com/jwcarr/626cbc80e0006b526688\n", |
| 40 | + " n = float(sum([len(x) for x in X]))\n", |
| 41 | + " sigma = 0.0\n", |
| 42 | + " for x in X:\n", |
| 43 | + " p = len(x) / n\n", |
| 44 | + " for y in Y:\n", |
| 45 | + " q = len(y) / n\n", |
| 46 | + " r = len(set(x) & set(y)) / n\n", |
| 47 | + " if r > 0.0:\n", |
| 48 | + " sigma += r * (log(r / p, 2) + log(r / q, 2))\n", |
| 49 | + " return abs(sigma)" |
| 50 | + ] |
| 51 | + }, |
| 52 | + { |
| 53 | + "cell_type": "code", |
| 54 | + "execution_count": 15, |
| 55 | + "metadata": { |
| 56 | + "collapsed": true |
| 57 | + }, |
| 58 | + "outputs": [], |
| 59 | + "source": [ |
| 60 | + "out_dir = '/Users/Katie/Dropbox/Data/Naturalistic/NewSolution/python_output'" |
| 61 | + ] |
| 62 | + }, |
| 63 | + { |
| 64 | + "cell_type": "code", |
| 65 | + "execution_count": 2, |
| 66 | + "metadata": {}, |
| 67 | + "outputs": [], |
| 68 | + "source": [ |
| 69 | + "matlab_nat = sio.loadmat('/Users/Katie/Dropbox/Data/Naturalistic/NewSolution/Naturalistic_11.25.17.mat')" |
| 70 | + ] |
| 71 | + }, |
| 72 | + { |
| 73 | + "cell_type": "code", |
| 74 | + "execution_count": 10, |
| 75 | + "metadata": {}, |
| 76 | + "outputs": [], |
| 77 | + "source": [ |
| 78 | + "corrmat = matlab_nat['corrmat']\n", |
| 79 | + "corrmat2 = np.corrcoef(corrmat)\n", |
| 80 | + "minus_r = 1 - corrmat2" |
| 81 | + ] |
| 82 | + }, |
| 83 | + { |
| 84 | + "cell_type": "code", |
| 85 | + "execution_count": 11, |
| 86 | + "metadata": {}, |
| 87 | + "outputs": [ |
| 88 | + { |
| 89 | + "name": "stdout", |
| 90 | + "output_type": "stream", |
| 91 | + "text": [ |
| 92 | + "Kernel k-means clustering with k = 2: 1.20s\n", |
| 93 | + "Ward's clustering with k = 2: 0.01s\n", |
| 94 | + "K-means clustering with k = 2: 5.59s\n", |
| 95 | + "Kernel k-means clustering with k = 3: 1.96s\n", |
| 96 | + "Ward's clustering with k = 3: 0.01s\n", |
| 97 | + "K-means clustering with k = 3: 7.44s\n", |
| 98 | + "Kernel k-means clustering with k = 4: 2.74s\n", |
| 99 | + "Ward's clustering with k = 4: 0.01s\n", |
| 100 | + "K-means clustering with k = 4: 7.83s\n", |
| 101 | + "Kernel k-means clustering with k = 5: 2.71s\n", |
| 102 | + "Ward's clustering with k = 5: 0.01s\n", |
| 103 | + "K-means clustering with k = 5: 8.55s\n", |
| 104 | + "Kernel k-means clustering with k = 6: 2.58s\n", |
| 105 | + "Ward's clustering with k = 6: 0.01s\n", |
| 106 | + "K-means clustering with k = 6: 8.52s\n", |
| 107 | + "Kernel k-means clustering with k = 7: 2.93s\n", |
| 108 | + "Ward's clustering with k = 7: 0.01s\n", |
| 109 | + "K-means clustering with k = 7: 9.48s\n", |
| 110 | + "Kernel k-means clustering with k = 8: 3.19s\n", |
| 111 | + "Ward's clustering with k = 8: 0.01s\n", |
| 112 | + "K-means clustering with k = 8: 9.67s\n", |
| 113 | + "Kernel k-means clustering with k = 9: 3.56s\n", |
| 114 | + "Ward's clustering with k = 9: 0.01s\n", |
| 115 | + "K-means clustering with k = 9: 10.26s\n", |
| 116 | + "Kernel k-means clustering with k = 10: 3.58s\n", |
| 117 | + "Ward's clustering with k = 10: 0.01s\n", |
| 118 | + "K-means clustering with k = 10: 10.78s\n" |
| 119 | + ] |
| 120 | + } |
| 121 | + ], |
| 122 | + "source": [ |
| 123 | + "k = np.arange(2, 11)\n", |
| 124 | + "k_labels = {}\n", |
| 125 | + "kk_labels = {}\n", |
| 126 | + "w_labels = {}\n", |
| 127 | + "\n", |
| 128 | + "for i in k:\n", |
| 129 | + " #Kernel Kmeans nonlinear\n", |
| 130 | + " start = time.time()\n", |
| 131 | + " kernelk = SpectralClustering(n_clusters=i, eigen_solver=None, n_init=1000, gamma=1.0, affinity='rbf', assign_labels='kmeans', degree=3, coef0=1, n_jobs=1)\n", |
| 132 | + " labels = kernelk.fit_predict(minus_r)\n", |
| 133 | + " print \"Kernel k-means clustering with k = {0}: %.2fs\".format(i) % (time.time() - start) \n", |
| 134 | + " #np.savetxt('/Users/Katie/Dropbox/Data/Naturalistic/NewSolution/kernelk_labels_{0}.txt'.format(i), labels, delimiter='\\t')\n", |
| 135 | + " kk_labels[i] = labels\n", |
| 136 | + "\n", |
| 137 | + " #Ward's\n", |
| 138 | + " #calculate pdist (1 - r) \n", |
| 139 | + " start = time.time()\n", |
| 140 | + " ward = FeatureAgglomeration(n_clusters=i, linkage='ward', memory='nilearn_cache')\n", |
| 141 | + " ward.fit(minus_r)\n", |
| 142 | + " #np.savetxt('/Users/Katie/Dropbox/Data/Naturalistic/NewSolution/ward_labels_{0}.txt'.format(i), ward.labels_, delimiter='\\t')\n", |
| 143 | + " w_labels[i] = ward.labels_\n", |
| 144 | + " print \"Ward's clustering with k = {0}: %.2fs\".format(i) % (time.time() - start)\n", |
| 145 | + "\n", |
| 146 | + "\n", |
| 147 | + " #KMeans clustering\n", |
| 148 | + " start = time.time()\n", |
| 149 | + " [centroid, label, inertia] = k_means(minus_r, i, init='k-means++', precompute_distances='auto',\n", |
| 150 | + " n_init=1000, max_iter=1023, verbose=False, tol=0.0001,\n", |
| 151 | + " random_state=None, copy_x=True, n_jobs=1, algorithm='auto',\n", |
| 152 | + " return_n_iter=False)\n", |
| 153 | + " k_labels[i] = label\n", |
| 154 | + " #np.savetxt('/Users/Katie/Dropbox/Data/Naturalistic/NewSolution/kmeans_labels_{0}.txt'.format(i), label, delimiter='\\t')\n", |
| 155 | + " print \"K-means clustering with k = {0}: %.2fs\".format(i) % (time.time() - start)\n", |
| 156 | + " " |
| 157 | + ] |
| 158 | + }, |
| 159 | + { |
| 160 | + "cell_type": "code", |
| 161 | + "execution_count": 5, |
| 162 | + "metadata": {}, |
| 163 | + "outputs": [], |
| 164 | + "source": [ |
| 165 | + "from sklearn import cluster\n", |
| 166 | + "\n", |
| 167 | + "eps = np.arange(0.01, 2, 0.01)\n", |
| 168 | + "db_labels = {}\n", |
| 169 | + "\n", |
| 170 | + "for i in eps:\n", |
| 171 | + " [core, labels] = cluster.dbscan(minus_r, eps=i, metric='precomputed')\n", |
| 172 | + " db_labels['eps = {0}'.format(i)] = labels\n", |
| 173 | + "\n", |
| 174 | + "dbscan_out = pd.Series(db_labels)\n", |
| 175 | + "dbscan_out.to_csv('{0}/dbscan.txt'.format(out_dir), sep='\\t')" |
| 176 | + ] |
| 177 | + }, |
| 178 | + { |
| 179 | + "cell_type": "code", |
| 180 | + "execution_count": 43, |
| 181 | + "metadata": {}, |
| 182 | + "outputs": [], |
| 183 | + "source": [ |
| 184 | + "k_sils = []\n", |
| 185 | + "kk_sils = []\n", |
| 186 | + "w_sils = []\n", |
| 187 | + "matlab_sils = []\n", |
| 188 | + "mk_kk_mutual_info = []\n", |
| 189 | + "mk_w_mutual_info = []\n", |
| 190 | + "mk_pk_mutual_info = []\n", |
| 191 | + "\n", |
| 192 | + "\n", |
| 193 | + "k = np.arange(2, 11)\n", |
| 194 | + "\n", |
| 195 | + "for i in k:\n", |
| 196 | + " j = i-2\n", |
| 197 | + " \n", |
| 198 | + " matlab_soln = matlab_nat['IDX'][0, j].flatten()\n", |
| 199 | + " matlab_silhouette = silhouette_score(minus_r, matlab_soln, metric='precomputed', random_state=None)\n", |
| 200 | + " matlab_sils.append(matlab_silhouette)\n", |
| 201 | + " \n", |
| 202 | + " \n", |
| 203 | + " k_silhouette = silhouette_score(minus_r, k_labels[i], metric='precomputed', random_state=None)\n", |
| 204 | + " k_sils.append(k_silhouette)\n", |
| 205 | + "\n", |
| 206 | + " kk_silhouette = silhouette_score(minus_r, kk_labels[i], metric='precomputed', random_state=None)\n", |
| 207 | + " kk_sils.append(kk_silhouette)\n", |
| 208 | + "\n", |
| 209 | + " w_silhouette = silhouette_score(minus_r, w_labels[i], metric='precomputed', random_state=None)\n", |
| 210 | + " w_sils.append(w_silhouette)\n", |
| 211 | + " \n", |
| 212 | + " k_labels[i].reshape(-1, 1)\n", |
| 213 | + " kk_labels[i].reshape(-1, 1)\n", |
| 214 | + " w_labels[i].reshape(-1, 1)\n", |
| 215 | + "\n", |
| 216 | + " mk_kk_nmi = normalized_mutual_info_score(matlab_soln, kk_labels[i])\n", |
| 217 | + " mk_kk_mutual_info.append(mk_kk_nmi)\n", |
| 218 | + " \n", |
| 219 | + " mk_pk_nmi = normalized_mutual_info_score(k_labels[i], matlab_soln)\n", |
| 220 | + " mk_pk_mutual_info.append(mk_pk_nmi)\n", |
| 221 | + "\n", |
| 222 | + " mk_w_nmi = normalized_mutual_info_score(matlab_soln, w_labels[i])\n", |
| 223 | + " mk_w_mutual_info.append(mk_w_nmi)\n", |
| 224 | + "\n", |
| 225 | + "np.savetxt('{0}/wards_silhouettes.txt'.format(out_dir), w_sils, delimiter='\\t')\n", |
| 226 | + "np.savetxt('{0}/kernel_kmean_silhouettes.txt'.format(out_dir), kk_sils, delimiter='\\t')\n", |
| 227 | + "np.savetxt('{0}/python_kmean_silhouettes.txt'.format(out_dir), k_sils, delimiter='\\t')\n", |
| 228 | + "np.savetxt('{0}/matlab_kmean_silhouettes.txt'.format(out_dir), matlab_sils, delimiter='\\t')\n", |
| 229 | + "\n", |
| 230 | + "np.savetxt('{0}/mk_kk_nmi.txt'.format(out_dir), mk_kk_mutual_info, delimiter='\\t')\n", |
| 231 | + "np.savetxt('{0}/mk_w_nmi.txt'.format(out_dir), mk_w_mutual_info, delimiter='\\t')\n", |
| 232 | + "np.savetxt('{0}/mk_pk_nmi.txt'.format(out_dir), mk_pk_mutual_info, delimiter='\\t')" |
| 233 | + ] |
| 234 | + }, |
| 235 | + { |
| 236 | + "cell_type": "code", |
| 237 | + "execution_count": 8, |
| 238 | + "metadata": { |
| 239 | + "collapsed": true |
| 240 | + }, |
| 241 | + "outputs": [], |
| 242 | + "source": [ |
| 243 | + "k_clusters_idx = {}\n", |
| 244 | + "kk_clusters_idx = {}\n", |
| 245 | + "w_clusters_idx = {}\n", |
| 246 | + "mk_clusters_idx = \n", |
| 247 | + "\n", |
| 248 | + "for i in k:\n", |
| 249 | + " k_clusters = []\n", |
| 250 | + " kk_clusters = []\n", |
| 251 | + " w_clusters = []\n", |
| 252 | + " for j in range(i):\n", |
| 253 | + " k_clusters.append(list(np.where(k_labels[i-2] == j)[0]))\n", |
| 254 | + " kk_clusters.append(list(np.where(kk_labels[i-2] == j)[0]))\n", |
| 255 | + " w_clusters.append(list(np.where(w_labels[i-2] == j)[0]))\n", |
| 256 | + " k_clusters_idx['Solution {0}'.format(i)] = k_clusters\n", |
| 257 | + " kk_clusters_idx['Solution {0}'.format(i)] = kk_clusters\n", |
| 258 | + " w_clusters_idx['Solution {0}'.format(i)] = w_clusters" |
| 259 | + ] |
| 260 | + }, |
| 261 | + { |
| 262 | + "cell_type": "code", |
| 263 | + "execution_count": 113, |
| 264 | + "metadata": {}, |
| 265 | + "outputs": [], |
| 266 | + "source": [ |
| 267 | + "vi_k = {}\n", |
| 268 | + "vi_kk = {}\n", |
| 269 | + "vi_w = {}\n", |
| 270 | + "vi_mk = {}\n", |
| 271 | + "\n", |
| 272 | + "\n", |
| 273 | + "for i in k[:-1]:\n", |
| 274 | + " j = k_clusters_idx['Solution {0}'.format(i)]\n", |
| 275 | + " z = k_clusters_idx['Solution {0}'.format(i+1)]\n", |
| 276 | + " k_vi = variation_of_information(j, z)\n", |
| 277 | + " vi_k[i+1] = k_vi\n", |
| 278 | + " \n", |
| 279 | + " j = kk_clusters_idx['Solution {0}'.format(i)]\n", |
| 280 | + " z = kk_clusters_idx['Solution {0}'.format(i+1)]\n", |
| 281 | + " kk_vi = variation_of_information(j, z)\n", |
| 282 | + " vi_kk[i+1] = kk_vi\n", |
| 283 | + " \n", |
| 284 | + " j = w_clusters_idx['Solution {0}'.format(i)]\n", |
| 285 | + " z = w_clusters_idx['Solution {0}'.format(i+1)]\n", |
| 286 | + " w_vi = variation_of_information(j, z)\n", |
| 287 | + " vi_w[i+1] = w_vi\n", |
| 288 | + " \n", |
| 289 | + "\n", |
| 290 | + "wards_vi = pd.Series(vi_w)\n", |
| 291 | + "#wards_vi.to_csv('{0}/wards_vi.csv'.format(out_dir), sep=',')\n", |
| 292 | + "\n", |
| 293 | + "kmeans_vi = pd.Series(vi_k)\n", |
| 294 | + "#kmeans_vi.to_csv('{0}/pythonkmean_vi.csv'.format(out_dir), sep=',')\n", |
| 295 | + "\n", |
| 296 | + "kernelk_vi = pd.Series(vi_kk)\n", |
| 297 | + "#kernelk_vi.to_csv('{0}/kernelkmean_vi.csv'.format(out_dir), sep=',')\n", |
| 298 | + "\n", |
| 299 | + "vi_mk = [0.831299855, 0.894586133, 1.064756853, 0.575203598, 0.854956836, 0.809800235, 0.827846322, 1.277838909]\n", |
| 300 | + "matlabk_vi = pd.Series(vi_mk, index=[3, 4, 5, 6, 7, 8, 9, 10])\n", |
| 301 | + "#matlabk_vi.to_csv('{0}/matlabkmean_vi.csv'.format(out_dir), sep=',')" |
| 302 | + ] |
| 303 | + }, |
| 304 | + { |
| 305 | + "cell_type": "code", |
| 306 | + "execution_count": 132, |
| 307 | + "metadata": {}, |
| 308 | + "outputs": [], |
| 309 | + "source": [ |
| 310 | + "variation_info = pd.DataFrame([kernelk_vi, matlabk_vi, kmeans_vi, wards_vi]).transpose()" |
| 311 | + ] |
| 312 | + }, |
| 313 | + { |
| 314 | + "cell_type": "code", |
| 315 | + "execution_count": 133, |
| 316 | + "metadata": {}, |
| 317 | + "outputs": [], |
| 318 | + "source": [ |
| 319 | + "variation_info.to_csv('{0}/variation_of_information.csv'.format(out_dir), sep=',')" |
| 320 | + ] |
| 321 | + } |
| 322 | + ], |
| 323 | + "metadata": { |
| 324 | + "kernelspec": { |
| 325 | + "display_name": "Python 2", |
| 326 | + "language": "python", |
| 327 | + "name": "python2" |
| 328 | + }, |
| 329 | + "language_info": { |
| 330 | + "codemirror_mode": { |
| 331 | + "name": "ipython", |
| 332 | + "version": 2 |
| 333 | + }, |
| 334 | + "file_extension": ".py", |
| 335 | + "mimetype": "text/x-python", |
| 336 | + "name": "python", |
| 337 | + "nbconvert_exporter": "python", |
| 338 | + "pygments_lexer": "ipython2", |
| 339 | + "version": "2.7.13" |
| 340 | + } |
| 341 | + }, |
| 342 | + "nbformat": 4, |
| 343 | + "nbformat_minor": 2 |
| 344 | +} |
0 commit comments