Merge pull request #114 from KrishnaswamyLab/develop

MATLAB MAGIC2
KrishnaswamyLab · Jun 30, 2018 · 21f9f76 · 21f9f76
2 parents e1829d4 + 15bdcd2
commit 21f9f76
Show file tree

Hide file tree

Showing 17 changed files with 615 additions and 343 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/.gitignore b/.gitignore
@@ -20,3 +20,5 @@ python/*.dll
 python/*.egg-info
 python/magic/__pycache__
 python/magic/*.pyc
+.DS_Store
+matlab/EMT.csv
diff --git a/matlab/.DS_Store b/matlab/.DS_Store
diff --git a/matlab/EMT.csv.zip b/matlab/EMT.csv.zip
diff --git a/matlab/compute_kernel.m b/matlab/compute_kernel.m
@@ -0,0 +1,138 @@
+function K = compute_alpha_kernel_sparse(data, varargin)
+% K = computer_alpha_kernel_sparse(data, varargin)
+% Computes sparse alpha-decay kernel
+% varargin: 
+%   'npca' (default = [], no PCA)
+%       Perform fast random PCA before computing distances
+%   'k' (default = 5)
+%       k for the knn distances for the locally adaptive bandwidth
+%   'a' (default = 10)
+%       The alpha exponent in the alpha-decaying kernel
+%   'distfun' (default = 'euclidean')
+%       Input distance function
+k = 5;
+a = 10;
+npca = [];
+distfun = 'euclidean';
+% get the input parameters
+if ~isempty(varargin)
+    for j = 1:length(varargin)
+        % k nearest neighbora
+        if strcmp(varargin{j}, 'k')
+            k = varargin{j+1};
+        end
+        % alpha
+        if strcmp(varargin{j}, 'a')
+            a = varargin{j+1};
+        end
+        % npca to project data
+        if strcmp(varargin{j}, 'npca')
+            npca = varargin{j+1};
+        end
+        % distfun
+        if strcmp(varargin{j}, 'distfun')
+            distfun = varargin{j+1};
+        end
+    end
+end
+
+th = 1e-4;
+
+k_knn = k * 20;
+
+bth=(-log(th))^(1/a);
+
+disp 'Computing alpha decay kernel:'
+
+N = size(data, 1); % number of cells
+
+if ~isempty(npca)
+    disp '   PCA'
+    data_centr = bsxfun(@minus, data, mean(data,1));
+    [U,~,~] = randPCA(data_centr', npca); % fast random svd
+    %[U,~,~] = svds(data', npca);
+    data_pc = data_centr * U; % PCA project
+else
+    data_pc = data;
+end
+
+disp(['Number of samples = ' num2str(N)])
+
+% Initial knn search and set the radius
+disp(['First iteration: k = ' num2str(k_knn)])
+[idx, kdist]=knnsearch(data_pc,data_pc,'k',k_knn,'Distance',distfun);
+epsilon=kdist(:,k+1);
+
+% Find the points that have large enough distance to be below the kernel
+% threshold
+below_thresh=kdist(:,end)>=bth*epsilon;
+
+idx_thresh=find(below_thresh);
+
+if ~isempty(idx_thresh) 
+    K=exp(-(kdist(idx_thresh,:)./epsilon(idx_thresh)).^a);
+    K(K<=th)=0;
+    K=K(:);
+    i = repmat(idx_thresh',1,size(idx,2));
+    i = i(:);
+    idx_temp=idx(idx_thresh,:);
+    j = idx_temp(:);
+end
+
+disp(['Number of samples below the threshold from 1st iter: ' num2str(length(idx_thresh))])
+
+% Loop increasing k by factor of 20 until we cover 90% of the data
+while length(idx_thresh)<.9*N
+    k_knn=min(20*k_knn,N);
+    data_pc2=data_pc(~below_thresh,:);
+    epsilon2=epsilon(~below_thresh);
+    disp(['Next iteration: k= ' num2str(k_knn)])
+    [idx2, kdist2]=knnsearch(data_pc,data_pc2,'k',k_knn,'Distance',distfun);
+
+%     Find the points that have large enough distance
+    below_thresh2=kdist2(:,end)>=bth*epsilon2;
+    idx_thresh2=find(below_thresh2);
+
+    if ~isempty(idx_thresh2)
+        K2=exp(-(kdist2(idx_thresh2,:)./epsilon2(idx_thresh2)).^a);
+        K2(K2<=th)=0;
+        idx_notthresh=find(~below_thresh);
+        i2=repmat(idx_notthresh(idx_thresh2)',1,size(idx2,2));
+        i2=i2(:);
+        idx_temp=idx2(idx_thresh2,:);
+        j2=idx_temp(:);
+
+        i=[i; i2];
+        j=[j; j2];
+        K=[K; K2(:)];
+%         Add the newly thresholded points to the old ones
+        below_thresh(idx_notthresh(idx_thresh2))=1;
+        idx_thresh=find(below_thresh);
+    end
+    disp(['Number of samples below the threshold from the next iter: ' num2str(length(idx_thresh))])
+end
+
+% Radius search for the rest
+if length(idx_thresh)<N
+    disp(['Using radius based search for the rest'])
+    data_pc2=data_pc(~below_thresh,:);
+    epsilon2=epsilon(~below_thresh);
+    [idx2, kdist2]=rangesearch(data_pc,data_pc2,bth*max(epsilon2),'Distance',distfun);
+    idx_notthresh=find(~below_thresh);
+    for m=1:length(idx2)
+        i=[i; idx_notthresh(m)*ones(length(idx2{m}),1)];
+        j=[j; idx2{m}'];
+        K2=exp(-(kdist2{m}./epsilon2(m)).^a);
+        K2(K2<=th)=0;
+        K=[K; K2(:)];
+    end
+
+end
+
+% Build the kernel
+K = sparse(i, j, K);
+
+disp '   Symmetrize affinities'
+K = K + K';
+disp '   Done computing kernel'
+
diff --git a/matlab/compute_optimal_t.m b/matlab/compute_optimal_t.m
@@ -1,80 +1,71 @@
-function [t_opt, r2_vec] = compute_optimal_t(data, DiffOp, varargin)
-% [t_opt, r2_vec] = compute_optimal_t(data, DiffOp, varargin)
-%   data - input data
-%   DiffOp - diffusion operator
-%   varargin:
-%       t_max - max t to try
-%       n_genes - number of random genes to compute optimal t on, should be
-%        at least 100, fewer is faster
-%       make_plots - draw convergence as a function of t with which we
-%        select the optimal t
+function [data_opt_t, t_opt]  = compute_optimal_t(data, DiffOp, varargin)
 
 t_max = 32;
-n_genes = size(data,2);
-make_plots = true;
+make_plot = true;
+th = 1e-3;
+data_opt_t = [];
 
 if ~isempty(varargin)
     for j = 1:length(varargin)
         if strcmp(varargin{j}, 't_max')
             t_max = varargin{j+1};
         end
-        if strcmp(varargin{j}, 'n_genes')
-            n_genes = varargin{j+1};
+        if strcmp(varargin{j}, 'make_plot')
+            make_plot = varargin{j+1};
         end
-        if strcmp(varargin{j}, 'make_plots')
-            make_plots = varargin{j+1};
+        if strcmp(varargin{j}, 'th')
+            th = varargin{j+1};
         end
     end
 end
 
-if ~issparse(DiffOp)
-    DiffOp = sparse(DiffOp);
+data_prev = data;
+if make_plot
+    error_vec = nan(t_max,1);
+    for I=1:t_max
+        disp(['t = ' num2str(I)]);
+        data_curr = DiffOp * data_prev;
+        error_vec(I) = procrustes(data_prev, data_curr);
+        if error_vec(I) < th && isempty(data_opt_t)
+            data_opt_t = data_curr;
+        end
+        data_prev = data_curr;
+    end
+    t_opt = find(error_vec < th, 1, 'first');
+
+    figure;
+    hold all;
+    plot(1:t_max, error_vec, '*-');
+    plot(t_opt, error_vec(t_opt), 'or', 'markersize', 10);
+    xlabel 't'
+    ylabel 'error'
+    axis tight
+    ylim([0 ceil(max(error_vec)*10)/10]);
+    plot(xlim, [th th], '--k');
+    legend({'y' 'optimal t' ['y=' num2str(th)]});
+    set(gca,'xtick',1:t_max);
+    set(gca,'ytick',0:0.1:1);
+else
+    for I=1:t_max
+        disp(['t = ' num2str(I)]);
+        data_curr = DiffOp * data_prev;
+        error = procrustes(data_prev, data_curr);
+        if error < th
+            t_opt = I;
+            data_opt_t = data_curr;
+            break
+        end
+        data_prev = data_curr;
+    end
 end
 
-if n_genes > size(data,2)
-    disp 'n_genes too large, capping n_genes at maximum possible number of genes'
-    n_genes = size(data,2)
-end
+disp(['optimal t = ' num2str(t_opt)]);
+
 
-idx_genes = randsample(size(data,2), n_genes);
-data_imputed = data;
-data_imputed = data_imputed(:,idx_genes);
 
-if min(data_imputed(:)) < 0
-    disp 'data has negative values, shifting to positive'
-    data_imputed = data_imputed - min(data_imputed(:));
-end
 
-r2_vec = nan(t_max,1);
-data_prev = data_imputed;
-data_prev = bsxfun(@rdivide, data_prev, sum(data_prev));
-disp 'computing optimal t'
-for I=1:t_max
-    data_imputed = DiffOp * data_imputed;
-    data_curr = data_imputed;
-    data_curr = bsxfun(@rdivide, data_curr, sum(data_curr));
-    r2 = rsquare(data_prev(:), data_curr(:));
-    r2_vec(I) = 1 - r2;
-    data_prev = data_curr;
-end
 
-t_opt = find(r2_vec < 0.05, 1, 'first') + 1;
 
-disp(['optimal t = ' num2str(t_opt)]);
 
-if make_plots
-    figure;
-    hold all;
-    plot(1:t_max, r2_vec, '*-');
-    plot(t_opt, r2_vec(t_opt), 'or', 'markersize', 10);
-    xlabel 't'
-    ylabel '1 - R^2(data_{t},data_{t-1})'
-    axis tight
-    ylim([0 1]);
-    plot(xlim, [0.05 0.05], '--k');
-    legend({'y' 'optimal t' 'y=0.05'});
-    set(gca,'xtick',1:t_max);
-    set(gca,'ytick',0:0.1:1);
-end
 
 
diff --git a/matlab/compute_optimal_t.m~ b/matlab/compute_optimal_t.m~
@@ -0,0 +1,67 @@
+function [t_opt, error_vec, data_imputed] = compute_optimal_t_fast(data, DiffOp, varargin)
+
+t_max = 20;
+make_plots = true;
+th = 0.001;
+
+if ~isempty(varargin)
+    for j = 1:length(varargin)
+        if strcmp(varargin{j}, 't_max')
+            t_max = varargin{j+1};
+        end
+        if strcmp(varargin{j}, 'make_plots')
+            make_plots = varargin{j+1};
+        end
+    end
+end
+
+if make_plots
+    error_vec = nan(t_max,1);
+    data_prev = data;
+    for I=1:t_max
+        data_curr = DiffOp * data_prev;
+        error_vec(I) = procrustes(data_prev, data_curr);
+        data_prev = data_curr;
+    end
+    t_opt = find(error_vec < th, 1, 'first');
+    disp(['optimal t = ' num2str(t_opt)]);
+end
+
+
+
+
+
+
+error_vec = nan(t_max,1);
+data_prev = data;
+for I=1:t_max
+    data_curr = DiffOp * data_prev;
+    error_vec(I) = procrustes(data_prev, data_curr);
+    if error_vec(I) < th && ~make_plots
+        break
+    end
+    data_prev = data_curr;
+end
+
+t_opt = find(error_vec < th, 1, 'first');
+
+disp(['optimal t = ' num2str(t_opt)]);
+
+data_imputed = 
+
+if make_plots
+    figure;
+    hold all;
+    plot(1:t_max, error_vec, '*-');
+    plot(t_opt, error_vec(t_opt), 'or', 'markersize', 10);
+    xlabel 't'
+    ylabel 'error'
+    axis tight
+    ylim([0 ceil(max(error_vec)*10)/10]);
+    plot(xlim, [th th], '--k');
+    legend({'y' 'optimal t' ['y=' num2str(th)]});
+    set(gca,'xtick',1:t_max);
+    set(gca,'ytick',0:0.1:1);
+end
+
+
diff --git a/matlab/project_genes.m b/matlab/project_genes.m
@@ -0,0 +1,17 @@
+function [M, genes_found, gene_idx] = project_genes(genes, genes_all, pc_imputed, U)
+% project_genes -- obtain gene values from compressed imputed data
+%   [M, genes_found, gene_idx] = project_genes(genes, genes_all, pc_imputed, U) computes
+%   gene values (M) for given gene names (genes) given all gene names (genes_all), loadings
+%   (U), and imputed principal components (pc_imputed).
+%
+%   Since pc_imputed and U are both narrow matrices the imputed data can be
+%   stored in a memory efficient way, without having to store the dense
+%   matrix.
+
+[gene_idx,locb] = ismember(lower(genes_all), lower(genes));
+[~,sidx] = sort(locb(gene_idx));
+idx = find(gene_idx);
+idx = idx(sidx);
+M = pc_imputed * U(idx,:)'; % project
+genes_found = genes_all(idx);
+gene_idx = find(gene_idx);
diff --git a/matlab/project_genes.m~ b/matlab/project_genes.m~
@@ -0,0 +1,13 @@
+function [M, genes_found, gene_idx] = project_genes(genes, genes_all, pc_t, U)
+% project_genes -- obtain gene values from compressed imputed data
+%   [M, genes_found, lia] = project_genes(genes, genes_all, pc_t, U) computes
+%   gene values (M) for genes given all gene names (genes_all), loadings
+%   (U), and imputed principal components (pc_t).
+
+[gene_idx,locb] = ismember(lower(genes_all), lower(genes));
+[~,sidx] = sort(locb(gene_idx));
+idx = find(gene_idx);
+idx = idx(sidx);
+M = pc_t * U(idx,:)';
+genes_found = genes_all(idx);
+gene_idx = find(gene_idx);