Skip to content

Commit

Permalink
Merge pull request #114 from KrishnaswamyLab/develop
Browse files Browse the repository at this point in the history
MATLAB MAGIC2
  • Loading branch information
scottgigante authored Jun 30, 2018
2 parents e1829d4 + 15bdcd2 commit 21f9f76
Show file tree
Hide file tree
Showing 17 changed files with 615 additions and 343 deletions.
Binary file added .DS_Store
Binary file not shown.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,5 @@ python/*.dll
python/*.egg-info
python/magic/__pycache__
python/magic/*.pyc
.DS_Store
matlab/EMT.csv
Binary file modified matlab/.DS_Store
Binary file not shown.
Binary file removed matlab/EMT.csv.zip
Binary file not shown.
138 changes: 138 additions & 0 deletions matlab/compute_kernel.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
function K = compute_alpha_kernel_sparse(data, varargin)
% K = computer_alpha_kernel_sparse(data, varargin)
% Computes sparse alpha-decay kernel
% varargin:
% 'npca' (default = [], no PCA)
% Perform fast random PCA before computing distances
% 'k' (default = 5)
% k for the knn distances for the locally adaptive bandwidth
% 'a' (default = 10)
% The alpha exponent in the alpha-decaying kernel
% 'distfun' (default = 'euclidean')
% Input distance function
k = 5;
a = 10;
npca = [];
distfun = 'euclidean';
% get the input parameters
if ~isempty(varargin)
for j = 1:length(varargin)
% k nearest neighbora
if strcmp(varargin{j}, 'k')
k = varargin{j+1};
end
% alpha
if strcmp(varargin{j}, 'a')
a = varargin{j+1};
end
% npca to project data
if strcmp(varargin{j}, 'npca')
npca = varargin{j+1};
end
% distfun
if strcmp(varargin{j}, 'distfun')
distfun = varargin{j+1};
end
end
end

th = 1e-4;

k_knn = k * 20;

bth=(-log(th))^(1/a);

disp 'Computing alpha decay kernel:'

N = size(data, 1); % number of cells

if ~isempty(npca)
disp ' PCA'
data_centr = bsxfun(@minus, data, mean(data,1));
[U,~,~] = randPCA(data_centr', npca); % fast random svd
%[U,~,~] = svds(data', npca);
data_pc = data_centr * U; % PCA project
else
data_pc = data;
end

disp(['Number of samples = ' num2str(N)])

% Initial knn search and set the radius
disp(['First iteration: k = ' num2str(k_knn)])
[idx, kdist]=knnsearch(data_pc,data_pc,'k',k_knn,'Distance',distfun);
epsilon=kdist(:,k+1);

% Find the points that have large enough distance to be below the kernel
% threshold
below_thresh=kdist(:,end)>=bth*epsilon;

idx_thresh=find(below_thresh);

if ~isempty(idx_thresh)
K=exp(-(kdist(idx_thresh,:)./epsilon(idx_thresh)).^a);
K(K<=th)=0;
K=K(:);
i = repmat(idx_thresh',1,size(idx,2));
i = i(:);
idx_temp=idx(idx_thresh,:);
j = idx_temp(:);
end

disp(['Number of samples below the threshold from 1st iter: ' num2str(length(idx_thresh))])

% Loop increasing k by factor of 20 until we cover 90% of the data
while length(idx_thresh)<.9*N
k_knn=min(20*k_knn,N);
data_pc2=data_pc(~below_thresh,:);
epsilon2=epsilon(~below_thresh);
disp(['Next iteration: k= ' num2str(k_knn)])
[idx2, kdist2]=knnsearch(data_pc,data_pc2,'k',k_knn,'Distance',distfun);

% Find the points that have large enough distance
below_thresh2=kdist2(:,end)>=bth*epsilon2;
idx_thresh2=find(below_thresh2);

if ~isempty(idx_thresh2)
K2=exp(-(kdist2(idx_thresh2,:)./epsilon2(idx_thresh2)).^a);
K2(K2<=th)=0;
idx_notthresh=find(~below_thresh);
i2=repmat(idx_notthresh(idx_thresh2)',1,size(idx2,2));
i2=i2(:);
idx_temp=idx2(idx_thresh2,:);
j2=idx_temp(:);

i=[i; i2];
j=[j; j2];
K=[K; K2(:)];
% Add the newly thresholded points to the old ones
below_thresh(idx_notthresh(idx_thresh2))=1;
idx_thresh=find(below_thresh);
end
disp(['Number of samples below the threshold from the next iter: ' num2str(length(idx_thresh))])
end

% Radius search for the rest
if length(idx_thresh)<N
disp(['Using radius based search for the rest'])
data_pc2=data_pc(~below_thresh,:);
epsilon2=epsilon(~below_thresh);
[idx2, kdist2]=rangesearch(data_pc,data_pc2,bth*max(epsilon2),'Distance',distfun);
idx_notthresh=find(~below_thresh);
for m=1:length(idx2)
i=[i; idx_notthresh(m)*ones(length(idx2{m}),1)];
j=[j; idx2{m}'];
K2=exp(-(kdist2{m}./epsilon2(m)).^a);
K2(K2<=th)=0;
K=[K; K2(:)];
end

end

% Build the kernel
K = sparse(i, j, K);

disp ' Symmetrize affinities'
K = K + K';
disp ' Done computing kernel'

105 changes: 48 additions & 57 deletions matlab/compute_optimal_t.m
Original file line number Diff line number Diff line change
@@ -1,80 +1,71 @@
function [t_opt, r2_vec] = compute_optimal_t(data, DiffOp, varargin)
% [t_opt, r2_vec] = compute_optimal_t(data, DiffOp, varargin)
% data - input data
% DiffOp - diffusion operator
% varargin:
% t_max - max t to try
% n_genes - number of random genes to compute optimal t on, should be
% at least 100, fewer is faster
% make_plots - draw convergence as a function of t with which we
% select the optimal t
function [data_opt_t, t_opt] = compute_optimal_t(data, DiffOp, varargin)

t_max = 32;
n_genes = size(data,2);
make_plots = true;
make_plot = true;
th = 1e-3;
data_opt_t = [];

if ~isempty(varargin)
for j = 1:length(varargin)
if strcmp(varargin{j}, 't_max')
t_max = varargin{j+1};
end
if strcmp(varargin{j}, 'n_genes')
n_genes = varargin{j+1};
if strcmp(varargin{j}, 'make_plot')
make_plot = varargin{j+1};
end
if strcmp(varargin{j}, 'make_plots')
make_plots = varargin{j+1};
if strcmp(varargin{j}, 'th')
th = varargin{j+1};
end
end
end

if ~issparse(DiffOp)
DiffOp = sparse(DiffOp);
data_prev = data;
if make_plot
error_vec = nan(t_max,1);
for I=1:t_max
disp(['t = ' num2str(I)]);
data_curr = DiffOp * data_prev;
error_vec(I) = procrustes(data_prev, data_curr);
if error_vec(I) < th && isempty(data_opt_t)
data_opt_t = data_curr;
end
data_prev = data_curr;
end
t_opt = find(error_vec < th, 1, 'first');

figure;
hold all;
plot(1:t_max, error_vec, '*-');
plot(t_opt, error_vec(t_opt), 'or', 'markersize', 10);
xlabel 't'
ylabel 'error'
axis tight
ylim([0 ceil(max(error_vec)*10)/10]);
plot(xlim, [th th], '--k');
legend({'y' 'optimal t' ['y=' num2str(th)]});
set(gca,'xtick',1:t_max);
set(gca,'ytick',0:0.1:1);
else
for I=1:t_max
disp(['t = ' num2str(I)]);
data_curr = DiffOp * data_prev;
error = procrustes(data_prev, data_curr);
if error < th
t_opt = I;
data_opt_t = data_curr;
break
end
data_prev = data_curr;
end
end

if n_genes > size(data,2)
disp 'n_genes too large, capping n_genes at maximum possible number of genes'
n_genes = size(data,2)
end
disp(['optimal t = ' num2str(t_opt)]);


idx_genes = randsample(size(data,2), n_genes);
data_imputed = data;
data_imputed = data_imputed(:,idx_genes);

if min(data_imputed(:)) < 0
disp 'data has negative values, shifting to positive'
data_imputed = data_imputed - min(data_imputed(:));
end

r2_vec = nan(t_max,1);
data_prev = data_imputed;
data_prev = bsxfun(@rdivide, data_prev, sum(data_prev));
disp 'computing optimal t'
for I=1:t_max
data_imputed = DiffOp * data_imputed;
data_curr = data_imputed;
data_curr = bsxfun(@rdivide, data_curr, sum(data_curr));
r2 = rsquare(data_prev(:), data_curr(:));
r2_vec(I) = 1 - r2;
data_prev = data_curr;
end

t_opt = find(r2_vec < 0.05, 1, 'first') + 1;

disp(['optimal t = ' num2str(t_opt)]);

if make_plots
figure;
hold all;
plot(1:t_max, r2_vec, '*-');
plot(t_opt, r2_vec(t_opt), 'or', 'markersize', 10);
xlabel 't'
ylabel '1 - R^2(data_{t},data_{t-1})'
axis tight
ylim([0 1]);
plot(xlim, [0.05 0.05], '--k');
legend({'y' 'optimal t' 'y=0.05'});
set(gca,'xtick',1:t_max);
set(gca,'ytick',0:0.1:1);
end


67 changes: 67 additions & 0 deletions matlab/compute_optimal_t.m~
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
function [t_opt, error_vec, data_imputed] = compute_optimal_t_fast(data, DiffOp, varargin)

t_max = 20;
make_plots = true;
th = 0.001;

if ~isempty(varargin)
for j = 1:length(varargin)
if strcmp(varargin{j}, 't_max')
t_max = varargin{j+1};
end
if strcmp(varargin{j}, 'make_plots')
make_plots = varargin{j+1};
end
end
end

if make_plots
error_vec = nan(t_max,1);
data_prev = data;
for I=1:t_max
data_curr = DiffOp * data_prev;
error_vec(I) = procrustes(data_prev, data_curr);
data_prev = data_curr;
end
t_opt = find(error_vec < th, 1, 'first');
disp(['optimal t = ' num2str(t_opt)]);
end






error_vec = nan(t_max,1);
data_prev = data;
for I=1:t_max
data_curr = DiffOp * data_prev;
error_vec(I) = procrustes(data_prev, data_curr);
if error_vec(I) < th && ~make_plots
break
end
data_prev = data_curr;
end

t_opt = find(error_vec < th, 1, 'first');

disp(['optimal t = ' num2str(t_opt)]);

data_imputed =

if make_plots
figure;
hold all;
plot(1:t_max, error_vec, '*-');
plot(t_opt, error_vec(t_opt), 'or', 'markersize', 10);
xlabel 't'
ylabel 'error'
axis tight
ylim([0 ceil(max(error_vec)*10)/10]);
plot(xlim, [th th], '--k');
legend({'y' 'optimal t' ['y=' num2str(th)]});
set(gca,'xtick',1:t_max);
set(gca,'ytick',0:0.1:1);
end


17 changes: 17 additions & 0 deletions matlab/project_genes.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
function [M, genes_found, gene_idx] = project_genes(genes, genes_all, pc_imputed, U)
% project_genes -- obtain gene values from compressed imputed data
% [M, genes_found, gene_idx] = project_genes(genes, genes_all, pc_imputed, U) computes
% gene values (M) for given gene names (genes) given all gene names (genes_all), loadings
% (U), and imputed principal components (pc_imputed).
%
% Since pc_imputed and U are both narrow matrices the imputed data can be
% stored in a memory efficient way, without having to store the dense
% matrix.

[gene_idx,locb] = ismember(lower(genes_all), lower(genes));
[~,sidx] = sort(locb(gene_idx));
idx = find(gene_idx);
idx = idx(sidx);
M = pc_imputed * U(idx,:)'; % project
genes_found = genes_all(idx);
gene_idx = find(gene_idx);
13 changes: 13 additions & 0 deletions matlab/project_genes.m~
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
function [M, genes_found, gene_idx] = project_genes(genes, genes_all, pc_t, U)
% project_genes -- obtain gene values from compressed imputed data
% [M, genes_found, lia] = project_genes(genes, genes_all, pc_t, U) computes
% gene values (M) for genes given all gene names (genes_all), loadings
% (U), and imputed principal components (pc_t).

[gene_idx,locb] = ismember(lower(genes_all), lower(genes));
[~,sidx] = sort(locb(gene_idx));
idx = find(gene_idx);
idx = idx(sidx);
M = pc_t * U(idx,:)';
genes_found = genes_all(idx);
gene_idx = find(gene_idx);
Loading

0 comments on commit 21f9f76

Please sign in to comment.