Skip to content

Commit b53d723

Browse files
authored
Merge pull request #21 from pkathail/develop
Latest develop changes
2 parents 2054d0e + b7873e9 commit b53d723

File tree

11 files changed

+1008
-551
lines changed

11 files changed

+1008
-551
lines changed

README.md

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,29 @@
11
Markov Affinity-based Graph Imputation of Cells (MAGIC)
22
-------------------------------------------------------
33

4+
MAGIC has been implemented in Python3 and Matlab.
5+
6+
#### Installation and dependencies for the Python version
7+
1. The Python3 version of MAGIC can be installed using:
48

5-
#### Installation and dependencies
6-
1. MAGIC has been implemented in Python3 and can be installed using
7-
2.
89
$> git clone git://github.com/pkathail/magic.git
910
$> cd magic
10-
$> sudo pip3 install .
11+
$> sudo -H pip3 install .
1112

1213
2. MAGIC depends on a number of `python3` packages available on pypi and these dependencies are listed in `setup.py`
1314
All the dependencies will be automatically installed using the above commands
1415

1516
#### Usage
1617

1718
##### Command line
18-
A tutorial on MAGIC usage and results visualization for single cell RNA-seq data can be found in this notebook: http://nbviewer.jupyter.org/github/pkathail/magic/blob/magic_develop/notebooks/Magic_single_cell_RNAseq.ipynb
19+
A tutorial on MAGIC usage and results visualization for single cell RNA-seq data can be found in this notebook: http://nbviewer.jupyter.org/github/pkathail/magic/blob/develop/notebooks/Magic_single_cell_RNAseq.ipynb
1920

2021

2122
##### GUI
2223
A Python GUI is now available for MAGIC. After following the Python installation steps listed above, the GUI can be invoked using
2324

2425
$> magic_gui.py
2526

27+
#### Installation and dependencies for the Matlab version
28+
1. The Matlab implementation of MAGIC uses Mauro Maggioni's Diffusion Geometry code. Download it from here: http://www.math.jhu.edu/~mauro/Code/DiffusionGeometry_01.zip or use the included DiffusionGeometry_01.zip
29+
2. test_magic.m shows how to run MAGIC. Also included is a function for loading 10x format data (load_10x.m)

docs/magic_tutorial.pptx

4.44 MB
Binary file not shown.

matlab/DiffusionGeometry_01.zip

1.13 MB
Binary file not shown.

matlab/load_10x.m

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
function [data, gene_names, gene_ids, cells] = load_10x(data_dir, varargin)
2+
% [data, gene_names, gene_ids, cells] = load_10x(data_dir, varargin)
3+
% loads 10x sparse format data
4+
% data_dir is dir that contains matrix.mtx, genes.tsv and barcodes.tsv
5+
% varargin
6+
% 'sparse', true -- returns data matrix in sparse format (default: logical false)
7+
8+
return_sparse = false;
9+
10+
if isempty(data_dir)
11+
data_dir = './';
12+
elseif data_dir(end) ~= '/'
13+
data_dir = [data_dir '/'];
14+
end
15+
16+
for i=1:length(varargin)-1
17+
if (strcmp(varargin{i}, 'sparse'))
18+
return_sparse = varargin{i+1};
19+
end
20+
end
21+
22+
filename_dataMatrix = [data_dir 'matrix.mtx'];
23+
filename_genes = [data_dir 'genes.tsv'];
24+
filename_cells = [data_dir 'barcodes.tsv'];
25+
26+
27+
% Read in gene expression matrix (sparse matrix)
28+
% Rows = genes, columns = cells
29+
fprintf('LOADING\n')
30+
dataMatrix = mmread(filename_dataMatrix);
31+
fprintf(' Data matrix (%i cells x %i genes): %s\n', ...
32+
size(dataMatrix'), ['''' filename_dataMatrix '''' ])
33+
34+
% Read in row names (gene names / IDs)
35+
dataMatrix_genes = table2cell( ...
36+
readtable(filename_genes, ...
37+
'FileType','text','ReadVariableNames',0));
38+
dataMatrix_cells = table2cell( ...
39+
readtable(filename_cells, ...
40+
'FileType','text','ReadVariableNames',0));
41+
42+
% Remove empty cells
43+
col_keep = any(dataMatrix,1);
44+
dataMatrix = dataMatrix(:,col_keep);
45+
dataMatrix_cells = dataMatrix_cells(col_keep,:);
46+
fprintf(' Removed %i empty cells\n', full(sum(~col_keep)))
47+
48+
% Remove empty genes
49+
genes_keep = any(dataMatrix,2);
50+
dataMatrix = dataMatrix(genes_keep,:);
51+
dataMatrix_genes = dataMatrix_genes(genes_keep,:);
52+
fprintf(' Removed %i empty genes\n', full(sum(~genes_keep)))
53+
54+
data = dataMatrix';
55+
if ~return_sparse
56+
data = full(data);
57+
end
58+
gene_names = dataMatrix_genes(:,2);
59+
gene_ids = dataMatrix_genes(:,1);
60+
cells = dataMatrix_cells;

matlab/run_magic.m

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
function data_imputed = run_magic(data, t, varargin)
2+
% run MAGIC
3+
4+
% data must have cells on the rows and genes on the columns
5+
% t is diffusion time
6+
% varargin:
7+
% 'npca' (default = 20)
8+
% perform fast random PCA before computing distances
9+
% 'ka' (default = 10)
10+
% k of adaptive kernel
11+
% 0 for non-adaptive (standard gaussian) kernel with bandwidth sigma
12+
% 'k' (default = 30)
13+
% k of kNN graph
14+
% 'sigma' (default = 1)
15+
% sigma of kernel bandwidth
16+
% for adaptive kernel (ka>0) the effective bandwidth is sigma * the
17+
% distance to the ka-th neighbor
18+
% 'rescale_to' (default = 0, no rescale)
19+
% rescale genes to 'rescale_to' percentile
20+
% set to 0 for log scaled data
21+
22+
% set up default parameters
23+
k = 30;
24+
ka = 10;
25+
npca = 20;
26+
sigma = 1;
27+
rescale_to = 0;
28+
29+
% get the input parameters
30+
if ~isempty(varargin)
31+
for j = 1:length(varargin)
32+
% ka: k of the adaptive kernel (kNN autotune)
33+
if strcmp(varargin{j}, 'ka')
34+
ka = varargin{j+1};
35+
end
36+
% k: number of nearest neighbors in the kNN graph
37+
if strcmp(varargin{j}, 'k')
38+
k = varargin{j+1};
39+
end
40+
% npca to project data
41+
if strcmp(varargin{j}, 'npca')
42+
npca = varargin{j+1};
43+
end
44+
% sigma of kernel bandwidth
45+
if strcmp(varargin{j}, 'sigma')
46+
sigma = varargin{j+1};
47+
end
48+
% percentile to rescale genes to (0 = no rescale)
49+
if strcmp(varargin{j}, 'rescale_to')
50+
rescale_to = varargin{j+1};
51+
end
52+
end
53+
end
54+
55+
% Kernel
56+
disp 'Computing kernel'
57+
Options.Display = 1;
58+
Options.Epsilon = sigma;
59+
Options.kNN = k;
60+
Options.kNNAutotune = ka;
61+
Options.NNMaxDim = npca;
62+
Options.Normalization = 'markov';
63+
G = GraphDiffusion(data', 0, Options);
64+
L = full(G.T);
65+
66+
% Diffuse
67+
disp(['Diffusing for ' num2str(t) ' steps']);
68+
L_t = L^t;
69+
70+
% Impute
71+
disp 'Imputing'
72+
data_imputed = L_t * data;
73+
74+
% Rescale
75+
if rescale_to > 0
76+
if ~any(data(:)<0)
77+
disp 'Rescaling'
78+
MR = prctile(data, rescale_to);
79+
M = max(data);
80+
MR(MR == 0) = M(MR == 0);
81+
MR_new = prctile(data_imputed, rescale_to);
82+
M_new = max(data_imputed);
83+
MR_new(MR_new == 0) = M_new(MR_new == 0);
84+
max_ratio = MR ./ MR_new;
85+
data_imputed = data_imputed .* repmat(max_ratio, size(data,1), 1);
86+
else
87+
disp('Negative values detected (log scaled?) so no rescale is done.')
88+
end
89+
end
90+
91+
disp 'done'

matlab/test_magic.m

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
%% init
2+
% Matlab implementation of MAGIC uses Mauro Maggioni's Diffusion Geometry code
3+
% download from: http://www.math.jhu.edu/~mauro/Code/DiffusionGeometry_01.zip
4+
addpath(genpath('DiffusionGeometry/'));
5+
6+
%% load data (e.g. 10x data)
7+
% data should be cells as rows and genes as columns
8+
sample_dir = 'path_to_data/';
9+
[data, gene_names, gene_ids, cells] = load_10x(sample_dir);
10+
11+
%% library size normalization
12+
libsize = sum(data,2);
13+
data = bsxfun(@rdivide, data, libsize) * median(libsize);
14+
15+
%% log transform -- some data requires log transform
16+
%data = log(data + 0.1); % 0.1 is pseudocount
17+
18+
%% MAGIC
19+
npca = 20; % usually between 10 and 200
20+
ka = 10; % can be smaller, eg 3
21+
k = 30; % can be smaller, eg 9
22+
t = 6; % usually between 6 and 12, smaller ka/k requires bigger t
23+
rescale_to = 99; % 0 (no rescale) if data is log scaled
24+
data_imputed = run_magic(data, t, 'npca', npca, 'ka', ka, 'k', k, 'rescale_to', rescale_to);
25+
26+
%% plot
27+
plot_genes = {'Cdh1', 'Vim', 'Fn1', 'Zeb1'};
28+
ms = 20;
29+
v = [-45 20];
30+
% before MAGIC
31+
x = data(:, ismember(lower(gene_names), lower(plot_genes{1})));
32+
y = data(:, ismember(lower(gene_names), lower(plot_genes{2})));
33+
z = data(:, ismember(lower(gene_names), lower(plot_genes{3})));
34+
c = data(:, ismember(lower(gene_names), lower(plot_genes{4})));
35+
figure;
36+
subplot(2,2,1);
37+
scatter(y, x, ms, c, 'filled');
38+
colormap(parula);
39+
axis tight
40+
xlabel(plot_genes{2});
41+
ylabel(plot_genes{1});
42+
%h = colorbar;
43+
%ylabel(h,plot_genes{4});
44+
title 'Before MAGIC'
45+
46+
subplot(2,2,2);
47+
scatter3(x, y, z, ms, c, 'filled');
48+
colormap(parula);
49+
axis tight
50+
xlabel(plot_genes{1});
51+
ylabel(plot_genes{2});
52+
zlabel(plot_genes{3});
53+
h = colorbar;
54+
ylabel(h,plot_genes{4});
55+
view(v);
56+
title 'Before MAGIC'
57+
58+
% plot after MAGIC
59+
x = data_imputed(:, ismember(lower(gene_names), lower(plot_genes{1})));
60+
y = data_imputed(:, ismember(lower(gene_names), lower(plot_genes{2})));
61+
z = data_imputed(:, ismember(lower(gene_names), lower(plot_genes{3})));
62+
c = data_imputed(:, ismember(lower(gene_names), lower(plot_genes{4})));
63+
subplot(2,2,3);
64+
scatter(y, x, ms, c, 'filled');
65+
colormap(parula);
66+
axis tight
67+
xlabel(plot_genes{2});
68+
ylabel(plot_genes{1});
69+
%h = colorbar;
70+
%ylabel(h,plot_genes{4});
71+
title 'After MAGIC'
72+
73+
subplot(2,2,4);
74+
scatter3(x, y, z, ms, c, 'filled');
75+
colormap(parula);
76+
axis tight
77+
xlabel(plot_genes{1});
78+
ylabel(plot_genes{2});
79+
zlabel(plot_genes{3});
80+
h = colorbar;
81+
ylabel(h,plot_genes{4});
82+
view(v);
83+
title 'After MAGIC'
84+
85+
86+
87+

notebooks/Magic_single_cell_RNAseq.ipynb

Lines changed: 151 additions & 88 deletions
Large diffs are not rendered by default.

setup.py

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,13 @@
11
import os
22
import sys
33
import shutil
4-
from subprocess import call
54
from setuptools import setup
65
from warnings import warn
76

87
if sys.version_info.major != 3:
98
raise RuntimeError('Magic requires Python 3')
109

1110

12-
# install phenograph
13-
call(['pip3', 'install', 'git+https://github.com/jacoblevine/phenograph.git'])
14-
15-
1611
setup(name='magic',
1712
version='0.0',
1813
description='MAGIC',
@@ -29,17 +24,11 @@
2924
'sklearn',
3025
'networkx',
3126
'fcsparser',
32-
'statsmodels'],
27+
'statsmodels',
28+
],
3329
scripts=['src/magic/magic_gui.py'],
3430
)
3531

3632

3733
# get location of setup.py
3834
setup_dir = os.path.dirname(os.path.realpath(__file__))
39-
40-
# Copy test data
41-
data_dir = os.path.expanduser('~/.magic/data')
42-
if os.path.isdir(data_dir):
43-
shutil.rmtree(data_dir)
44-
shutil.copytree(setup_dir + '/data/', data_dir)
45-

src/magic/MAGIC.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,25 +7,18 @@
77
from scipy.spatial.distance import squareform
88
from sklearn.neighbors import NearestNeighbors
99

10-
def magic(data, kernel='gaussian', n_pca_components=20, random_pca=True,
11-
t=6, knn=30, knn_autotune=10, epsilon=1, rescale=99, k_knn=100, perplexity=30):
10+
def magic(data, n_pca_components=20, random_pca=True,
11+
t=6, knn=30, knn_autotune=10, epsilon=1, rescale=99):
1212

13-
if kernel not in ['gaussian']:
14-
raise RuntimeError('Invalid kernel type. Must be "gaussian".')
15-
16-
#library size normalization
17-
#create data_norm
18-
19-
#always pass in data_norm
2013
if n_pca_components != None:
14+
print('doing PCA')
2115
pca_projected_data = run_pca(data, n_components=n_pca_components, random=random_pca)
2216
else:
2317
pca_projected_data = data
2418

25-
if kernel == 'gaussian':
26-
#run diffusion maps to get markov matrix
27-
L = compute_markov(pca_projected_data, knn=knn, epsilon=epsilon,
28-
distance_metric='euclidean', knn_autotune=knn_autotune)
19+
#run diffusion maps to get markov matrix
20+
L = compute_markov(pca_projected_data, knn=knn, epsilon=epsilon,
21+
distance_metric='euclidean', knn_autotune=knn_autotune)
2922

3023
#remove tsne kernel for now
3124
# else:
@@ -37,7 +30,6 @@ def magic(data, kernel='gaussian', n_pca_components=20, random_pca=True,
3730
# P = _joint_probabilities(distances, perplexity, 1)
3831
# P = squareform(P)
3932

40-
## QUESTION -- should this happen for gaussian kernel too??
4133
# #markov normalize P
4234
# L = np.divide(P, np.sum(P, axis=1))
4335

@@ -75,6 +67,11 @@ def impute_fast(data, L, t, rescale_percent=0, L_t=None, tprev=None):
7567

7668
#rescale data
7769
if rescale_percent != 0:
70+
if len(np.where(data_new < 0)[0]) > 0:
71+
print('Rescaling should not be performed on log-transformed '
72+
'(or other negative) values. Imputed data returned unscaled.')
73+
return data_new, L_t
74+
7875
M99 = np.percentile(data, rescale_percent, axis=0)
7976
M100 = data.max(axis=0)
8077
indices = np.where(M99 == 0)[0]
@@ -94,6 +91,7 @@ def compute_markov(data, knn=10, epsilon=1, distance_metric='euclidean', knn_aut
9491
N = data.shape[0]
9592

9693
# Nearest neighbors
94+
print('Computing distances')
9795
nbrs = NearestNeighbors(n_neighbors=knn, metric=distance_metric).fit(data)
9896
distances, indices = nbrs.kneighbors(data)
9997

@@ -108,6 +106,7 @@ def compute_markov(data, knn=10, epsilon=1, distance_metric='euclidean', knn_aut
108106
distances[j] = np.divide(distances[j], temp[lMaxTempIdxs])
109107

110108
# Adjacency matrix
109+
print('Computing kernel')
111110
rows = np.zeros(N * knn, dtype=np.int32)
112111
cols = np.zeros(N * knn, dtype=np.int32)
113112
dists = np.zeros(N * knn)

0 commit comments

Comments
 (0)