From ecd4841ccff92527ecbc5961ed0cd82963512421 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 30 Aug 2018 16:34:23 -0400 Subject: [PATCH 01/32] don't return a sparse dataframe --- python/magic/magic.py | 3 ++- python/magic/utils.py | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/magic/magic.py b/python/magic/magic.py index 963a6028..4b617a61 100644 --- a/python/magic/magic.py +++ b/python/magic/magic.py @@ -504,7 +504,8 @@ def transform(self, X=None, genes=None, t_max=20, else: X_magic = graph.inverse_transform(X_magic, columns=genes) # convert back to pandas dataframe, if necessary - X_magic = utils.convert_to_same_format(X_magic, X, columns=genes) + X_magic = utils.convert_to_same_format(X_magic, X, columns=genes, + prevent_sparse=True) return X_magic def fit_transform(self, X, **kwargs): diff --git a/python/magic/utils.py b/python/magic/utils.py index 3140e680..654bb83d 100644 --- a/python/magic/utils.py +++ b/python/magic/utils.py @@ -111,10 +111,13 @@ def matrix_is_equivalent(X, Y): np.sum((X != Y).sum()) == 0) -def convert_to_same_format(data, target_data, columns=None): +def convert_to_same_format(data, target_data, columns=None, prevent_sparse=False): # create new data object if isinstance(target_data, pd.SparseDataFrame): - data = pd.SparseDataFrame(data) + if prevent_sparse: + data = pd.DataFrame(data) + else: + data = pd.SparseDataFrame(data) pandas = True elif isinstance(target_data, pd.DataFrame): data = pd.DataFrame(data) From ca8f6dbef33a0c44dca8b098ffd190a0d960ddfa Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 31 Aug 2018 18:06:00 -0400 Subject: [PATCH 02/32] more informative error message on load failure; address https://github.com/KrishnaswamyLab/phateR/issues/21 --- Rmagic/R/utils.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/Rmagic/R/utils.R b/Rmagic/R/utils.R index c7346900..edfc87c5 100644 --- a/Rmagic/R/utils.R +++ b/Rmagic/R/utils.R @@ -10,8 +10,14 @@ null_equal <- function(x, y) { } load_pymagic <- function(delay_load = FALSE) { - result <- try(pymagic <<- reticulate::import("magic", delay_load = delay_load)) - if (methods::is(result, "try-error")) { + if (is.null(pymagic)) { + result <- try(pymagic <<- reticulate::import("magic", delay_load = delay_load)) + } else { + result <- try(reticulate::import("magic", delay_load = delay_load)) + } + if (methods::is(result, "try-error") && + (length(grep("ModuleNotFoundError: No module named 'magic'", result)) > 0 || + length(grep("ImportError: No module named magic", result)) > 0)) { install.magic() } } From 6e38540f842372a9a4d6387dd385ea9220c86d90 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sun, 2 Sep 2018 11:51:25 -0400 Subject: [PATCH 03/32] preferantially load python with magic installed --- Rmagic/R/utils.R | 1 + 1 file changed, 1 insertion(+) diff --git a/Rmagic/R/utils.R b/Rmagic/R/utils.R index edfc87c5..bf413f78 100644 --- a/Rmagic/R/utils.R +++ b/Rmagic/R/utils.R @@ -63,5 +63,6 @@ install.magic <- function(envname = "r-reticulate", method = "auto", pymagic <- NULL .onLoad <- function(libname, pkgname) { + py_config <- reticulate::py_discover_config(required_module = "magic") load_pymagic(delay_load = TRUE) } From 14bd1c9ace562f9bcc8b1c995580db8c37bf1d1a Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 3 Sep 2018 11:23:50 -0400 Subject: [PATCH 04/32] recommend people remove the r-reticulate environment if things aren't working --- Rmagic/R/utils.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Rmagic/R/utils.R b/Rmagic/R/utils.R index bf413f78..dd27b6f3 100644 --- a/Rmagic/R/utils.R +++ b/Rmagic/R/utils.R @@ -19,6 +19,13 @@ load_pymagic <- function(delay_load = FALSE) { (length(grep("ModuleNotFoundError: No module named 'magic'", result)) > 0 || length(grep("ImportError: No module named magic", result)) > 0)) { install.magic() + } else if (grep("r\-reticulate", reticulate::py_config()$python)) { + message("Consider removing the 'r-reticulate' environment by running:") + if (grep("virtualenvs", reticulate::py_config()$python)) { + message("reticulate::virtualenv_remove('r-reticulate')") + } else { + message("reticulate::conda_remove('r-reticulate')") + } } } From 9cc139c295e570c66b852236d588c7ea2a7156fd Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 3 Sep 2018 11:40:11 -0400 Subject: [PATCH 05/32] fix grep typo --- Rmagic/R/utils.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Rmagic/R/utils.R b/Rmagic/R/utils.R index dd27b6f3..875da29a 100644 --- a/Rmagic/R/utils.R +++ b/Rmagic/R/utils.R @@ -19,7 +19,7 @@ load_pymagic <- function(delay_load = FALSE) { (length(grep("ModuleNotFoundError: No module named 'magic'", result)) > 0 || length(grep("ImportError: No module named magic", result)) > 0)) { install.magic() - } else if (grep("r\-reticulate", reticulate::py_config()$python)) { + } else if (grep("r\\-reticulate", reticulate::py_config()$python)) { message("Consider removing the 'r-reticulate' environment by running:") if (grep("virtualenvs", reticulate::py_config()$python)) { message("reticulate::virtualenv_remove('r-reticulate')") From 62c84732de738c835cb0b9efab1ae9b210965bf6 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 3 Sep 2018 12:04:42 -0400 Subject: [PATCH 06/32] fix grep length zero --- Rmagic/R/utils.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Rmagic/R/utils.R b/Rmagic/R/utils.R index 875da29a..4d6033c2 100644 --- a/Rmagic/R/utils.R +++ b/Rmagic/R/utils.R @@ -19,7 +19,7 @@ load_pymagic <- function(delay_load = FALSE) { (length(grep("ModuleNotFoundError: No module named 'magic'", result)) > 0 || length(grep("ImportError: No module named magic", result)) > 0)) { install.magic() - } else if (grep("r\\-reticulate", reticulate::py_config()$python)) { + } else if (length(grep("r\\-reticulate", reticulate::py_config()$python)) > 0) { message("Consider removing the 'r-reticulate' environment by running:") if (grep("virtualenvs", reticulate::py_config()$python)) { message("reticulate::virtualenv_remove('r-reticulate')") From 6035c0a4f1080aed45ad5e55e01509723736eb7b Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 3 Sep 2018 12:33:50 -0400 Subject: [PATCH 07/32] bump tasklogger version --- python/requirements.txt | 2 +- python/setup.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/requirements.txt b/python/requirements.txt index 637a35a9..27005d46 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -5,4 +5,4 @@ matplotlib scikit-learn>=0.19.1 graphtools>=0.1.8 future -tasklogger>=0.2 +tasklogger>=0.2.1 diff --git a/python/setup.py b/python/setup.py index 87fd29c4..8371b82d 100644 --- a/python/setup.py +++ b/python/setup.py @@ -8,12 +8,13 @@ 'scipy>=1.1.0', 'matplotlib', 'scikit-learn>=0.19.1', - 'tasklogger>=0.2', + 'tasklogger>=0.2.1', 'graphtools>=0.1.9', ] test_requires = [ 'nose2', + 'scprep', ] if sys.version_info[0] == 3: From bc6ea1b66483b2228a20d9cf72e1ce961f4091ec Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 3 Sep 2018 12:35:00 -0400 Subject: [PATCH 08/32] use scprep --- python/magic/io.py | 15 +++++++++++++++ python/magic/preprocessing.py | 5 +++++ python/magic/test.py | 25 ++++++++++++++----------- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/python/magic/io.py b/python/magic/io.py index 44abcebd..b0525ec6 100644 --- a/python/magic/io.py +++ b/python/magic/io.py @@ -203,6 +203,9 @@ def load_csv(filename, cell_axis='row', delimiter=',', ------- data : pd.DataFrame """ + warnings.warn("magic.io is deprecated. Please use scprep.io instead. " + "Read more at http://scprep.readthedocs.io", + FutureWarning) if cell_axis not in ['row', 'column', 'col']: raise ValueError( "cell_axis {} not recognized. Expected 'row' or 'column'".format( @@ -309,6 +312,9 @@ def load_fcs(filename, gene_names=True, cell_names=True, ------- data : pd.DataFrame """ + warnings.warn("magic.io is deprecated. Please use scprep.io instead. " + "Read more at http://scprep.readthedocs.io", + FutureWarning) if cell_names is True: cell_names = None if gene_names is True: @@ -347,6 +353,9 @@ def load_mtx(mtx_file, cell_axis='row', ------- data : pd.DataFrame """ + warnings.warn("magic.io is deprecated. Please use scprep.io instead. " + "Read more at http://scprep.readthedocs.io", + FutureWarning) if cell_axis not in ['row', 'column', 'col']: raise ValueError( "cell_axis {} not recognized. Expected 'row' or 'column'".format( @@ -435,6 +444,9 @@ def load_10X(data_dir, sparse=True, gene_labels='symbol', data: pandas.DataFrame shape = (n_cell, n_genes) imported data matrix """ + warnings.warn("magic.io is deprecated. Please use scprep.io instead. " + "Read more at http://scprep.readthedocs.io", + FutureWarning) if gene_labels not in ['id', 'symbol', 'both']: raise ValueError("gene_labels not in ['id', 'symbol', 'both']") @@ -551,6 +563,9 @@ def load_10X_HDF5(filename, genome=None, sparse=True, gene_labels='symbol', If sparse, data will be a pd.SparseDataFrame. Otherwise, data will be a pd.DataFrame. """ + warnings.warn("magic.io is deprecated. Please use scprep.io instead. " + "Read more at http://scprep.readthedocs.io", + FutureWarning) with tables.open_file(filename, 'r') as f: if genome is None: genomes = [node._v_name for node in f.list_nodes(f.root)] diff --git a/python/magic/preprocessing.py b/python/magic/preprocessing.py index 85700c08..a4b0bc91 100644 --- a/python/magic/preprocessing.py +++ b/python/magic/preprocessing.py @@ -6,6 +6,7 @@ import numpy as np from scipy import sparse import pandas as pd +import warnings def library_size_normalize(data, verbose=False): @@ -25,6 +26,10 @@ def library_size_normalize(data, verbose=False): data_norm : ndarray [n, p] 2 dimensional array with normalized gene expression values """ + warnings.warn("magic.preprocessing is deprecated. " + "Please use scprep.normalize instead. " + "Read more at http://scprep.readthedocs.io", + FutureWarning) if verbose: print("Normalizing library sizes for %s cells" % (data.shape[0])) diff --git a/python/magic/test.py b/python/magic/test.py index 2f9d00e4..55006037 100644 --- a/python/magic/test.py +++ b/python/magic/test.py @@ -3,8 +3,8 @@ from __future__ import print_function, division, absolute_import import magic -import pandas as pd import numpy as np +import scprep try: import anndata except (ImportError, SyntaxError): @@ -13,23 +13,26 @@ def test_scdata(): - scdata = pd.read_csv("../data/test_data.csv") - scdata_norm = magic.preprocessing.library_size_normalize(scdata) + scdata = scprep.io.read_csv("../data/test_data.csv") + scdata_norm = scprep.filter.remove_empty_cells(scdata) + scdata_norm = scprep.filter.remove_empty_genes(scdata) + scdata_norm = scprep.normalize.library_size_normalize(scdata_norm) + scdata_norm = scprep.transform.sqrt(scdata_norm) assert scdata.shape == scdata_norm.shape - fast_magic_operator = magic.MAGIC(t='auto', a=20, k=10) - str_gene_magic = fast_magic_operator.fit_transform( + magic_op = magic.MAGIC(t='auto', a=20, k=10) + str_gene_magic = magic_op.fit_transform( scdata_norm, genes=['VIM', 'ZEB1']) - int_gene_magic = fast_magic_operator.fit_transform( + int_gene_magic = magic_op.fit_transform( scdata_norm, genes=[-2, -1]) assert str_gene_magic.shape[0] == scdata_norm.shape[0] assert np.all(str_gene_magic == int_gene_magic) - pca_magic = fast_magic_operator.fit_transform( + pca_magic = magic_op.fit_transform( scdata_norm, genes="pca_only") assert pca_magic.shape[0] == scdata_norm.shape[0] - assert pca_magic.shape[1] == fast_magic_operator.n_pca - fast_magic = fast_magic_operator.fit_transform(scdata_norm, - genes="all_genes") - assert scdata_norm.shape == fast_magic.shape + assert pca_magic.shape[1] == magic_op.n_pca + magic_all_genes = magic_op.fit_transform(scdata_norm, + genes="all_genes") + assert scdata_norm.shape == magic_all_genes.shape def test_anndata(): From b9ce6cfea5f50fa12e237db68c5bb24de19c66e6 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 3 Sep 2018 12:56:23 -0400 Subject: [PATCH 09/32] filter on scdata --- python/magic/test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/magic/test.py b/python/magic/test.py index 55006037..f2163f2d 100644 --- a/python/magic/test.py +++ b/python/magic/test.py @@ -13,10 +13,10 @@ def test_scdata(): - scdata = scprep.io.read_csv("../data/test_data.csv") - scdata_norm = scprep.filter.remove_empty_cells(scdata) - scdata_norm = scprep.filter.remove_empty_genes(scdata) - scdata_norm = scprep.normalize.library_size_normalize(scdata_norm) + scdata = scprep.io.load_csv("../data/test_data.csv") + scdata = scprep.filter.remove_empty_cells(scdata) + scdata = scprep.filter.remove_empty_genes(scdata) + scdata_norm = scprep.normalize.library_size_normalize(scdata) scdata_norm = scprep.transform.sqrt(scdata_norm) assert scdata.shape == scdata_norm.shape magic_op = magic.MAGIC(t='auto', a=20, k=10) From bba449fdeb4da49cf5f8bc607f8b474d75c8f486 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 6 Sep 2018 16:52:58 -0400 Subject: [PATCH 10/32] request before installation --- Rmagic/R/utils.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Rmagic/R/utils.R b/Rmagic/R/utils.R index 4d6033c2..4e20c661 100644 --- a/Rmagic/R/utils.R +++ b/Rmagic/R/utils.R @@ -18,7 +18,9 @@ load_pymagic <- function(delay_load = FALSE) { if (methods::is(result, "try-error") && (length(grep("ModuleNotFoundError: No module named 'magic'", result)) > 0 || length(grep("ImportError: No module named magic", result)) > 0)) { - install.magic() + if (menu(c("Yes", "No"), title="Install MAGIC Python package with reticulate?") == 1) { + install.magic() + } } else if (length(grep("r\\-reticulate", reticulate::py_config()$python)) > 0) { message("Consider removing the 'r-reticulate' environment by running:") if (grep("virtualenvs", reticulate::py_config()$python)) { @@ -57,6 +59,7 @@ install.magic <- function(envname = "r-reticulate", method = "auto", envname = envname, method = method, conda = conda, pip=pip, ... ) + message("Install complete. Please restart R and try again.") }, error = function(e) { stop(paste0( From 9664a99cc451500520913d81b4a3690c3c642e34 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 6 Sep 2018 16:53:55 -0400 Subject: [PATCH 11/32] allow passing of a precomputed graph --- python/magic/magic.py | 58 +++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/python/magic/magic.py b/python/magic/magic.py index 4b617a61..db6b03d5 100644 --- a/python/magic/magic.py +++ b/python/magic/magic.py @@ -59,11 +59,9 @@ class MAGIC(BaseEstimator): roughly log(n_samples) time. knn_dist : string, optional, default: 'euclidean' - recommended values: 'euclidean', 'cosine', 'precomputed' + recommended values: 'euclidean', 'cosine' Any metric from `scipy.spatial.distance` can be used - distance metric for building kNN graph. If 'precomputed', - `data` should be an n_samples x n_samples distance or - affinity matrix + distance metric for building kNN graph. n_jobs : integer, optional, default: 1 The number of jobs to use for the computation. @@ -178,7 +176,7 @@ def _check_params(self): a=self.a) utils.check_if_not('auto', utils.check_positive, utils.check_int, t=self.t) - utils.check_in(['euclidean', 'precomputed', 'cosine', 'correlation', + utils.check_in(['euclidean', 'cosine', 'correlation', 'cityblock', 'l1', 'l2', 'manhattan', 'braycurtis', 'canberra', 'chebyshev', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', @@ -221,11 +219,9 @@ def set_params(self, **params): roughly log(n_samples) time. knn_dist : string, optional, default: 'euclidean' - recommended values: 'euclidean', 'cosine', 'precomputed' + recommended values: 'euclidean', 'cosine' Any metric from `scipy.spatial.distance` can be used - distance metric for building kNN graph. If 'precomputed', - `data` should be an n_samples x n_samples distance or - affinity matrix + distance metric for building kNN graph. n_jobs : integer, optional, default: 1 The number of jobs to use for the computation. @@ -297,7 +293,7 @@ def set_params(self, **params): self._check_params() return self - def fit(self, X): + def fit(self, X, graph=None): """Computes the diffusion operator Parameters @@ -306,42 +302,34 @@ def fit(self, X): input data with `n_samples` samples and `n_features` dimensions. Accepted data types: `numpy.ndarray`, `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. + graph : `graphtools.Graph`, optional (default: None) + If given, provides a precomputed kernel matrix with which to + perform diffusion. Returns ------- magic_operator : MAGIC The estimator object """ - if self.knn_dist == 'precomputed': - if isinstance(X, sparse.coo_matrix): - X = X.tocsr() - if X[0, 0] == 0: - precomputed = "distance" - else: - precomputed = "affinity" - tasklogger.log_info( - "Using precomputed {} matrix...".format(precomputed)) + if self.n_pca is None or X.shape[1] <= self.n_pca: n_pca = None else: - precomputed = None - if self.n_pca is None or X.shape[1] <= self.n_pca: - n_pca = None - else: - n_pca = self.n_pca + n_pca = self.n_pca + if graph is None: + graph = self.graph - if self.graph is not None: + if graph is not None: if self.X is not None and not \ utils.matrix_is_equivalent(X, self.X): """ If the same data is used, we can reuse existing kernel and diffusion matrices. Otherwise we have to recompute. """ - self.graph = None + graph = None else: try: - self.graph.set_params( + graph.set_params( decay=self.a, knn=self.k + 1, distance=self.knn_dist, - precomputed=precomputed, n_jobs=self.n_jobs, verbose=self.verbose, n_pca=n_pca, thresh=1e-4, random_state=self.random_state) tasklogger.log_info( @@ -350,7 +338,7 @@ def fit(self, X): # something changed that should have invalidated the graph tasklogger.log_debug( "Reset graph due to {}".format(str(e))) - self.graph = None + graph = None self.X = X @@ -358,7 +346,7 @@ def fit(self, X): warnings.warn("Input matrix contains unexpressed genes. " "Please remove them prior to running MAGIC.") - if self.graph is None: + if graph is None: # reset X_magic in case it was previously set self.X_magic = None tasklogger.log_start("graph and diffusion operator") @@ -372,6 +360,8 @@ def fit(self, X): verbose=self.verbose, random_state=self.random_state) tasklogger.log_complete("graph and diffusion operator") + else: + self.graph = graph return self @@ -508,7 +498,7 @@ def transform(self, X=None, genes=None, t_max=20, prevent_sparse=True) return X_magic - def fit_transform(self, X, **kwargs): + def fit_transform(self, X, graph=None, **kwargs): """Computes the diffusion operator and the position of the cells in the embedding space @@ -519,6 +509,10 @@ def fit_transform(self, X, **kwargs): dimensions. Accepted data types: `numpy.ndarray`, `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. + graph : `graphtools.Graph`, optional (default: None) + If given, provides a precomputed kernel matrix with which to + perform diffusion. + kwargs : further arguments for `PHATE.transform()` Keyword arguments as specified in :func:`~phate.PHATE.transform` @@ -528,7 +522,7 @@ def fit_transform(self, X, **kwargs): The gene expression values after diffusion """ tasklogger.log_start('MAGIC') - self.fit(X) + self.fit(X, graph=graph) X_magic = self.transform(**kwargs) tasklogger.log_complete('MAGIC') return X_magic From f6a7124a5b5f8a4f04e28556300c5551810eaf68 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sun, 9 Sep 2018 20:04:26 -0400 Subject: [PATCH 12/32] avoid plt.show --- python/magic/plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/magic/plot.py b/python/magic/plot.py index c0d03724..a3bea232 100644 --- a/python/magic/plot.py +++ b/python/magic/plot.py @@ -158,6 +158,6 @@ def animate(i): plt.close() elif show: plt.tight_layout() - plt.show(block=False) + fig.show() return ani From 2954a53788e524bbba8050a54624621eaf62cbfb Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sun, 9 Sep 2018 20:04:36 -0400 Subject: [PATCH 13/32] mention CRAN help docs --- README.md | 4 ++-- Rmagic/README.Rmd | 2 +- Rmagic/README.md | 5 ++++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ca93bfa9..fc599f73 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ The following code runs MAGIC on test data located in the MAGIC repository. #### Tutorials -We have included two tutorial notebooks on MAGIC usage and results visualization for single cell RNA-seq data. +You can read the MAGIC documentation at https://magic.readthedocs.io/. We have included two tutorial notebooks on MAGIC usage and results visualization for single cell RNA-seq data. EMT data notebook: http://nbviewer.jupyter.org/github/KrishnaswamyLab/MAGIC/blob/master/python/tutorial_notebooks/emt_tutorial.ipynb @@ -134,7 +134,7 @@ After installing the package, MAGIC can be run by loading the library and callin #### Tutorials -For a working example, see the Rmarkdown tutorials at and or in `Rmagic/inst/examples`. +You can read the MAGIC tutorial by running `help(Rmagic::magic)`. For a working example, see the Rmarkdown tutorials at and or in `Rmagic/inst/examples`. ## Help diff --git a/Rmagic/README.Rmd b/Rmagic/README.Rmd index a63421ec..16c5ea41 100644 --- a/Rmagic/README.Rmd +++ b/Rmagic/README.Rmd @@ -181,4 +181,4 @@ ggplot(data_PHATE) + ## Help -If you have any questions or require assistance using MAGIC, please contact us at . +Please let us know of any issues at the [GitHub repo](https://github.com/KrishnaswamyLab/MAGIC/issues). If you have any questions or require assistance using MAGIC, please read the documentation by running `help(Rmagic::magic)` or contact us at . diff --git a/Rmagic/README.md b/Rmagic/README.md index 1b301256..7ed5ee7d 100644 --- a/Rmagic/README.md +++ b/Rmagic/README.md @@ -236,4 +236,7 @@ ggplot(data_PHATE) + ## Help -If you have any questions or require assistance using MAGIC, please contact us at . +Please let us know of any issues at the [GitHub +repo](https://github.com/KrishnaswamyLab/MAGIC/issues). If you have any +questions or require assistance using MAGIC, please read the documentation +by running `help(Rmagic::magic)` or contact us at . From 47ca2adc63b24efc6fb3f96d80c4fcbb11da83d3 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 10 Sep 2018 14:04:36 -0400 Subject: [PATCH 14/32] document --- Rmagic/DESCRIPTION | 2 +- Rmagic/man/install.magic.Rd | 4 ++-- Rmagic/man/magic.Rd | 7 ++++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/Rmagic/DESCRIPTION b/Rmagic/DESCRIPTION index 7b13561f..322b70b3 100644 --- a/Rmagic/DESCRIPTION +++ b/Rmagic/DESCRIPTION @@ -20,4 +20,4 @@ Suggests: phateR License: GPL-2 | file LICENSE LazyData: true -RoxygenNote: 6.0.1 +RoxygenNote: 6.1.0 diff --git a/Rmagic/man/install.magic.Rd b/Rmagic/man/install.magic.Rd index 6f216fe0..e25bd0da 100644 --- a/Rmagic/man/install.magic.Rd +++ b/Rmagic/man/install.magic.Rd @@ -4,8 +4,8 @@ \alias{install.magic} \title{Install MAGIC Python Package} \usage{ -install.magic(envname = "r-reticulate", method = "auto", conda = "auto", - pip = TRUE, ...) +install.magic(envname = "r-reticulate", method = "auto", + conda = "auto", pip = TRUE, ...) } \arguments{ \item{envname}{Name of environment to install packages into} diff --git a/Rmagic/man/magic.Rd b/Rmagic/man/magic.Rd index 484e9e0d..95dd856c 100644 --- a/Rmagic/man/magic.Rd +++ b/Rmagic/man/magic.Rd @@ -4,9 +4,10 @@ \alias{magic} \title{Perform MAGIC on a data matrix} \usage{ -magic(data, genes = NULL, k = 10, alpha = 15, t = "auto", npca = 100, - init = NULL, t.max = 20, knn.dist.method = "euclidean", verbose = 1, - n.jobs = 1, seed = NULL) +magic(data, genes = NULL, k = 10, alpha = 15, t = "auto", + npca = 100, init = NULL, t.max = 20, + knn.dist.method = "euclidean", verbose = 1, n.jobs = 1, + seed = NULL) } \arguments{ \item{data}{input data matrix} From c171495a8f5b8c9399ce9374bc2f9b237b8f3538 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 10 Sep 2018 19:26:58 -0400 Subject: [PATCH 15/32] manually import utils::menu --- Rmagic/DESCRIPTION | 2 +- Rmagic/R/utils.R | 2 +- Rmagic/README.Rmd | 2 +- Rmagic/README.md | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Rmagic/DESCRIPTION b/Rmagic/DESCRIPTION index 322b70b3..36d12081 100644 --- a/Rmagic/DESCRIPTION +++ b/Rmagic/DESCRIPTION @@ -1,7 +1,7 @@ Package: Rmagic Type: Package Title: MAGIC - Markov Affinity-Based Graph Imputation of Cells -Version: 1.0.0 +Version: 1.1.0 Authors@R: c(person(given = "David", family = "van Dijk", email = "davidvandijk@gmail.com", role = c("aut")), person(given = 'Scott', family = 'Gigante', email = 'scott.gigante@yale.edu', role = 'cre', comment = c(ORCID = '0000-0002-4544-2764'))) Maintainer: Scott Gigante diff --git a/Rmagic/R/utils.R b/Rmagic/R/utils.R index 4e20c661..ef458f19 100644 --- a/Rmagic/R/utils.R +++ b/Rmagic/R/utils.R @@ -18,7 +18,7 @@ load_pymagic <- function(delay_load = FALSE) { if (methods::is(result, "try-error") && (length(grep("ModuleNotFoundError: No module named 'magic'", result)) > 0 || length(grep("ImportError: No module named magic", result)) > 0)) { - if (menu(c("Yes", "No"), title="Install MAGIC Python package with reticulate?") == 1) { + if (utils::menu(c("Yes", "No"), title="Install MAGIC Python package with reticulate?") == 1) { install.magic() } } else if (length(grep("r\\-reticulate", reticulate::py_config()$python)) > 0) { diff --git a/Rmagic/README.Rmd b/Rmagic/README.Rmd index 16c5ea41..1d9a2d30 100644 --- a/Rmagic/README.Rmd +++ b/Rmagic/README.Rmd @@ -1,5 +1,5 @@ --- -title : Rmagic v1.0.0 +title : Rmagic v1.1.0 output: github_document toc: true --- diff --git a/Rmagic/README.md b/Rmagic/README.md index 7ed5ee7d..d6bdf6d2 100644 --- a/Rmagic/README.md +++ b/Rmagic/README.md @@ -1,4 +1,4 @@ -Rmagic v1.0.0 +Rmagic v1.1.0 ================ From 88581c1271119d7109be75842340e1cf6a4215b6 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 17 Sep 2018 15:24:22 -0400 Subject: [PATCH 16/32] add dremi function to magic op - resolves #20 --- python/magic/magic.py | 100 ++++++++++++++++++++++++++++++++---------- python/magic/test.py | 5 +++ 2 files changed, 81 insertions(+), 24 deletions(-) diff --git a/python/magic/magic.py b/python/magic/magic.py index db6b03d5..d79cc0c5 100644 --- a/python/magic/magic.py +++ b/python/magic/magic.py @@ -19,6 +19,7 @@ import pandas as pd import numbers import tasklogger +import scprep from . import utils @@ -116,6 +117,7 @@ class MAGIC(BaseEstimator): >>> plt.show() >>> magic.plot.animate_magic(X, gene_x='VIM', gene_y='CDH1', ... gene_color='ZEB1', operator=magic_operator) + >>> dremi = magic_operator.knnDREMI('VIM', 'CDH1', plot=True) References ---------- @@ -315,30 +317,32 @@ def fit(self, X, graph=None): n_pca = None else: n_pca = self.n_pca + if graph is None: graph = self.graph - - if graph is not None: if self.X is not None and not \ utils.matrix_is_equivalent(X, self.X): """ If the same data is used, we can reuse existing kernel and diffusion matrices. Otherwise we have to recompute. """ + tasklogger.log_debug( + "Reset graph due to difference in input data") + graph = None + + if graph is not None: + try: + graph.set_params( + decay=self.a, knn=self.k + 1, distance=self.knn_dist, + n_jobs=self.n_jobs, verbose=self.verbose, n_pca=n_pca, + thresh=1e-4, random_state=self.random_state) + tasklogger.log_info( + "Using precomputed graph and diffusion operator...") + except ValueError as e: + # something changed that should have invalidated the graph + tasklogger.log_debug( + "Reset graph due to {}".format(str(e))) graph = None - else: - try: - graph.set_params( - decay=self.a, knn=self.k + 1, distance=self.knn_dist, - n_jobs=self.n_jobs, verbose=self.verbose, n_pca=n_pca, - thresh=1e-4, random_state=self.random_state) - tasklogger.log_info( - "Using precomputed graph and diffusion operator...") - except ValueError as e: - # something changed that should have invalidated the graph - tasklogger.log_debug( - "Reset graph due to {}".format(str(e))) - graph = None self.X = X @@ -482,8 +486,8 @@ def transform(self, X=None, genes=None, t_max=20, if store_result and self.X_magic is not None: X_magic = self.X_magic else: - X_magic = self.impute(graph, t_max=t_max, - plot=plot_optimal_t, ax=ax) + X_magic = self._impute(graph, t_max=t_max, + plot=plot_optimal_t, ax=ax) if store_result: self.X_magic = X_magic @@ -527,8 +531,8 @@ def fit_transform(self, X, graph=None, **kwargs): tasklogger.log_complete('MAGIC') return X_magic - def calculate_error(self, data, data_prev=None, weights=None, - subsample_genes=None): + def _calculate_error(self, data, data_prev=None, weights=None, + subsample_genes=None): """Calculates difference before and after diffusion Parameters @@ -562,8 +566,8 @@ def calculate_error(self, data, data_prev=None, weights=None, error = None return error, data - def impute(self, data, t_max=20, plot=False, ax=None, - max_genes_compute_t=500, threshold=0.001): + def _impute(self, data, t_max=20, plot=False, ax=None, + max_genes_compute_t=500, threshold=0.001): """Peform MAGIC imputation Parameters @@ -603,7 +607,7 @@ def impute(self, data, t_max=20, plot=False, ax=None, else: weights = None if self.t == 'auto': - _, data_prev = self.calculate_error( + _, data_prev = self._calculate_error( data_imputed, data_prev=None, weights=weights, subsample_genes=subsample_genes) @@ -634,7 +638,7 @@ def impute(self, data, t_max=20, plot=False, ax=None, i += 1 data_imputed = self.diff_op.dot(data_imputed) if self.t == 'auto': - error, data_prev = self.calculate_error( + error, data_prev = self._calculate_error( data_imputed, data_prev, weights=weights, subsample_genes=subsample_genes) @@ -659,7 +663,7 @@ def impute(self, data, t_max=20, plot=False, ax=None, while i < t_max: i += 1 data_overimputed = self.diff_op.dot(data_overimputed) - error, data_prev = self.calculate_error( + error, data_prev = self._calculate_error( data_overimputed, data_prev, weights=weights, subsample_genes=subsample_genes) @@ -687,3 +691,51 @@ def impute(self, data, t_max=20, plot=False, ax=None, plt.show(block=False) return data_imputed + + def knnDREMI(self, gene_x, gene_y, + k=10, n_bins=20, n_mesh=3, n_jobs=1, + plot=False, **kwargs): + """Calculate kNN-DREMI on MAGIC output + + Calculates k-Nearest Neighbor conditional Density Resampled Estimate of + Mutual Information as defined in Van Dijk et al, 2018. [1]_ + + Note that kNN-DREMI, like Mutual Information and DREMI, is not + symmetric. Here we are estimating I(Y|X). + + Parameters + ---------- + gene_x : array-like, shape=[n_samples] + Gene shown on the x axis (independent feature) + gene_y : array-like, shape=[n_samples] + Gene shown on the y axis (dependent feature) + k : int, range=[0:n_samples), optional (default: 10) + Number of neighbors + n_bins : int, range=[0:inf), optional (default: 20) + Number of bins for density resampling + n_mesh : int, range=[0:inf), optional (default: 3) + In each bin, density will be calculcated around (mesh ** 2) points + n_jobs : int, optional (default: 1) + Number of threads used for kNN calculation + plot : bool, optional (default: False) + If True, DREMI create plots of the data like those seen in + Fig 5C/D of van Dijk et al. 2018. (doi:10.1016/j.cell.2018.05.061). + **kwargs : additional arguments for `scprep.stats.plot_knnDREMI` + + Returns + ------- + dremi : float + kNN condtional Density resampled estimate of mutual information + + References + ---------- + .. [1] van Dijk D *et al.* (2018), + *Recovering Gene Interactions from Single-Cell Data Using Data + Diffusion*, `Cell `_. + """ + data = self.transform(genes=[gene_x, gene_y]) + dremi = scprep.stats.knnDREMI( + data[gene_x], data[gene_y], + k=k, n_bins=n_bins, n_mesh=n_mesh, n_jobs=n_jobs, + plot=plot, **kwargs) + return dremi diff --git a/python/magic/test.py b/python/magic/test.py index f2163f2d..c6344ef0 100644 --- a/python/magic/test.py +++ b/python/magic/test.py @@ -2,6 +2,8 @@ from __future__ import print_function, division, absolute_import +import matplotlib as mpl +mpl.use("agg") import magic import numpy as np import scprep @@ -19,6 +21,7 @@ def test_scdata(): scdata_norm = scprep.normalize.library_size_normalize(scdata) scdata_norm = scprep.transform.sqrt(scdata_norm) assert scdata.shape == scdata_norm.shape + np.random.seed(42) magic_op = magic.MAGIC(t='auto', a=20, k=10) str_gene_magic = magic_op.fit_transform( scdata_norm, genes=['VIM', 'ZEB1']) @@ -33,6 +36,8 @@ def test_scdata(): magic_all_genes = magic_op.fit_transform(scdata_norm, genes="all_genes") assert scdata_norm.shape == magic_all_genes.shape + dremi = magic_op.knnDREMI("VIM", "ZEB1", plot=True) + np.testing.assert_allclose(dremi, 1.5687165) def test_anndata(): From acb3cd3c46e9d9d201e0fbfad6a54691cbfbe49a Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 17 Sep 2018 17:29:50 -0400 Subject: [PATCH 17/32] fix example typo --- Rmagic/R/magic.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Rmagic/R/magic.R b/Rmagic/R/magic.R index f3a57e83..013f6390 100644 --- a/Rmagic/R/magic.R +++ b/Rmagic/R/magic.R @@ -39,7 +39,7 @@ #' @param seed int or `NULL`, random state (default: `NULL`) #' #' @examples -#' if (reticulate::py_module_available("phate")) { +#' if (reticulate::py_module_available("magic")) { #' #' data(magic_testdata) #' From bac8a70919a6a72f9823494b4f606607ac62605a Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 17 Sep 2018 18:04:48 -0400 Subject: [PATCH 18/32] require scprep --- python/requirements.txt | 1 + python/setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/python/requirements.txt b/python/requirements.txt index 27005d46..0621bdda 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -6,3 +6,4 @@ scikit-learn>=0.19.1 graphtools>=0.1.8 future tasklogger>=0.2.1 +scprep>=0.7.1 diff --git a/python/setup.py b/python/setup.py index 8371b82d..2e4c99d0 100644 --- a/python/setup.py +++ b/python/setup.py @@ -10,6 +10,7 @@ 'scikit-learn>=0.19.1', 'tasklogger>=0.2.1', 'graphtools>=0.1.9', + 'scprep>=0.7.1' ] test_requires = [ From 535dab8995fe88004a63de06e70342ae485d4453 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 17 Sep 2018 18:24:32 -0400 Subject: [PATCH 19/32] put tests outside module --- python/{magic => test}/test.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename python/{magic => test}/test.py (100%) diff --git a/python/magic/test.py b/python/test/test.py similarity index 100% rename from python/magic/test.py rename to python/test/test.py From 0304ccc7d31ee8595ae89a66d2285db71f93ba02 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 17 Sep 2018 18:30:07 -0400 Subject: [PATCH 20/32] ignore pycache --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 4a1d8971..2c49dd8e 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,9 @@ python/*.egg-info python/magic/__pycache__ python/magic/*.pyc python/tutorial_notebooks/.ipynb_checkpoints +__pycache__ +.eggs + matlab/EMT.csv From aaf4c58d2a5746e94867c847fd7c9095350e42a5 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 17 Sep 2018 19:08:42 -0400 Subject: [PATCH 21/32] reduce test value --- python/test/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/test/test.py b/python/test/test.py index c6344ef0..6a575426 100644 --- a/python/test/test.py +++ b/python/test/test.py @@ -37,7 +37,7 @@ def test_scdata(): genes="all_genes") assert scdata_norm.shape == magic_all_genes.shape dremi = magic_op.knnDREMI("VIM", "ZEB1", plot=True) - np.testing.assert_allclose(dremi, 1.5687165) + np.testing.assert_allclose(dremi, 1.568716) def test_anndata(): From 5e6e9d3f8c9daa22f39484ab576549911bb7d64f Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 17 Sep 2018 19:09:44 -0400 Subject: [PATCH 22/32] remove redundant reference --- python/magic/magic.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/python/magic/magic.py b/python/magic/magic.py index d79cc0c5..83cdbcfe 100644 --- a/python/magic/magic.py +++ b/python/magic/magic.py @@ -726,12 +726,6 @@ def knnDREMI(self, gene_x, gene_y, ------- dremi : float kNN condtional Density resampled estimate of mutual information - - References - ---------- - .. [1] van Dijk D *et al.* (2018), - *Recovering Gene Interactions from Single-Cell Data Using Data - Diffusion*, `Cell `_. """ data = self.transform(genes=[gene_x, gene_y]) dremi = scprep.stats.knnDREMI( From 13189470fde9bc8f17d4c21668bf29878cc1365c Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 17 Sep 2018 20:30:27 -0400 Subject: [PATCH 23/32] slightly different floating point values on py2/3 --- python/test/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/test/test.py b/python/test/test.py index 6a575426..5602bb0d 100644 --- a/python/test/test.py +++ b/python/test/test.py @@ -37,7 +37,7 @@ def test_scdata(): genes="all_genes") assert scdata_norm.shape == magic_all_genes.shape dremi = magic_op.knnDREMI("VIM", "ZEB1", plot=True) - np.testing.assert_allclose(dremi, 1.568716) + np.testing.assert_allclose(dremi, 1.5687165, atol=0.0000005) def test_anndata(): From 846fc835bb988319f88d990802f013c4dea67a59 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 17 Sep 2018 20:59:01 -0400 Subject: [PATCH 24/32] bump version --- python/magic/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/magic/version.py b/python/magic/version.py index a955fdae..67bc602a 100644 --- a/python/magic/version.py +++ b/python/magic/version.py @@ -1 +1 @@ -__version__ = "1.2.1" +__version__ = "1.3.0" From 73aea1a6aef5ffeae10c2a9b5b35d743e6758ac6 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 17 Sep 2018 21:02:19 -0400 Subject: [PATCH 25/32] bump version and document --- Rmagic/DESCRIPTION | 10 ++++++---- Rmagic/README.Rmd | 2 +- Rmagic/README.md | 2 +- Rmagic/man/magic.Rd | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/Rmagic/DESCRIPTION b/Rmagic/DESCRIPTION index 36d12081..5aacc0b5 100644 --- a/Rmagic/DESCRIPTION +++ b/Rmagic/DESCRIPTION @@ -1,9 +1,10 @@ Package: Rmagic Type: Package Title: MAGIC - Markov Affinity-Based Graph Imputation of Cells -Version: 1.1.0 +Version: 1.3.0 Authors@R: c(person(given = "David", family = "van Dijk", email = "davidvandijk@gmail.com", role = c("aut")), - person(given = 'Scott', family = 'Gigante', email = 'scott.gigante@yale.edu', role = 'cre', comment = c(ORCID = '0000-0002-4544-2764'))) + person(given = 'Scott', family = 'Gigante', email = 'scott.gigante@yale.edu', role = 'cre', + comment = c(ORCID = '0000-0002-4544-2764'))) Maintainer: Scott Gigante Description: MAGIC (Markov affinity-based graph imputation of cells) is a method for addressing technical noise in single-cell data, including under-sampling of mRNA molecules, often termed "dropout" which can severely obscure important gene-gene relationships. MAGIC shares information across similar cells, via data diffusion, to denoise the cell count matrix and fill in missing transcripts. Read more: van Dijk et al. (2018) . Depends: @@ -15,9 +16,10 @@ Imports: reticulate (>= 1.4), ggplot2 Suggests: - readr, + readr, viridis, - phateR + phateR License: GPL-2 | file LICENSE LazyData: true RoxygenNote: 6.1.0 +Encoding: UTF-8 diff --git a/Rmagic/README.Rmd b/Rmagic/README.Rmd index 1d9a2d30..c4fe868a 100644 --- a/Rmagic/README.Rmd +++ b/Rmagic/README.Rmd @@ -1,5 +1,5 @@ --- -title : Rmagic v1.1.0 +title : Rmagic v1.3.0 output: github_document toc: true --- diff --git a/Rmagic/README.md b/Rmagic/README.md index d6bdf6d2..4b1ae017 100644 --- a/Rmagic/README.md +++ b/Rmagic/README.md @@ -1,4 +1,4 @@ -Rmagic v1.1.0 +Rmagic v1.3.0 ================ diff --git a/Rmagic/man/magic.Rd b/Rmagic/man/magic.Rd index 95dd856c..092079f6 100644 --- a/Rmagic/man/magic.Rd +++ b/Rmagic/man/magic.Rd @@ -61,7 +61,7 @@ applied to single-cell RNA sequencing data, as described in van Dijk et al, 2018. } \examples{ -if (reticulate::py_module_available("phate")) { +if (reticulate::py_module_available("magic")) { data(magic_testdata) From 58bb5e38c6bf4784b20a9340e0c98ec90d63ac82 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 21 Sep 2018 14:02:02 -0400 Subject: [PATCH 26/32] fix load_pymagic with delay_load, only check for r-reticulate on fail --- Rmagic/R/utils.R | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/Rmagic/R/utils.R b/Rmagic/R/utils.R index ef458f19..8ae098a2 100644 --- a/Rmagic/R/utils.R +++ b/Rmagic/R/utils.R @@ -10,23 +10,24 @@ null_equal <- function(x, y) { } load_pymagic <- function(delay_load = FALSE) { - if (is.null(pymagic)) { + if (is.null(pymagic)) { result <- try(pymagic <<- reticulate::import("magic", delay_load = delay_load)) } else { result <- try(reticulate::import("magic", delay_load = delay_load)) } - if (methods::is(result, "try-error") && - (length(grep("ModuleNotFoundError: No module named 'magic'", result)) > 0 || - length(grep("ImportError: No module named magic", result)) > 0)) { + if (methods::is(result, "try-error")) { + if ((!delay_load) && length(grep("ModuleNotFoundError: No module named 'magic'", result)) > 0 || + length(grep("ImportError: No module named magic", result)) > 0) { if (utils::menu(c("Yes", "No"), title="Install MAGIC Python package with reticulate?") == 1) { install.magic() } - } else if (length(grep("r\\-reticulate", reticulate::py_config()$python)) > 0) { - message("Consider removing the 'r-reticulate' environment by running:") - if (grep("virtualenvs", reticulate::py_config()$python)) { - message("reticulate::virtualenv_remove('r-reticulate')") - } else { - message("reticulate::conda_remove('r-reticulate')") + } else if (length(grep("r\\-reticulate", reticulate::py_config()$python)) > 0) { + message("Consider removing the 'r-reticulate' environment by running:") + if (grep("virtualenvs", reticulate::py_config()$python)) { + message("reticulate::virtualenv_remove('r-reticulate')") + } else { + message("reticulate::conda_remove('r-reticulate')") + } } } } From bc1606e93b3cc349c2cc356d07c944f2bd6d37ec Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 21 Sep 2018 14:07:42 -0400 Subject: [PATCH 27/32] clean up precomputed graph logic: --- python/magic/magic.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/python/magic/magic.py b/python/magic/magic.py index 83cdbcfe..57e9d2f2 100644 --- a/python/magic/magic.py +++ b/python/magic/magic.py @@ -329,20 +329,22 @@ def fit(self, X, graph=None): tasklogger.log_debug( "Reset graph due to difference in input data") graph = None - - if graph is not None: - try: - graph.set_params( - decay=self.a, knn=self.k + 1, distance=self.knn_dist, - n_jobs=self.n_jobs, verbose=self.verbose, n_pca=n_pca, - thresh=1e-4, random_state=self.random_state) - tasklogger.log_info( - "Using precomputed graph and diffusion operator...") - except ValueError as e: - # something changed that should have invalidated the graph - tasklogger.log_debug( - "Reset graph due to {}".format(str(e))) - graph = None + elif graph is not None: + try: + graph.set_params( + decay=self.a, knn=self.k + 1, distance=self.knn_dist, + n_jobs=self.n_jobs, verbose=self.verbose, n_pca=n_pca, + thresh=1e-4, random_state=self.random_state) + except ValueError as e: + # something changed that should have invalidated the graph + tasklogger.log_debug( + "Reset graph due to {}".format(str(e))) + graph = None + else: + self.k = graph.knn - 1 + self.alpha = graph.decay + self.n_pca = graph.n_pca + self.knn_dist = graph.distance self.X = X @@ -350,7 +352,11 @@ def fit(self, X, graph=None): warnings.warn("Input matrix contains unexpressed genes. " "Please remove them prior to running MAGIC.") - if graph is None: + if graph is not None: + tasklogger.log_info( + "Using precomputed graph and diffusion operator...") + self.graph = graph + else: # reset X_magic in case it was previously set self.X_magic = None tasklogger.log_start("graph and diffusion operator") @@ -364,8 +370,6 @@ def fit(self, X, graph=None): verbose=self.verbose, random_state=self.random_state) tasklogger.log_complete("graph and diffusion operator") - else: - self.graph = graph return self From d79c449dab7d4b21d7b6b80c70801fab386d8ee6 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 21 Sep 2018 14:14:03 -0400 Subject: [PATCH 28/32] Properly deprecate magic.io --- python/magic/io.py | 386 +++------------------------------------------ 1 file changed, 22 insertions(+), 364 deletions(-) diff --git a/python/magic/io.py b/python/magic/io.py index b0525ec6..6346a2a5 100644 --- a/python/magic/io.py +++ b/python/magic/io.py @@ -11,167 +11,7 @@ import zipfile import tempfile import shutil -try: - import fcsparser -except ImportError: - pass -try: - import tables -except ImportError: - pass - -try: - FileNotFoundError -except NameError: - # py2 compatibility - FileNotFoundError = OSError - - -def with_fcsparser(fun): - def wrapped_fun(*args, **kwargs): - try: - fcsparser - except NameError: - raise ImportError( - "fcsparser not found. " - "Please install it with e.g. `pip install --user fcsparser`") - return fun(*args, **kwargs) - return wrapped_fun - - -def with_tables(fun): - def wrapped_fun(*args, **kwargs): - try: - tables - except NameError: - raise ImportError( - "tables not found. " - "Please install it with e.g. `pip install --user tables`") - return fun(*args, **kwargs) - return wrapped_fun - - -def _parse_header(header, n_expected, header_type="gene_names"): - """ - Parameters - ---------- - header : `str` filename, array-like or `None` - - n_expected : `int` - Expected header length - - header_type : argument name for error printing - """ - if header is None or header is False: - return None - elif isinstance(header, str): - # treat as a file - if header.endswith("tsv"): - delimiter = "\t" - else: - delimiter = "," - columns = pd.read_csv(header, delimiter=delimiter, - header=None).values.reshape(-1) - if not len(columns) == n_expected: - raise ValueError("Expected {} entries in {}. Got {}".format( - n_expected, header, len(columns))) - else: - # treat as list - columns = header - if not len(columns) == n_expected: - raise ValueError("Expected {} entries in {}. Got {}".format( - n_expected, header_type, len(columns))) - return columns - - -def _parse_gene_names(header, data): - return _parse_header(header, data.shape[1], - header_type="gene_names") - - -def _parse_cell_names(header, data): - return _parse_header(header, data.shape[0], - header_type="cell_names") - - -def _matrix_to_data_frame(data, gene_names=None, cell_names=None, sparse=None): - """Return the optimal data type given data, gene names and cell names. - - Parameters - ---------- - - data : array-like - - gene_names : `str`, array-like or `None` (default: None) - Either a filename or an array containing a list of gene symbols or ids. - - cell_names : `str`, array-like or `None` (default: None) - Either a filename or an array containing a list of cell barcodes. - - sparse : `bool` or `None` (default: None) - If not `None`, overrides default sparsity of the data. - """ - if gene_names is None and cell_names is None and \ - not isinstance(data, pd.DataFrame): - # just a matrix - if sparse is not None: - if sparse: - if not sp.issparse(data): - # return scipy.sparse.csr_matrix - data = sp.csr_matrix(data) - elif sp.issparse(data) and not sparse: - # return numpy.ndarray - data = data.toarray() - else: - # return data as is - pass - return data - else: - gene_names = _parse_gene_names(gene_names, data) - cell_names = _parse_cell_names(cell_names, data) - # dataframe with index and/or columns - if sparse is None: - # let the input data decide - sparse = isinstance(data, pd.SparseDataFrame) or sp.issparse(data) - if sparse and gene_names is not None and \ - len(np.unique(gene_names)) < len(gene_names): - warnings.warn( - "Duplicate gene names detected! Forcing dense matrix", - RuntimeWarning) - sparse = False - if sparse: - # return pandas.SparseDataFrame - if isinstance(data, pd.DataFrame): - if gene_names is not None: - data.columns = gene_names - if cell_names is not None: - data.index = cell_names - if not isinstance(data, pd.SparseDataFrame): - data = data.to_sparse(fill_value=0.0) - else: - data = pd.SparseDataFrame(data, default_fill_value=0.0, - index=cell_names, columns=gene_names) - else: - # return pandas.DataFrame - if isinstance(data, pd.DataFrame): - if gene_names is not None: - data.columns = gene_names - if cell_names is not None: - data.index = cell_names - if isinstance(data, pd.SparseDataFrame): - data = data.to_dense() - else: - if sp.issparse(data): - data = data.toarray() - data = pd.DataFrame(data, index=cell_names, columns=gene_names) - return data - - -def _read_csv_sparse(filename, chunksize=1000000, fill_value=0.0, **kwargs): - chunks = pd.read_csv(filename, chunksize=chunksize, **kwargs) - data = pd.concat(chunk.to_sparse(fill_value=fill_value) - for chunk in chunks) - return data +import scprep def load_csv(filename, cell_axis='row', delimiter=',', @@ -206,49 +46,9 @@ def load_csv(filename, cell_axis='row', delimiter=',', warnings.warn("magic.io is deprecated. Please use scprep.io instead. " "Read more at http://scprep.readthedocs.io", FutureWarning) - if cell_axis not in ['row', 'column', 'col']: - raise ValueError( - "cell_axis {} not recognized. Expected 'row' or 'column'".format( - cell_axis)) - - if 'index_col' in kwargs: - # override - index_col = kwargs['index_col'] - cell_names = None - del kwargs['index_col'] - elif cell_names is True: - index_col = 0 - cell_names = None - else: - index_col = None - - if 'header' in kwargs: - # override - header = kwargs['header'] - del kwargs['header'] - gene_names = None - elif gene_names is True: - header = 0 - gene_names = None - else: - header = None - - # Read in csv file - if sparse: - read_fun = _read_csv_sparse - else: - read_fun = pd.read_csv - data = read_fun(filename, delimiter=delimiter, - header=header, index_col=index_col, - **kwargs) - - if cell_axis in ['column', 'col']: - data = data.T - - data = _matrix_to_data_frame( - data, gene_names=gene_names, - cell_names=cell_names, sparse=sparse) - return data + return scprep.io.load_csv(filename=filename, cell_axis=cell_axis, delimiter=delimiter, + gene_names=gene_names, cell_names=cell_names, + sparse=sparse, **kwargs) def load_tsv(filename, cell_axis='row', delimiter='\t', @@ -285,7 +85,6 @@ def load_tsv(filename, cell_axis='row', delimiter='\t', sparse=sparse, **kwargs) -@with_fcsparser def load_fcs(filename, gene_names=True, cell_names=True, sparse=None, metadata_channels=['Time', 'Event_length', 'DNA1', 'DNA2', @@ -315,19 +114,10 @@ def load_fcs(filename, gene_names=True, cell_names=True, warnings.warn("magic.io is deprecated. Please use scprep.io instead. " "Read more at http://scprep.readthedocs.io", FutureWarning) - if cell_names is True: - cell_names = None - if gene_names is True: - gene_names = None - # Parse the fcs file - meta, data = fcsparser.parse(filename) - metadata_channels = data.columns.intersection(metadata_channels) - data_channels = data.columns.difference(metadata_channels) - metadata = data[metadata_channels] - data = data[data_channels] - data = _matrix_to_data_frame(data, gene_names=gene_names, - cell_names=cell_names, sparse=sparse) - return metadata, data + return scprep.io.load_fcs(filename=filename, gene_names=gene_names, + cell_names=cell_names, + sparse=sparse, + metadata_channels=metadata_channels) def load_mtx(mtx_file, cell_axis='row', @@ -356,60 +146,9 @@ def load_mtx(mtx_file, cell_axis='row', warnings.warn("magic.io is deprecated. Please use scprep.io instead. " "Read more at http://scprep.readthedocs.io", FutureWarning) - if cell_axis not in ['row', 'column', 'col']: - raise ValueError( - "cell_axis {} not recognized. Expected 'row' or 'column'".format( - cell_axis)) - # Read in mtx file - data = sio.mmread(mtx_file) - if cell_axis in ['column', 'col']: - data = data.T - data = _matrix_to_data_frame( - data, gene_names=gene_names, - cell_names=cell_names, sparse=sparse) - return data - - -def _combine_gene_id(symbols, ids): - """Creates gene labels of the form SYMBOL (ID) - - Parameters - ---------- - - genes: pandas.DataFrame with columns['symbol', 'id'] - - Returns - ------- - - pandas.Index with combined gene symbols and ids - """ - columns = np.core.defchararray.add( - np.array(symbols, dtype=str), ' (') - columns = np.core.defchararray.add( - columns, np.array(ids, dtype=str)) - columns = np.core.defchararray.add(columns, ')') - return columns - - -def _parse_10x_genes(symbols, ids, gene_labels='symbol', - allow_duplicates=True): - if gene_labels not in ['symbol', 'id', 'both']: - raise ValueError("gene_labels='{}' not recognized. Choose from " - "['symbol', 'id', 'both']") - if gene_labels == 'both': - columns = _combine_gene_id(symbols, ids) - if gene_labels == 'symbol': - columns = symbols - if not allow_duplicates and len(np.unique(columns)) < len(columns): - warnings.warn( - "Duplicate gene names detected! Forcing `gene_labels='id'`. " - "Alternatively, try `gene_labels='both'`, " - "`allow_duplicates=True`, or load the matrix" - " with `sparse=False`", RuntimeWarning) - gene_labels = 'id' - if gene_labels == 'id': - columns = ids - return columns + return scprep.io.load_mtx(mtx_file=mtx_file, cell_axis=cell_axis, + gene_names=gene_names, cell_names=cell_names, + sparse=sparse) def load_10X(data_dir, sparse=True, gene_labels='symbol', @@ -447,38 +186,8 @@ def load_10X(data_dir, sparse=True, gene_labels='symbol', warnings.warn("magic.io is deprecated. Please use scprep.io instead. " "Read more at http://scprep.readthedocs.io", FutureWarning) - - if gene_labels not in ['id', 'symbol', 'both']: - raise ValueError("gene_labels not in ['id', 'symbol', 'both']") - - if not os.path.isdir(data_dir): - raise FileNotFoundError( - "{} is not a directory".format(data_dir)) - - try: - m = sio.mmread(os.path.join(data_dir, "matrix.mtx")) - genes = pd.read_csv(os.path.join(data_dir, "genes.tsv"), - delimiter='\t', header=None) - genes.columns = ['id', 'symbol'] - barcodes = pd.read_csv(os.path.join(data_dir, "barcodes.tsv"), - delimiter='\t', header=None) - - except (FileNotFoundError, OSError): - raise FileNotFoundError( - "'matrix.mtx', 'genes.tsv', and 'barcodes.tsv' must be present " - "in {}".format(data_dir)) - - cell_names = barcodes[0] - if allow_duplicates is None: - allow_duplicates = not sparse - gene_names = _parse_10x_genes(genes['symbol'], genes['id'], - gene_labels=gene_labels, - allow_duplicates=allow_duplicates) - - data = _matrix_to_data_frame(m.T, cell_names=cell_names, - gene_names=gene_names, - sparse=sparse) - return data + return scprep.io.load_10X(data_dir=data_dir, sparse=sparse, gene_labels=gene_labels, + allow_duplicates=allow_duplicates) def load_10X_zip(filename, sparse=True, gene_labels='symbol', @@ -507,33 +216,11 @@ def load_10X_zip(filename, sparse=True, gene_labels='symbol', data: pandas.DataFrame shape = (n_cell, n_genes) imported data matrix """ - tmpdir = tempfile.mkdtemp() - with zipfile.ZipFile(filename) as handle: - files = handle.namelist() - if len(files) != 4: - valid = False - else: - dirname = files[0].strip("/") - subdir_files = [f.split("/")[-1] for f in files] - if "barcodes.tsv" not in subdir_files: - valid = False - elif "genes.tsv" not in subdir_files: - valid = False - elif "matrix.mtx" not in subdir_files: - valid = False - else: - valid = True - if not valid: - raise ValueError( - "Expected a single zipped folder containing 'matrix.mtx', " - "'genes.tsv', and 'barcodes.tsv'. Got {}".format(files)) - handle.extractall(path=tmpdir) - data = load_10X(os.path.join(tmpdir, dirname)) - shutil.rmtree(tmpdir) - return data - - -@with_tables + return scprep.io.load_10X_zip(filename=filename, sparse=sparse, + gene_labels=gene_labels, + allow_duplicates=allow_duplicates) + + def load_10X_HDF5(filename, genome=None, sparse=True, gene_labels='symbol', allow_duplicates=None): """Basic IO for HDF5 10X data produced from the 10X Cellranger pipeline. @@ -566,36 +253,7 @@ def load_10X_HDF5(filename, genome=None, sparse=True, gene_labels='symbol', warnings.warn("magic.io is deprecated. Please use scprep.io instead. " "Read more at http://scprep.readthedocs.io", FutureWarning) - with tables.open_file(filename, 'r') as f: - if genome is None: - genomes = [node._v_name for node in f.list_nodes(f.root)] - print_genomes = ", ".join(genomes) - genome = genomes[0] - if len(genomes) > 1: - print("Available genomes: {}. Selecting {} by default".format( - print_genomes, genome)) - try: - group = f.get_node(f.root, genome) - except tables.NoSuchNodeError: - genomes = [node._v_name for node in f.list_nodes(f.root)] - print_genomes = ", ".join(genomes) - raise ValueError( - "Genome {} not found in {}. " - "Available genomes: {}".format(genome, filename, print_genomes)) - if allow_duplicates is None: - allow_duplicates = not sparse - gene_names = _parse_10x_genes( - symbols=[g.decode() for g in getattr(group, 'gene_names').read()], - ids=[g.decode() for g in getattr(group, 'genes').read()], - gene_labels=gene_labels, allow_duplicates=allow_duplicates) - cell_names = [b.decode() for b in getattr(group, 'barcodes').read()] - data = getattr(group, 'data').read() - indices = getattr(group, 'indices').read() - indptr = getattr(group, 'indptr').read() - shape = getattr(group, 'shape').read() - data = sp.csc_matrix((data, indices, indptr), shape=shape) - data = _matrix_to_data_frame(data.T, - gene_names=gene_names, - cell_names=cell_names, - sparse=sparse) - return data + return scprep.io.load_10X_HDF5(filename=filename, genome=genome, + sparse=sparse, + gene_labels=gene_labels, + allow_duplicates=allow_duplicates) From 69722d07e5bb19809fcf1c7de7769415e32f640e Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 21 Sep 2018 14:15:14 -0400 Subject: [PATCH 29/32] Clean up imports --- python/magic/io.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/python/magic/io.py b/python/magic/io.py index 6346a2a5..23f16e82 100644 --- a/python/magic/io.py +++ b/python/magic/io.py @@ -2,15 +2,7 @@ # (C) 2018 Krishnaswamy Lab GPLv2 from __future__ import print_function, division -import pandas as pd -import scipy.io as sio -import scipy.sparse as sp import warnings -import numpy as np -import os -import zipfile -import tempfile -import shutil import scprep @@ -46,7 +38,8 @@ def load_csv(filename, cell_axis='row', delimiter=',', warnings.warn("magic.io is deprecated. Please use scprep.io instead. " "Read more at http://scprep.readthedocs.io", FutureWarning) - return scprep.io.load_csv(filename=filename, cell_axis=cell_axis, delimiter=delimiter, + return scprep.io.load_csv(filename=filename, cell_axis=cell_axis, + delimiter=delimiter, gene_names=gene_names, cell_names=cell_names, sparse=sparse, **kwargs) @@ -186,7 +179,8 @@ def load_10X(data_dir, sparse=True, gene_labels='symbol', warnings.warn("magic.io is deprecated. Please use scprep.io instead. " "Read more at http://scprep.readthedocs.io", FutureWarning) - return scprep.io.load_10X(data_dir=data_dir, sparse=sparse, gene_labels=gene_labels, + return scprep.io.load_10X(data_dir=data_dir, sparse=sparse, + gene_labels=gene_labels, allow_duplicates=allow_duplicates) From 9a725359b2f7288b7db73ed4408350680076c704 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 21 Sep 2018 14:15:53 -0400 Subject: [PATCH 30/32] Properly deprecate magic.proprocessing --- python/magic/preprocessing.py | 59 ++--------------------------------- 1 file changed, 2 insertions(+), 57 deletions(-) diff --git a/python/magic/preprocessing.py b/python/magic/preprocessing.py index a4b0bc91..32a470e4 100644 --- a/python/magic/preprocessing.py +++ b/python/magic/preprocessing.py @@ -2,11 +2,8 @@ # (C) 2017 Krishnaswamy Lab GPLv2 from __future__ import print_function, division -from sklearn.preprocessing import normalize -import numpy as np -from scipy import sparse -import pandas as pd import warnings +import scprep def library_size_normalize(data, verbose=False): @@ -30,56 +27,4 @@ def library_size_normalize(data, verbose=False): "Please use scprep.normalize instead. " "Read more at http://scprep.readthedocs.io", FutureWarning) - if verbose: - print("Normalizing library sizes for %s cells" % (data.shape[0])) - - # pandas support - columns, index = None, None - if isinstance(data, pd.SparseDataFrame) or \ - pd.api.types.is_sparse(data): - columns, index = data.columns, data.index - data = data.to_coo() - elif isinstance(data, pd.DataFrame): - columns, index = data.columns, data.index - - median_transcript_count = np.median(np.array(data.sum(axis=1))) - if sparse.issparse(data) and data.nnz >= 2**31: - # check we can access elements by index - try: - data[0, 0] - except TypeError: - data = sparse.csr_matrix(data) - # normalize in chunks - sklearn doesn't does with more - # than 2**31 non-zero elements - # - # determine maximum chunk size - split = 2**30 // (data.nnz // data.shape[0]) - size_ok = False - while not size_ok: - for i in range(0, data.shape[0], split): - if data[i:i + split, :].nnz >= 2**31: - split = split // 2 - break - size_ok = True - # normalize - data_norm = [] - for i in range(0, data.shape[0], split): - data_norm.append(normalize(data[i:i + split, :], 'l1', axis=1)) - # combine chunks - data_norm = sparse.vstack(data_norm) - else: - data_norm = normalize(data, norm='l1', axis=1) - - # norm = 'l1' computes the L1 norm which computes the - # axis = 1 independently normalizes each sample - - data_norm = data_norm * median_transcript_count - if columns is not None: - # pandas dataframe - if sparse.issparse(data_norm): - data_norm = pd.SparseDataFrame(data_norm, default_fill_value=0) - else: - data_norm = pd.DataFrame(data_norm) - data_norm.columns = columns - data_norm.index = index - return data_norm + return scprep.normalize.library_size_normalize(data) From 19d03da0034ec6ac9fe94a08de08b18c939c5114 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 21 Sep 2018 14:16:35 -0400 Subject: [PATCH 31/32] remove unnecessary test_requires scprep --- python/setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index 2e4c99d0..5cffd732 100644 --- a/python/setup.py +++ b/python/setup.py @@ -15,7 +15,6 @@ test_requires = [ 'nose2', - 'scprep', ] if sys.version_info[0] == 3: From dbea6889c07ccf0c26090ec59f1790e1830b6163 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 21 Sep 2018 14:37:59 -0400 Subject: [PATCH 32/32] don't document io and preprocessing --- python/doc/source/api.rst | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/python/doc/source/api.rst b/python/doc/source/api.rst index a5e37ecd..66b6e079 100644 --- a/python/doc/source/api.rst +++ b/python/doc/source/api.rst @@ -9,22 +9,6 @@ MAGIC :inherited-members: :show-inheritance: -File Input/Output ------------------ - -.. automodule:: magic.io - :members: - :inherited-members: - :show-inheritance: - -Data Preprocessing ------------------- - -.. automodule:: magic.preprocessing - :members: - :inherited-members: - :show-inheritance: - Plotting --------