diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index a2fe8c79..1193ee6b 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.7, 3.8] steps: - uses: actions/checkout@v2 diff --git a/README.md b/README.md index 0f6b5ef4..510eddff 100644 --- a/README.md +++ b/README.md @@ -50,5 +50,6 @@ Commands: paga Trajectory inference by abstract graph analysis. dpt Calculate diffusion pseudotime relative to the root cells. integrate Integrate cells from different experimental batches. + multiplet Execute methods for multiplet removal. plot Visualise data. ``` diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats index 941f22aa..f2c17ec6 100755 --- a/scanpy-scripts-tests.bats +++ b/scanpy-scripts-tests.bats @@ -12,6 +12,11 @@ setup() { read_obj="${output_dir}/read.h5ad" filter_opt="--save-raw -p n_genes 200 2500 -p c:n_counts 0 50000 -p n_cells 3 inf -p pct_counts_mito 0 0.2 -c mito '!True' --show-obj stdout" filter_obj="${output_dir}/filter.h5ad" + scrublet_tsv="${output_dir}/scrublet.tsv" + scrublet_png="${output_dir}/scrublet.png" + scrublet_obj="${output_dir}/scrublet.h5ad" + scrublet_simulate_obj="${output_dir}/scrublet_simulate.h5ad" + scrublet_opt="--input-obj-sim ${scrublet_simulate_obj} --filter --export-table ${scrublet_tsv}" norm_mtx="${output_dir}/norm" norm_opt="--save-layer filtered -t 10000 -l all -n after -X ${norm_mtx} --show-obj stdout" norm_obj="${output_dir}/norm.h5ad" @@ -173,6 +178,46 @@ setup() { [ -f "$hvg_obj" ] } +# Do separate doublet simulation step (normally we'd just let the main scrublet +# process do this). + +@test "Run Scrublet doublet simulation" { + if [ "$resume" = 'true' ] && [ -f "$scrublet_simulate_obj" ]; then + skip "$scrublet_simulate_obj exists and resume is set to 'true'" + fi + + run rm -f $srublet_simulate_obj && eval "$scanpy multiplet scrublet_simulate_doublets $hvg_obj $scrublet_simulate_obj" + + [ "$status" -eq 0 ] + [ -f "$scrublet_simulate_obj" ] +} + +# Detect multiplets with Scrublet + +@test "Run Scrublet for multiplet detection" { + if [ "$resume" = 'true' ] && [ -f "$scrublet_obj" ]; then + skip "$scrublet_obj exists and resume is set to 'true'" + fi + + run rm -f $scrublet_obj && eval "$scanpy multiplet scrublet $scrublet_opt $hvg_obj $scrublet_obj" + + [ "$status" -eq 0 ] + [ -f "$scrublet_obj" ] && [ -f "$scrublet_tsv" ] +} + +# Run the doublet plot from Scrublet + +@test "Run Scrublet score distribution plot" { + if [ "$resume" = 'true' ] && [ -f "$scrublet_png" ]; then + skip "$scrublet_png exists and resume is set to 'true'" + fi + + run rm -f $scrublet_png && eval "$scanpy plot scrublet $scrublet_obj $scrublet_png" + + [ "$status" -eq 0 ] + [ -f "$scrublet_png" ] +} + # Regress out variables @test "Regress out unwanted variable" { diff --git a/scanpy_scripts/cli.py b/scanpy_scripts/cli.py index b5ed0bfc..c521ac95 100755 --- a/scanpy_scripts/cli.py +++ b/scanpy_scripts/cli.py @@ -31,6 +31,9 @@ PLOT_MATRIX_CMD, PLOT_HEATMAP_CMD, HARMONY_INTEGRATE_CMD, + SCRUBLET_MULTIPLET_CMD, + SCRUBLET_MULTIPLET_SIMULATE_CMD, + SCRUBLET_MULTIPLET_PLOT_CMD, BBKNN_CMD, MNN_CORRECT_CMD, COMBAT_CMD, @@ -114,6 +117,13 @@ def integrate(): integrate.add_command(MNN_CORRECT_CMD) integrate.add_command(COMBAT_CMD) +@cli.group(cls=NaturalOrderGroup) +def multiplet(): + """Execute methods for multiplet removal.""" + +multiplet.add_command(SCRUBLET_MULTIPLET_CMD) +multiplet.add_command(SCRUBLET_MULTIPLET_SIMULATE_CMD) + @cli.group(cls=NaturalOrderGroup) def plot(): @@ -126,3 +136,4 @@ def plot(): plot.add_command(PLOT_DOT_CMD) plot.add_command(PLOT_MATRIX_CMD) plot.add_command(PLOT_HEATMAP_CMD) +plot.add_command(SCRUBLET_MULTIPLET_PLOT_CMD) diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index 114a8aee..7a769afe 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -194,7 +194,14 @@ ), ], - + 'neighbor_metric': click.option( + '--metric', '-t', + type=click.Choice(['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']), + default='euclidean', + show_default=True, + help='A known metric’s name.' + ), + 'layer':click.option( '--layer', type=CommaSeparatedText(simplify=True), @@ -232,7 +239,7 @@ 'use_raw': click.option( '--use-raw/--no-raw', 'use_raw', - default=True, + default=None, show_default=True, help='Use expression values in `.raw` if present.', ), @@ -537,6 +544,28 @@ show_default=True, help="Layer to batch correct. By default corrects the contents of .X." ), + + 'scrublet': [ + click.option( + '--sim-doublet-ratio', + type=click.FLOAT, + default=2.0, + show_default=True, + help='Number of doublets to simulate relative to the number of ' + 'observed transcriptomes.', + ), + click.option( + '--synthetic-doublet-umi-subsampling', + type=click.FLOAT, + default=1.0, + show_default=True, + help='Where input_obj_sim not suplied, rate for sampling UMIs when ' + 'creating synthetic doublets. If 1.0, each doublet is created by ' + 'simply adding the UMI counts from two randomly sampled observed ' + 'transcriptomes. For values less than 1, the UMI counts are added ' + 'and then randomly sampled at the specified rate.' + ), + ], } COMMON_OPTIONS['opt_output'] = [ @@ -899,13 +928,7 @@ 'connectivities. Use rapids for the RAPIDS implementation of UMAP ' '(experimental, GPU only).' ), - click.option( - '--metric', '-t', - type=click.Choice(['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']), - default='euclidean', - show_default=True, - help='A known metric’s name.' - ), + COMMON_OPTIONS['neighbor_metric'], ], 'umap': [ @@ -1585,13 +1608,7 @@ 'potentially increasing the degree of batch correction. Use this flag to disable ' 'that behaviour.', ), - click.option( - '--metric', '-t', - type=click.Choice(['angular', 'cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']), - default='angular', - show_default=True, - help='A known metric’s name.' - ), + COMMON_OPTIONS['neighbor_metric'], click.option( '--neighbors-within-batch', type=click.INT, @@ -1613,7 +1630,7 @@ 'each cell. Set to 0 to skip.' ), click.option( - '--n-trees', + '--annoy-n-trees', type=click.INT, default=10, show_default=True, @@ -1650,6 +1667,171 @@ 'connectivity value of 1)' ), ], + + 'scrublet': [ + *COMMON_OPTIONS['input'], + *COMMON_OPTIONS['output'], + click.option( + '--input-obj-sim', 'adata_sim', + type=click.Path(exists=True, dir_okay=False), + default=None, + help='(Advanced use case) Optional annData object generated by ' + 'sc.external.pp.scrublet_simulate_doublets(), with same number of ' + 'vars as adata. This should have been built from input_obj after ' + 'filtering genes and cells and selcting highly-variable genes.' + ), + click.option( + '--threshold', + type=click.FLOAT, + default=None, + show_default=True, + help='Doublet score threshold for calling a transcriptome a ' + 'doublet. If not set, this is set automatically by looking for the ' + 'minimum between the two modes of the doublet_scores_sim_ histogram. ' + 'It is best practice to check the threshold visually using the ' + 'doublet_scores_sim_ histogram and/or based on co-localization of ' + 'predicted doublets in a 2-D embedding.' + ), + *COMMON_OPTIONS['scrublet'], + click.option( + '--expected-doublet-rate', + type=click.FLOAT, + default=0.05, + show_default=True, + help='Where input_obj_sim not suplied, the estimated doublet rate ' + 'for the experiment.' + ), + click.option( + '--stdev-doublet-rate', + type=click.FLOAT, + default=0.02, + show_default=True, + help='Where input_obje_sim not suplied, uncertainty in the expected ' + 'doublet rate.' + ), + click.option( + '--knn-dist-metric', '-t', + type=click.Choice(['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']), + default='euclidean', + show_default=True, + help='A known metric’s name.' + ), + click.option( + '--no-normalize-variance', 'normalize_variance', + is_flag=True, + default=True, + help='Default is to normalize the data such that each gene has a ' + 'variance of 1. sklearn.decomposition.TruncatedSVD will be used for ' + 'dimensionality reduction, if --no-mean-center is set. Use this flag ' + 'to disable that behaviour.' + ), + click.option( + '--log-transform', + is_flag=True, + default=False, + show_default=True, + help='Whether to use :func:~scanpy.pp.log1p to log-transform the ' + 'data prior to PCA.' + ), + click.option( + '--no-mean-center', 'mean_center', + is_flag=True, + default=True, + help='If True, center the data such that each gene has a mean of 0. ' + 'sklearn.decomposition.PCA will be used for dimensionality ' + 'reduction.' + ), + click.option( + '--n-pcs', 'n_prin_comps', + type=click.INT, + default=30, + show_default=True, + help='Number of principal components used to embed the ' + 'transcriptomes prior to k-nearest-neighbor graph construction.' + ), + click.option( + '--no-approx', 'use_approx_neighbors', + is_flag=True, + default=True, + help='Default behaviour is to use the approximate nearest neighbor ' + 'method (annoy) for the KNN classifier. Use this flag to disable ' + 'that behaviour.' + ), + click.option( + '--get-doublet-neighbor-parents', + is_flag=True, + default=False, + show_default=True, + help='If set, return (in .uns) the parent transcriptomes that ' + 'generated the doublet neighbors of each observed transcriptome. ' + 'This information can be used to infer the cell states that ' + 'generated a given doublet state.' + ), + click.option( + '--n-neighbors', '-k', + type=CommaSeparatedText(click.INT, simplify=True), + default=None, + show_default=True, + help='Number of neighbors used to construct the KNN graph of ' + 'observed transcriptomes and simulated doublets. If not set, this is ' + 'automatically set to np.round(0.5 * np.sqrt(n_obs)).' + ), + click.option( + '--filter', 'filter', + is_flag=True, + default=False, + help='By default, the output object is annotated but not filtered ' + 'according to the scrublet status. Setting this flag will cause ' + 'predicted multiplet elements to be removed.' + ), + click.option( + '--no-verbose', 'verbose', + is_flag=True, + default=True, + help='Default behaviour is to print progress updates. Use this flag ' + 'to disable that.' + ), + click.option( + '--export-table', + type=click.Path(dir_okay=False, writable=True), + default=None, + show_default=True, + help='Export a table of double scores and calls to the specified file.', + ), + COMMON_OPTIONS['random_state'], + ], + + 'plot_scrublet': [ + *COMMON_OPTIONS['input'], + *COMMON_OPTIONS['plot'], + click.option( + '--scale-hist-obs', '-b', + type=click.Choice(['linear', 'log', 'symlog', 'logit']), + default='log', + show_default=True, + help='Set y axis scale transformation in matplotlib for the plot of observed transcriptomes.' + ), + click.option( + '--scale-hist-sim', '-s', + type=click.Choice(['linear', 'log', 'symlog', 'logit']), + default='linear', + show_default=True, + help='Set y axis scale transformation in matplotlib for the plot of observed transcriptomes.' + ), + ], + + 'scrublet_simulate_doublets': [ + *COMMON_OPTIONS['input'], + *COMMON_OPTIONS['output'], + *COMMON_OPTIONS['scrublet'], + click.option( + '--layer', '-l', + type=click.STRING, + default=None, + help="Layer of adata where raw values are stored, or ‘X’ if values " + "are in .X." + ), + ], 'embed': [ *COMMON_OPTIONS['input'], @@ -1818,10 +2000,10 @@ ), click.option( '--color', - type=click.STRING, + type=CommaSeparatedText(simplify=True), default=None, show_default=True, - help='Key for annotation of observations/cells or variables/genes.', + help='Key(s) for annotation of observations/cells or variables/genes. Comma-separated if more than one', ), click.option( '--legend-loc', diff --git a/scanpy_scripts/cmd_utils.py b/scanpy_scripts/cmd_utils.py index a6caf52d..ecace3b5 100644 --- a/scanpy_scripts/cmd_utils.py +++ b/scanpy_scripts/cmd_utils.py @@ -5,9 +5,11 @@ import click import pandas as pd import scanpy as sc +import scanpy.external as sce from .cmd_options import CMD_OPTIONS from .lib._paga import plot_paga from .obj_utils import _save_matrix +from .lib._scrublet import plot_scrublet def make_subcmd(cmd_name, func, cmd_desc, arg_desc, opt_set = None): """ diff --git a/scanpy_scripts/cmds.py b/scanpy_scripts/cmds.py index 1085e0ff..c60ecf82 100644 --- a/scanpy_scripts/cmds.py +++ b/scanpy_scripts/cmds.py @@ -30,6 +30,7 @@ from .lib._bbknn import bbknn from .lib._mnn import mnn_correct from .lib._combat import combat +from .lib._scrublet import scrublet, scrublet_simulate_doublets LANG = os.environ.get('LANG', None) @@ -243,3 +244,25 @@ cmd_desc='Correct batch effects by matching mutual nearest neighbors [Haghverdi18] [Kang18].', arg_desc=_IO_DESC, ) + +SCRUBLET_MULTIPLET_CMD = make_subcmd( + 'scrublet', + scrublet, + cmd_desc='Filter out likely multiplets from droplet data using Scrublet [Wolock2019].', + arg_desc=_IO_DESC, +) + +SCRUBLET_MULTIPLET_SIMULATE_CMD = make_subcmd( + 'scrublet_simulate_doublets', + scrublet_simulate_doublets, + cmd_desc='Simulate doublets with random transcriptome pairs for Scrublet [Wolock2019].', + arg_desc=_IO_DESC, +) + +SCRUBLET_MULTIPLET_PLOT_CMD = make_subcmd( + 'scrublet', + make_plot_function('plot_scrublet', 'scrublet_score_distribution'), + cmd_desc='Plot histogram of doublet scores for observed transcriptomes and simulated doublets..', + arg_desc=_IP_DESC, + opt_set='plot_scrublet' +) diff --git a/scanpy_scripts/lib/_diffexp.py b/scanpy_scripts/lib/_diffexp.py index b901423b..20efecfa 100644 --- a/scanpy_scripts/lib/_diffexp.py +++ b/scanpy_scripts/lib/_diffexp.py @@ -8,7 +8,7 @@ def diffexp( adata, - use_raw=True, + use_raw=None, n_genes=None, key_added='rank_genes_groups', layer=None, diff --git a/scanpy_scripts/lib/_leiden.py b/scanpy_scripts/lib/_leiden.py index 8ca18eb4..0b2b7b0c 100644 --- a/scanpy_scripts/lib/_leiden.py +++ b/scanpy_scripts/lib/_leiden.py @@ -3,7 +3,7 @@ """ import scanpy as sc -from ..obj_utils import write_cluster +from ..obj_utils import write_obs def leiden( @@ -59,6 +59,6 @@ def leiden( )) if export_cluster: - write_cluster(adata, keys, export_cluster) + write_obs(adata, keys, export_cluster) return keys diff --git a/scanpy_scripts/lib/_louvain.py b/scanpy_scripts/lib/_louvain.py index 3130da8c..dc380bad 100644 --- a/scanpy_scripts/lib/_louvain.py +++ b/scanpy_scripts/lib/_louvain.py @@ -3,7 +3,7 @@ """ import scanpy as sc -from ..obj_utils import write_cluster +from ..obj_utils import write_obs def louvain( @@ -61,6 +61,6 @@ def louvain( )) if export_cluster: - write_cluster(adata, keys, export_cluster) + write_obs(adata, keys, export_cluster) return keys diff --git a/scanpy_scripts/lib/_paga.py b/scanpy_scripts/lib/_paga.py index 44722dce..0aace0d3 100644 --- a/scanpy_scripts/lib/_paga.py +++ b/scanpy_scripts/lib/_paga.py @@ -93,6 +93,7 @@ def plot_paga( adata, layout=layout, init_pos=init_pos, + color=color, title=title, show=show, save=save, diff --git a/scanpy_scripts/lib/_scrublet.py b/scanpy_scripts/lib/_scrublet.py new file mode 100644 index 00000000..3b918058 --- /dev/null +++ b/scanpy_scripts/lib/_scrublet.py @@ -0,0 +1,61 @@ +""" +scanpy external scrublet +""" + +import scanpy as sc +import scanpy.external as sce +import numpy as np +from ..obj_utils import write_obs + +# Wrapper for scrublet allowing text export and filtering + +def scrublet(adata, adata_sim=None, filter=False, export_table=None, **kwargs): + """ + Wrapper function for sce.pp.scrublet(), to allow filtering of resulting object + """ + + # Do we need to read an object with the doublet simulations? + + if adata_sim: + adata_sim = sc.read(adata_sim) + + sce.pp.scrublet(adata, adata_sim=adata_sim, **kwargs) + + # Do any export before optional filtering + + if export_table: + write_obs(adata, ["doublet_score", "predicted_doublet"], export_table) + + # Filter out predited doublets + + if filter: + adata._inplace_subset_obs(np.invert(adata.obs["predicted_doublet"])) + + return adata + +# Run the doublet simulation. + +def scrublet_simulate_doublets(adata, **kwargs): + adata_sim = sce.pp.scrublet_simulate_doublets(adata, **kwargs) + adata._init_as_actual( + X=adata_sim.X, obs=adata_sim.obs, obsm=adata_sim.obsm, uns=adata.uns + ) + +# Just absorb the extra plotting args before passing to +# scanpy.external.pl.scrublet_score_distribution + + +def plot_scrublet( + adata, scale_hist_obs="log", scale_hist_sim="linear", fig_size=(8, 3), **kwargs +): + """ + Wrapper function for sce.pl.scrublet_score_distribution(), to allow + plotting of score distribution + """ + sce.pl.scrublet_score_distribution( + adata, + scale_hist_obs=scale_hist_obs, + scale_hist_sim=scale_hist_sim, + figsize=fig_size, + **kwargs + ) diff --git a/scanpy_scripts/obj_utils.py b/scanpy_scripts/obj_utils.py index 2dcf2e6f..13fc45a6 100644 --- a/scanpy_scripts/obj_utils.py +++ b/scanpy_scripts/obj_utils.py @@ -5,7 +5,7 @@ import scanpy as sc import pandas as pd -def write_cluster(adata, keys, cluster_fn, sep='\t'): +def write_obs(adata, keys, obs_fn, sep='\t'): """Export cell clustering as a text table """ if not isinstance(keys, (list, tuple)): @@ -14,7 +14,7 @@ def write_cluster(adata, keys, cluster_fn, sep='\t'): if key not in adata.obs.keys(): raise KeyError(f'{key} is not a valid `.uns` key') adata.obs[keys].reset_index(level=0).rename(columns={'index': 'cells'}).to_csv( - cluster_fn, sep=sep, header=True, index=False) + obs_fn, sep=sep, header=True, index=False) def write_embedding(adata, key, embed_fn, n_comp=None, sep='\t', key_added=None): diff --git a/setup.py b/setup.py index a5fee6fb..84ed0b75 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='scanpy-scripts', - version='0.4.1', + version='1.0.0', author='nh3', author_email='nh3@users.noreply.github.com', description='Scripts for using scanpy from the command line', @@ -41,7 +41,7 @@ 'matplotlib', 'pandas', 'h5py<3.0.0', - 'scanpy>=1.6.0', + 'scanpy>=1.8.0', 'louvain', 'leidenalg', 'loompy', @@ -49,7 +49,8 @@ 'Click<8', 'umap-learn<0.4.0', 'harmonypy>=0.0.5', - 'bbknn>=1.3.12,<1.5.0', - 'mnnpy>=0.1.9.5' + 'bbknn>=1.5.0', + 'mnnpy>=0.1.9.5', + 'scrublet' ], )