Merge branch 'release/v1.0.0' into master

ebi-gene-expression-group · Jul 2, 2021 · 3431fe9 · 3431fe9
2 parents 38554ab + 4bc7499
commit 3431fe9
Show file tree

Hide file tree

Showing 14 changed files with 358 additions and 31 deletions.
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8]
+        python-version: [3.7, 3.8]
 
     steps:
     - uses: actions/checkout@v2

diff --git a/README.md b/README.md
@@ -50,5 +50,6 @@ Commands:
   paga       Trajectory inference by abstract graph analysis.
   dpt        Calculate diffusion pseudotime relative to the root cells.
   integrate  Integrate cells from different experimental batches.
+  multiplet  Execute methods for multiplet removal.
   plot       Visualise data.
   ```
diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats
@@ -12,6 +12,11 @@ setup() {
     read_obj="${output_dir}/read.h5ad"
     filter_opt="--save-raw -p n_genes 200 2500 -p c:n_counts 0 50000 -p n_cells 3 inf -p pct_counts_mito 0 0.2 -c mito '!True' --show-obj stdout"
     filter_obj="${output_dir}/filter.h5ad"
+    scrublet_tsv="${output_dir}/scrublet.tsv"
+    scrublet_png="${output_dir}/scrublet.png"
+    scrublet_obj="${output_dir}/scrublet.h5ad"
+    scrublet_simulate_obj="${output_dir}/scrublet_simulate.h5ad"
+    scrublet_opt="--input-obj-sim ${scrublet_simulate_obj} --filter --export-table ${scrublet_tsv}"
     norm_mtx="${output_dir}/norm"
     norm_opt="--save-layer filtered -t 10000 -l all -n after -X ${norm_mtx} --show-obj stdout"
     norm_obj="${output_dir}/norm.h5ad"
@@ -173,6 +178,46 @@ setup() {
     [ -f  "$hvg_obj" ]
 }
 
+# Do separate doublet simulation step (normally we'd just let the main scrublet
+# process do this).
+
+@test "Run Scrublet doublet simulation" {
+    if [ "$resume" = 'true' ] && [ -f "$scrublet_simulate_obj" ]; then
+        skip "$scrublet_simulate_obj exists and resume is set to 'true'"
+    fi
+
+    run rm -f $srublet_simulate_obj && eval "$scanpy multiplet scrublet_simulate_doublets $hvg_obj $scrublet_simulate_obj"
+
+    [ "$status" -eq 0 ]
+    [ -f  "$scrublet_simulate_obj" ]
+}
+
+# Detect multiplets with Scrublet
+
+@test "Run Scrublet for multiplet detection" {
+    if [ "$resume" = 'true' ] && [ -f "$scrublet_obj" ]; then
+        skip "$scrublet_obj exists and resume is set to 'true'"
+    fi
+
+    run rm -f $scrublet_obj && eval "$scanpy multiplet scrublet $scrublet_opt $hvg_obj $scrublet_obj"
+
+    [ "$status" -eq 0 ]
+    [ -f  "$scrublet_obj" ] && [ -f "$scrublet_tsv" ]
+}
+
+# Run the doublet plot from Scrublet
+
+@test "Run Scrublet score distribution plot" {
+    if [ "$resume" = 'true' ] && [ -f "$scrublet_png" ]; then
+        skip "$scrublet_png exists and resume is set to 'true'"
+    fi
+
+    run rm -f $scrublet_png && eval "$scanpy plot scrublet $scrublet_obj $scrublet_png"
+
+    [ "$status" -eq 0 ]
+    [ -f  "$scrublet_png" ]
+}
+
 # Regress out variables
 
 @test "Regress out unwanted variable" {

diff --git a/scanpy_scripts/cli.py b/scanpy_scripts/cli.py
@@ -31,6 +31,9 @@
     PLOT_MATRIX_CMD,
     PLOT_HEATMAP_CMD,
     HARMONY_INTEGRATE_CMD,
+    SCRUBLET_MULTIPLET_CMD,
+    SCRUBLET_MULTIPLET_SIMULATE_CMD,
+    SCRUBLET_MULTIPLET_PLOT_CMD,
     BBKNN_CMD,
     MNN_CORRECT_CMD,
     COMBAT_CMD,
@@ -114,6 +117,13 @@ def integrate():
 integrate.add_command(MNN_CORRECT_CMD)
 integrate.add_command(COMBAT_CMD)
 
+@cli.group(cls=NaturalOrderGroup)
+def multiplet():
+    """Execute methods for multiplet removal."""
+
+multiplet.add_command(SCRUBLET_MULTIPLET_CMD)
+multiplet.add_command(SCRUBLET_MULTIPLET_SIMULATE_CMD)
+
 
 @cli.group(cls=NaturalOrderGroup)
 def plot():
@@ -126,3 +136,4 @@ def plot():
 plot.add_command(PLOT_DOT_CMD)
 plot.add_command(PLOT_MATRIX_CMD)
 plot.add_command(PLOT_HEATMAP_CMD)
+plot.add_command(SCRUBLET_MULTIPLET_PLOT_CMD)
diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
@@ -194,7 +194,14 @@
         ),
     ],
 
-
+    'neighbor_metric': click.option(
+        '--metric', '-t',
+        type=click.Choice(['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']),
+        default='euclidean',
+        show_default=True,
+        help='A known metric’s name.'
+    ),
+
     'layer':click.option(
         '--layer',
         type=CommaSeparatedText(simplify=True),
@@ -232,7 +239,7 @@
 
     'use_raw': click.option(
         '--use-raw/--no-raw', 'use_raw',
-        default=True,
+        default=None,
         show_default=True,
         help='Use expression values in `.raw` if present.',
     ),
@@ -537,6 +544,28 @@
         show_default=True,
         help="Layer to batch correct. By default corrects the contents of .X."
     ),
+
+    'scrublet': [
+        click.option(
+            '--sim-doublet-ratio',
+            type=click.FLOAT,
+            default=2.0,
+            show_default=True,
+            help='Number of doublets to simulate relative to the number of '
+            'observed transcriptomes.',
+        ),
+        click.option(
+            '--synthetic-doublet-umi-subsampling',
+            type=click.FLOAT,
+            default=1.0,
+            show_default=True,
+            help='Where input_obj_sim not suplied, rate for sampling UMIs when '
+            'creating synthetic doublets. If 1.0, each doublet is created by '
+            'simply adding the UMI counts from two randomly sampled observed '
+            'transcriptomes.  For values less than 1, the UMI counts are added '
+            'and then randomly sampled at the specified rate.'
+        ),
+    ],
 }
 
 COMMON_OPTIONS['opt_output'] = [
@@ -899,13 +928,7 @@
             'connectivities. Use rapids for the RAPIDS implementation of UMAP '
             '(experimental, GPU only).'
         ),
-        click.option(
-            '--metric', '-t',
-            type=click.Choice(['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']),
-            default='euclidean',
-            show_default=True,
-            help='A known metric’s name.'
-        ),
+        COMMON_OPTIONS['neighbor_metric'],
     ],
 
     'umap': [
@@ -1585,13 +1608,7 @@
             'potentially increasing the degree of batch correction. Use this flag to disable '
             'that behaviour.',
         ),
-        click.option(
-            '--metric', '-t',
-            type=click.Choice(['angular', 'cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']),
-            default='angular',
-            show_default=True,
-            help='A known metric’s name.'
-        ),
+        COMMON_OPTIONS['neighbor_metric'],
         click.option(
             '--neighbors-within-batch',
             type=click.INT,
@@ -1613,7 +1630,7 @@
             'each cell. Set to 0 to skip.'
         ),
         click.option(
-            '--n-trees',
+            '--annoy-n-trees',
             type=click.INT,
             default=10,
             show_default=True,
@@ -1650,6 +1667,171 @@
             'connectivity value of 1)'
         ),
     ],
+
+    'scrublet': [
+        *COMMON_OPTIONS['input'],
+        *COMMON_OPTIONS['output'],
+        click.option(
+            '--input-obj-sim', 'adata_sim',
+            type=click.Path(exists=True, dir_okay=False),
+            default=None,
+            help='(Advanced use case) Optional annData object generated by '
+            'sc.external.pp.scrublet_simulate_doublets(), with same number of  '
+            'vars as adata. This should have been built from input_obj after '
+            'filtering genes and cells and selcting highly-variable genes.'
+        ),
+        click.option(
+            '--threshold',
+            type=click.FLOAT,
+            default=None,
+            show_default=True,
+            help='Doublet score threshold for calling a transcriptome a '
+            'doublet. If not set, this is set automatically by looking for the '
+            'minimum between the two modes of the doublet_scores_sim_ histogram. '
+            'It is best practice to check the threshold visually using the '
+            'doublet_scores_sim_ histogram and/or based on co-localization of '
+            'predicted doublets in a 2-D embedding.'
+        ),
+        *COMMON_OPTIONS['scrublet'],
+        click.option(
+            '--expected-doublet-rate',
+            type=click.FLOAT,
+            default=0.05,
+            show_default=True,
+            help='Where input_obj_sim not suplied, the estimated doublet rate '
+            'for the experiment.'
+        ),
+        click.option(
+            '--stdev-doublet-rate',
+            type=click.FLOAT,
+            default=0.02,
+            show_default=True,
+            help='Where input_obje_sim not suplied, uncertainty in the expected '
+            'doublet rate.'
+        ),
+         click.option(
+            '--knn-dist-metric', '-t',
+            type=click.Choice(['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']),
+            default='euclidean',
+            show_default=True,
+            help='A known metric’s name.'
+        ),
+        click.option(
+            '--no-normalize-variance', 'normalize_variance',
+            is_flag=True,
+            default=True,
+            help='Default is to normalize the data such that each gene has a '
+            'variance of 1. sklearn.decomposition.TruncatedSVD will be used for '
+            'dimensionality reduction, if --no-mean-center is set. Use this flag '
+            'to disable that behaviour.'
+        ),
+        click.option(
+            '--log-transform',
+            is_flag=True,
+            default=False,
+            show_default=True,
+            help='Whether to use :func:~scanpy.pp.log1p to log-transform the '
+            'data prior to PCA.'
+        ),
+        click.option(
+            '--no-mean-center', 'mean_center',
+            is_flag=True,
+            default=True,
+            help='If True, center the data such that each gene has a mean of 0. '
+            'sklearn.decomposition.PCA will be used for dimensionality '
+            'reduction.'
+        ),
+        click.option(
+            '--n-pcs', 'n_prin_comps',
+            type=click.INT,
+            default=30,
+            show_default=True,
+            help='Number of principal components used to embed the '
+            'transcriptomes prior to k-nearest-neighbor graph construction.'
+        ),
+        click.option(
+            '--no-approx', 'use_approx_neighbors',
+            is_flag=True,
+            default=True,
+            help='Default behaviour is to use the approximate nearest neighbor '
+            'method (annoy) for the KNN classifier. Use this flag to disable '
+            'that behaviour.'
+        ),
+        click.option(
+            '--get-doublet-neighbor-parents',
+            is_flag=True,
+            default=False,
+            show_default=True,
+            help='If set, return (in .uns) the parent transcriptomes that '
+            'generated the doublet neighbors of each observed transcriptome. '
+            'This information can be used to infer the cell states that '
+            'generated a given doublet state.'
+        ),
+        click.option(
+            '--n-neighbors', '-k',
+            type=CommaSeparatedText(click.INT, simplify=True),
+            default=None,
+            show_default=True,
+            help='Number of neighbors used to construct the KNN graph of '
+            'observed transcriptomes and simulated doublets. If not set, this is '
+            'automatically set to np.round(0.5 * np.sqrt(n_obs)).'
+        ),
+        click.option(
+            '--filter', 'filter',
+            is_flag=True,
+            default=False,
+            help='By default, the output object is annotated but not filtered '
+            'according to the scrublet status. Setting this flag will cause '
+            'predicted multiplet elements to be removed.'
+        ),
+        click.option(
+            '--no-verbose', 'verbose',
+            is_flag=True,
+            default=True,
+            help='Default behaviour is to print progress updates. Use this flag '
+            'to disable that.'
+        ),
+        click.option(
+            '--export-table',
+            type=click.Path(dir_okay=False, writable=True),
+            default=None,
+            show_default=True,
+            help='Export a table of double scores and calls to the specified file.',
+        ),
+        COMMON_OPTIONS['random_state'],
+    ],
+
+    'plot_scrublet': [
+        *COMMON_OPTIONS['input'],
+        *COMMON_OPTIONS['plot'],
+         click.option(
+            '--scale-hist-obs', '-b',
+            type=click.Choice(['linear', 'log', 'symlog', 'logit']),
+            default='log',
+            show_default=True,
+            help='Set y axis scale transformation in matplotlib for the plot of observed transcriptomes.'
+        ),
+         click.option(
+            '--scale-hist-sim', '-s',
+            type=click.Choice(['linear', 'log', 'symlog', 'logit']),
+            default='linear',
+            show_default=True,
+            help='Set y axis scale transformation in matplotlib for the plot of observed transcriptomes.'
+        ),
+     ],
+
+    'scrublet_simulate_doublets': [
+        *COMMON_OPTIONS['input'],
+        *COMMON_OPTIONS['output'],
+        *COMMON_OPTIONS['scrublet'],
+        click.option(
+            '--layer', '-l',
+            type=click.STRING,
+            default=None,
+            help="Layer of adata where raw values are stored, or ‘X’ if values "
+            "are in .X."
+        ),
+    ],
 
     'embed': [
         *COMMON_OPTIONS['input'],
@@ -1818,10 +2000,10 @@
         ),
         click.option(
             '--color',
-            type=click.STRING,
+            type=CommaSeparatedText(simplify=True),
             default=None,
             show_default=True,
-            help='Key for annotation of observations/cells or variables/genes.',
+            help='Key(s) for annotation of observations/cells or variables/genes. Comma-separated if more than one',
         ),
         click.option(
             '--legend-loc',

diff --git a/scanpy_scripts/cmd_utils.py b/scanpy_scripts/cmd_utils.py
@@ -5,9 +5,11 @@
 import click
 import pandas as pd
 import scanpy as sc
+import scanpy.external as sce
 from .cmd_options import CMD_OPTIONS
 from .lib._paga import plot_paga
 from .obj_utils import _save_matrix
+from .lib._scrublet import plot_scrublet
 
 def make_subcmd(cmd_name, func, cmd_desc, arg_desc, opt_set = None):
     """