From 2b926f1a756cce69ad686135bd0bcd5bc2c7cebc Mon Sep 17 00:00:00 2001
From: Pablo Moreno <pcm32@users.noreply.github.com>
Date: Fri, 9 Feb 2024 12:28:54 +0000
Subject: [PATCH 1/8] Move to scanpy 1.9.3 and other improvements (#117)

* Use mito as bool not changing to category

* Trials scanpy 1.9.1 for the mnn - numba issue.

* Fix 1.9.1 hgv not finding base on log1p

* Please black

* Try log fix regardless of option

* Check direct installation

* Set base explicitly to avoid it being dropped

* Please black

* Add igrahp and reinstate leidenalg

* Also louvain is needed

* Pin back h5py

* Passing all tests locally

* Keep todo

* Try github actions with mamba

* Black formatting

* Python versions, better mamba

* Avoid treating versions as numbers

* Actions changes

* Black with no options

* Black fixes

* Check co structure

* Why do we get extra files?

* Black manually

* Make sure env is activated

* pytest fix

* Try with original extra dir for pytest

* Type

* missing extra dir

* Use importlib.metadata instead

* pip install before tests

* impose pin on scipy for mnnpy

* Avoid python 3.10

* Allow single group in one to one marker comp

* Rerun automatic tests

* Try pinning bknn below 1.6.0

* Pin sklearn for bbknn

* Fix package name

* Pin numba for mnnpy

* Downgrade numba even more for mnnpy

* Pin numpy for mmnpy

* Further pin numpy

* Further pin numpy

* More stringent pinning based on 2022 scanpy-scripts latest container

* More pinning for mnnpy

* More mnnpy pinning

* Commented `mnn_batch_correction` test as it fails with scanpy 1.9.1

* adds warning message for `mnn_correct`

* Reformat _mnn.py

* Reformat _mnn.py

---------

Co-authored-by: Iris Diana Yu <irisyu@ebi.ac.uk>
Co-authored-by: Anil Thanki <thanki.anil@gmail.com>
---
 .github/workflows/python-package.yml | 60 +++++++++++++---------------
 .gitignore                           |  3 ++
 README.md                            | 16 ++++++--
 scanpy-scripts-tests.bats            | 23 ++++++-----
 scanpy_scripts/__init__.py           |  4 +-
 scanpy_scripts/cmd_utils.py          |  4 +-
 scanpy_scripts/lib/_diffexp.py       | 37 +++++++++++++++--
 scanpy_scripts/lib/_filter.py        |  2 +-
 scanpy_scripts/lib/_louvain.py       |  1 +
 scanpy_scripts/lib/_mnn.py           |  9 ++++-
 scanpy_scripts/lib/_norm.py          | 10 ++++-
 setup.py                             | 23 ++++++-----
 test-env.yaml                        | 27 +++++++++++++
 13 files changed, 151 insertions(+), 68 deletions(-)
 create mode 100644 test-env.yaml

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index a04f041e..2e9db02f 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -2,54 +2,48 @@ name: Python package
 
 on: [pull_request]
 
+defaults:
+  run:
+    # for conda env activation
+    shell: bash -l {0}
+
 jobs:
   build:
 
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.7, 3.8]
+        python-version: ["3.8", "3.9"]
 
     steps:
-    
-    - uses: actions/checkout@v2
-      with:
-        path: scanpy-scripts
-      
-    - uses: psf/black@stable
-      with:
-        options: '--check --verbose --include="\.pyi?$" .'
-    
     - uses: actions/checkout@v2
-      with:
-        repository: theislab/scanpy 
-        path: scanpy
-        ref: 1.8.1
-    
-    - name: Setup BATS
-      uses: mig4/setup-bats@v1
-      with:
-        bats-version: 1.2.1
     
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+    - name: Setup mamba
+      uses: mamba-org/provision-with-micromamba@main
       with:
-        python-version: ${{ matrix.python-version }}
-    
-    - name: Install dependencies
+        environment-file: test-env.yaml
+        cache-downloads: true
+        channels: conda-forge, bioconda, defaults
+        extra-specs: |
+          python=${{ matrix.python-version }}
+
+    - name: Run black manually
       run: |
-        pushd scanpy
-        patch -p1 < ../scanpy-scripts/scrublet.patch
-        popd
+        black --check --verbose ./
 
-        sudo apt-get install libhdf5-dev
-        pip install -U setuptools>=40.1 wheel 'cmake<3.20' pytest
-        pip install $(pwd)/scanpy-scripts
-        python -m pip install $(pwd)/scanpy --no-deps --ignore-installed -vv
+    # - name: Install dependencies
+    #   run: |
+    #     sudo apt-get install libhdf5-dev
+    #     pip install -U setuptools>=40.1 wheel 'cmake<3.20' pytest
+    #     pip install $(pwd)/scanpy-scripts
+    #     # python -m pip install $(pwd)/scanpy --no-deps --ignore-installed -vv
     
     - name: Run unit tests
-      run: pytest --doctest-modules -v ./scanpy-scripts
+      run: |
+        # needed for __version__ to be available
+        pip install . --no-deps --ignore-installed
+        pytest --doctest-modules -v ./
     
     - name: Test with bats
       run: |
-        ./scanpy-scripts/scanpy-scripts-tests.bats
+        ./scanpy-scripts-tests.bats
diff --git a/.gitignore b/.gitignore
index ccef88a8..8cbff2f4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,6 @@
 *.pyc
 /.*history
 /.*swp
+data
+compressed
+uncompressed
diff --git a/README.md b/README.md
index 510eddff..9b38e7e5 100644
--- a/README.md
+++ b/README.md
@@ -4,12 +4,22 @@ A command-line interface for functions of the Scanpy suite, to facilitate flexib
 
 ## Install
 
+The recommended way of installing scanpy-scripts is via conda:
+
 ```bash
 conda install scanpy-scripts
-# or
-pip3 install scanpy-scripts
 ```
 
+pip installation is also possible, however the version of mnnpy is not patched as in the conda version, and so the `integrate` command will not work.
+
+```bash
+pip install scanpy-scripts
+```
+
+For development installation, we suggest following the github actions python-package.yml file.
+
+Currently, tests run on python 3.9, so those are the recommended versions if not installing via conda. BKNN doesn't currently install on Python 3.10 due to a skip in Bioconda.
+
 ## Test installation
 
 There is an example script included:
@@ -22,7 +32,7 @@ This requires the [bats](https://github.com/sstephenson/bats) testing framework
 
 ## Commands
 
-Available commands are described below. Each has usage instructions available via --help, consult function documentation in scanpy for further details.
+Available commands are described below. Each has usage instructions available via `--help`, consult function documentation in scanpy for further details.
 
 ```
 Usage: scanpy-cli [OPTIONS] COMMAND [ARGS]...
diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats
index d5a69103..e3039149 100755
--- a/scanpy-scripts-tests.bats
+++ b/scanpy-scripts-tests.bats
@@ -653,17 +653,18 @@ setup() {
 }
 
 # Do MNN batch correction, using clustering as batch (just for test purposes)
-
-@test "Run MNN batch integration using clustering as batch" {
-    if [ "$resume" = 'true' ] && [ -f "$mnn_obj" ]; then
-        skip "$mnn_obj exists and resume is set to 'true'"
-    fi
-
-    run rm -f $mnn_obj && eval "$scanpy integrate mnn $mnn_opt $louvain_obj $mnn_obj"
-
-    [ "$status" -eq 0 ]
-    [ -f  "$mnn_obj" ]
-}
+# Commented as it fails with scanpy 1.9.1 
+#
+# @test "Run MNN batch integration using clustering as batch" {
+#    if [ "$resume" = 'true' ] && [ -f "$mnn_obj" ]; then
+#        skip "$mnn_obj exists and resume is set to 'true'"
+#    fi
+#
+#    run rm -f $mnn_obj && eval "$scanpy integrate mnn $mnn_opt $louvain_obj $mnn_obj"
+#
+#    [ "$status" -eq 0 ]
+#    [ -f  "$mnn_obj" ]
+#}
 
 # Do ComBat batch correction, using clustering as batch (just for test purposes)
 
diff --git a/scanpy_scripts/__init__.py b/scanpy_scripts/__init__.py
index d408a2cc..2f222bbc 100644
--- a/scanpy_scripts/__init__.py
+++ b/scanpy_scripts/__init__.py
@@ -1,9 +1,9 @@
 """
 Provides version, author and exports
 """
-import pkg_resources
+import importlib.metadata
 
-__version__ = pkg_resources.get_distribution("scanpy-scripts").version
+__version__ = importlib.metadata.version("scanpy-scripts")
 
 __author__ = ", ".join(
     [
diff --git a/scanpy_scripts/cmd_utils.py b/scanpy_scripts/cmd_utils.py
index a9374f9a..20fae902 100644
--- a/scanpy_scripts/cmd_utils.py
+++ b/scanpy_scripts/cmd_utils.py
@@ -6,10 +6,11 @@
 import pandas as pd
 import scanpy as sc
 import scanpy.external as sce
+
 from .cmd_options import CMD_OPTIONS
 from .lib._paga import plot_paga
-from .obj_utils import _save_matrix
 from .lib._scrublet import plot_scrublet
+from .obj_utils import _save_matrix
 
 
 def make_subcmd(cmd_name, func, cmd_desc, arg_desc, opt_set=None):
@@ -313,6 +314,7 @@ def plot_function(
         showfig = True
         if output_fig:
             import os
+
             import matplotlib.pyplot as plt
 
             sc.settings.figdir = os.path.dirname(output_fig) or "."
diff --git a/scanpy_scripts/lib/_diffexp.py b/scanpy_scripts/lib/_diffexp.py
index c19c9ae5..98ce006d 100644
--- a/scanpy_scripts/lib/_diffexp.py
+++ b/scanpy_scripts/lib/_diffexp.py
@@ -2,9 +2,11 @@
 scanpy diffexp
 """
 
+import logging
+import math
+
 import pandas as pd
 import scanpy as sc
-import logging
 
 
 def diffexp(
@@ -22,6 +24,15 @@ def diffexp(
 ):
     """
     Wrapper function for sc.tl.rank_genes_groups.
+
+    Test that we can load a single group.
+    >>> import os
+    >>> from pathlib import Path
+    >>> adata = sc.datasets.krumsiek11()
+    >>> tbl = diffexp(adata, groupby='cell_type', groups='Mo', reference='progenitor')
+    >>> # get the size of the data frame
+    >>> tbl.shape
+    (11, 8)
     """
     if adata.raw is None:
         use_raw = False
@@ -51,6 +62,11 @@ def diffexp(
                 "Singlet groups removed before passing to rank_genes_groups()"
             )
 
+    # avoid issue when groups is a single group as a string simplified by click
+    # https://github.com/ebi-gene-expression-group/scanpy-scripts/issues/123
+    if groups != "all" and isinstance(groups, str):
+        groups = [groups]
+
     sc.tl.rank_genes_groups(
         adata,
         use_raw=use_raw,
@@ -64,17 +80,32 @@ def diffexp(
     de_tbl = extract_de_table(adata.uns[diff_key])
 
     if isinstance(filter_params, dict):
+        key_filtered = diff_key + "_filtered"
         sc.tl.filter_rank_genes_groups(
             adata,
             key=diff_key,
-            key_added=diff_key + "_filtered",
+            key_added=key_filtered,
             use_raw=use_raw,
             **filter_params,
         )
 
-        de_tbl = extract_de_table(adata.uns[diff_key + "_filtered"])
+        # there are non strings on recarray object at this point, in
+        # adata.uns['rank_genes_groups_filtered']['names']
+        # for instance:
+        # adata.uns['rank_genes_groups_filtered']['names'][0]
+        # (nan, nan, 'NKG7', nan, nan, 'PPBP')
+        # this now upsets h5py > 3.0
+        de_tbl = extract_de_table(adata.uns[key_filtered])
         de_tbl = de_tbl.loc[de_tbl.genes.astype(str) != "nan", :]
 
+        # change nan for strings in adata.uns['rank_genes_groups_filtered']['names']
+        # TODO on scanpy updates, check if this is not done within scanpy so that we can remove this
+        for row in range(0, len(adata.uns[key_filtered]["names"])):
+            for col in range(0, len(adata.uns[key_filtered]["names"][row])):
+                element = adata.uns[key_filtered]["names"][row][col]
+                if isinstance(element, float) and math.isnan(element):
+                    adata.uns[key_filtered]["names"][row][col] = "nan"
+
     if save:
         de_tbl.to_csv(save, sep="\t", header=True, index=False)
 
diff --git a/scanpy_scripts/lib/_filter.py b/scanpy_scripts/lib/_filter.py
index c3dd8bda..d05ca67b 100644
--- a/scanpy_scripts/lib/_filter.py
+++ b/scanpy_scripts/lib/_filter.py
@@ -37,7 +37,7 @@ def filter_anndata(
             k_mito = gene_names.str.startswith("MT-")
             if k_mito.sum() > 0:
                 adata.var["mito"] = k_mito
-                adata.var["mito"] = adata.var["mito"].astype("category")
+                # adata.var["mito"] = adata.var["mito"].astype("category")
             else:
                 logging.warning(
                     "No MT genes found, skip calculating "
diff --git a/scanpy_scripts/lib/_louvain.py b/scanpy_scripts/lib/_louvain.py
index 329c313b..38224c5c 100644
--- a/scanpy_scripts/lib/_louvain.py
+++ b/scanpy_scripts/lib/_louvain.py
@@ -3,6 +3,7 @@
 """
 
 import scanpy as sc
+
 from ..obj_utils import write_obs
 
 
diff --git a/scanpy_scripts/lib/_mnn.py b/scanpy_scripts/lib/_mnn.py
index 3c45721d..84c6416f 100644
--- a/scanpy_scripts/lib/_mnn.py
+++ b/scanpy_scripts/lib/_mnn.py
@@ -2,9 +2,10 @@
 scanpy external mnn
 """
 
-import scanpy.external as sce
-import numpy as np
 import click
+import numpy as np
+import scanpy.external as sce
+import logging
 
 # Wrapper for mnn allowing use of non-standard slot
 
@@ -16,6 +17,10 @@ def mnn_correct(adata, key=None, key_added=None, var_subset=None, layer=None, **
 
     # mnn will use .X, so we need to put other layers there for processing
 
+    logging.warning(
+        "Use mnn_correct at your own risk, environment installation seems faulty for this module."
+    )
+
     if layer:
         adata.layers["X_backup"] = adata.X
         adata.X = adata.layers[layer]
diff --git a/scanpy_scripts/lib/_norm.py b/scanpy_scripts/lib/_norm.py
index 25d2b40f..9bbff914 100644
--- a/scanpy_scripts/lib/_norm.py
+++ b/scanpy_scripts/lib/_norm.py
@@ -3,6 +3,7 @@
 """
 
 import scanpy as sc
+import math
 
 
 def normalize(adata, log_transform=True, **kwargs):
@@ -12,6 +13,13 @@ def normalize(adata, log_transform=True, **kwargs):
     """
     sc.pp.normalize_total(adata, **kwargs)
     if log_transform:
-        sc.pp.log1p(adata)
+        # Natural logarithm is the default by scanpy, if base is not set
+        base = math.e
+        sc.pp.log1p(adata, base=base)
+        # scanpy is not setting base in uns['log1p'] keys, but later on asking for it
+        if "log1p" in adata.uns_keys() and "base" not in adata.uns["log1p"]:
+            # Note that setting base to None doesn't solve the problem at other modules that check for base later on
+            # as adata.uns["log1p"]["base"] = None gets dropped at either anndata write or read.
+            adata.uns["log1p"]["base"] = base
 
     return adata
diff --git a/setup.py b/setup.py
index cbc2531f..5d81c797 100644
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,11 @@
-from setuptools import setup, find_packages
+from setuptools import find_packages, setup
 
 with open("README.md", "r") as fh:
     long_description = fh.read()
 
 setup(
     name="scanpy-scripts",
-    version="1.1.6",
+    version="1.1.9",
     author="nh3",
     author_email="nh3@users.noreply.github.com",
     description="Scripts for using scanpy from the command line",
@@ -35,23 +35,24 @@
         ]
     ),
     install_requires=[
-        "packaging",
-        "anndata",
-        "scipy",
-        "matplotlib",
-        "pandas",
-        "h5py<3.0.0",
-        "scanpy==1.8.1",
+        # "packaging",
+        # "anndata",
+        # "scipy",
+        # "matplotlib",
+        # "pandas",
+        # "h5py<3.0.0",
+        "scanpy==1.9.3",
         "louvain",
+        "igraph",
         "leidenalg",
         "loompy",
         "Click<8",
-        "umap-learn",
+        # "umap-learn",
         "harmonypy>=0.0.5",
         "bbknn>=1.5.0",
         "mnnpy>=0.1.9.5",
         "scrublet",
-        "scikit-misc",
+        # "scikit-misc",
         "fa2",
     ],
 )
diff --git a/test-env.yaml b/test-env.yaml
new file mode 100644
index 00000000..2ad465d1
--- /dev/null
+++ b/test-env.yaml
@@ -0,0 +1,27 @@
+name: scanpy-scripts
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - scanpy=1.9.3
+  - louvain
+  - igraph
+  - leidenalg
+  - loompy
+  - Click <8
+  - harmonypy>=0.0.5
+  - bbknn>=1.5.0,<1.6.0
+  - mnnpy>=0.1.9.5
+  # for mnnpy using n_jobs
+  - scipy <1.8.1
+  - scikit-learn <1.2.0
+  - numba <0.56
+  - numpy <1.22
+  - llvmlite <0.38.1
+  - scrublet
+  - fa2
+  # for testing
+  - bats
+  - black
+  - pytest

From 65c2ca860ae1f598c998f30f8a465a91ad83fc6a Mon Sep 17 00:00:00 2001
From: Pablo Moreno <pcm32@users.noreply.github.com>
Date: Fri, 9 Feb 2024 13:40:21 +0000
Subject: [PATCH 2/8] Remove extra pinning for mmnpy (#126)

---
 test-env.yaml | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/test-env.yaml b/test-env.yaml
index 2ad465d1..83b06361 100644
--- a/test-env.yaml
+++ b/test-env.yaml
@@ -14,11 +14,8 @@ dependencies:
   - bbknn>=1.5.0,<1.6.0
   - mnnpy>=0.1.9.5
   # for mnnpy using n_jobs
-  - scipy <1.8.1
-  - scikit-learn <1.2.0
-  - numba <0.56
-  - numpy <1.22
-  - llvmlite <0.38.1
+  - scipy <1.9.0
+  - scikit-learn <1.3.0
   - scrublet
   - fa2
   # for testing

From 52003ca38d4ded8deee22aef97c51e0fd7ee39a0 Mon Sep 17 00:00:00 2001
From: Pablo Moreno <pcm32@users.noreply.github.com>
Date: Mon, 12 Feb 2024 09:46:38 +0000
Subject: [PATCH 3/8] Change deprecated sc.read() usages for read_h5ad call
 (#128)

---
 scanpy_scripts/cmd_utils.py     | 2 +-
 scanpy_scripts/lib/_scrublet.py | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/scanpy_scripts/cmd_utils.py b/scanpy_scripts/cmd_utils.py
index 20fae902..b434a837 100644
--- a/scanpy_scripts/cmd_utils.py
+++ b/scanpy_scripts/cmd_utils.py
@@ -93,7 +93,7 @@ def _fix_booleans(df):
 
 def _read_obj(input_obj, input_format="anndata", **kwargs):
     if input_format == "anndata":
-        adata = sc.read(input_obj, **kwargs)
+        adata = sc.read_h5ad(input_obj, **kwargs)
     elif input_format == "loom":
         adata = sc.read_loom(input_obj, **kwargs)
     else:
diff --git a/scanpy_scripts/lib/_scrublet.py b/scanpy_scripts/lib/_scrublet.py
index d1462de9..fda891cd 100644
--- a/scanpy_scripts/lib/_scrublet.py
+++ b/scanpy_scripts/lib/_scrublet.py
@@ -2,12 +2,13 @@
 scanpy external scrublet
 """
 
+import anndata
+import numpy as np
+import pandas as pd
 import scanpy as sc
 import scanpy.external as sce
-import numpy as np
+
 from ..obj_utils import write_obs
-import anndata
-import pandas as pd
 
 # Wrapper for scrublet allowing text export and filtering
 
@@ -20,7 +21,7 @@ def scrublet(adata, adata_sim=None, filter=False, export_table=None, **kwargs):
     # Do we need to read an object with the doublet simulations?
 
     if adata_sim:
-        adata_sim = sc.read(adata_sim)
+        adata_sim = sc.read_h5ad(adata_sim)
 
     sce.pp.scrublet(adata, adata_sim=adata_sim, **kwargs)
 

From 60a2e63d4e05ca944d0bd7c8e982cacb1728e12d Mon Sep 17 00:00:00 2001
From: Pablo Moreno <pcm32@users.noreply.github.com>
Date: Mon, 19 Feb 2024 21:30:30 +0000
Subject: [PATCH 4/8] HVG genes setting through files (#130)

* HVG genes setting through files

* Always avoid the unexpected arguments
---
 scanpy-scripts-tests.bats     | 28 ++++++++++++++++++++++
 scanpy_scripts/cmd_options.py | 19 +++++++++++++--
 scanpy_scripts/lib/_hvg.py    | 45 +++++++++++++++++++++++++++++++++++
 3 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats
index e3039149..93af736d 100755
--- a/scanpy-scripts-tests.bats
+++ b/scanpy-scripts-tests.bats
@@ -28,7 +28,11 @@ setup() {
     norm_opt="--save-layer filtered -t 10000 -l all -n after -X ${norm_mtx} --show-obj stdout"
     norm_obj="${output_dir}/norm.h5ad"
     hvg_opt="-m 0.0125 3 -d 0.5 inf -s --show-obj stdout"
+    always_hvg="${data_dir}/always_hvg.txt"
+    never_hvg="${data_dir}/never_hvg.txt"
+    hvg_opt_always_never="--always-hv-genes-file ${always_hvg} --never-hv-genes-file ${never_hvg}"
     hvg_obj="${output_dir}/hvg.h5ad"
+    hvg_obj_on_off="${output_dir}/hvg_on_off.h5ad"
     regress_opt="-k n_counts --show-obj stdout"
     regress_obj="${output_dir}/regress.h5ad"
     scale_opt="--save-layer normalised -m 10 --show-obj stdout"
@@ -131,6 +135,22 @@ setup() {
     [ -f "$raw_matrix_from_raw" ]
 }
 
+@test "Add genes to be considered HVGs" {
+    if [ "$resume" = 'true' ] && [ -f "$always_hvg" ]; then
+        skip "$always_hvg exists"
+    fi
+
+    run eval "echo -e 'MIR1302-10\nFAM138A' > $always_hvg"
+}
+
+@test "Add genes not to be considered HVGs" {
+    if [ "$resume" = 'true' ] && [ -f "$never_hvg" ]; then
+        skip "$never_hvg exists"
+    fi
+
+    run eval "echo -e 'ISG15\nTNFRSF4' > $never_hvg"
+}
+
 @test "Test MTX write from layers" {
     if [ "$resume" = 'true' ] && [ -f "$raw_matrix_from_layer" ]; then
         skip "$raw_matrix exists"
@@ -219,6 +239,14 @@ setup() {
     [ -f  "$hvg_obj" ]
 }
 
+@test "Find variable genes with optional turn on/off lists" {
+    if [ "$resume" = 'true' ] && [ -f "$hvg_obj_on_off" ]; then
+        skip "$hvg_obj_on_off exists and resume is set to 'true'"
+    fi
+
+    run rm -f $hvg_obj_on_off && eval "$scanpy hvg $hvg_opt_always_never $norm_obj $hvg_obj_on_off"
+}
+
 # Do separate doublet simulation step (normally we'd just let the main scrublet
 # process do this).
 
diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
index d3b88f87..d8c1dac4 100644
--- a/scanpy_scripts/cmd_options.py
+++ b/scanpy_scripts/cmd_options.py
@@ -3,13 +3,14 @@
 """
 
 import click
+
 from .click_utils import (
     CommaSeparatedText,
     Dictionary,
-    valid_limit,
-    valid_parameter_limits,
     mutually_exclusive_with,
     required_by,
+    valid_limit,
+    valid_parameter_limits,
 )
 
 COMMON_OPTIONS = {
@@ -856,6 +857,20 @@
             "'seurat_v3', ties are broken by the median (across batches) rank based on "
             "within-batch normalized variance.",
         ),
+        click.option(
+            "--always-hv-genes-file",
+            "always_hv_genes_file",
+            type=click.Path(exists=True),
+            default=None,
+            help="If specified, the gene identifers in this file will be set as highly variable in the var dataframe after HVGs are computed.",
+        ),
+        click.option(
+            "--never-hv-genes-file",
+            "never_hv_genes_file",
+            type=click.Path(exists=True),
+            default=None,
+            help="If specified, the gene identifers in this file will be removed from highly variable in the var dataframe (set to false) after HVGs are computed.",
+        ),
     ],
     "scale": [
         *COMMON_OPTIONS["input"],
diff --git a/scanpy_scripts/lib/_hvg.py b/scanpy_scripts/lib/_hvg.py
index 1c340c68..7dcf766c 100644
--- a/scanpy_scripts/lib/_hvg.py
+++ b/scanpy_scripts/lib/_hvg.py
@@ -21,6 +21,20 @@ def hvg(
     if "n_top_genes" in kwargs and kwargs["n_top_genes"] is not None:
         kwargs["n_top_genes"] = min(adata.n_vars, kwargs["n_top_genes"])
 
+    always_hv_genes = None
+    if "always_hv_genes_file" in kwargs and kwargs["always_hv_genes_file"] is not None:
+        with open(kwargs["always_hv_genes_file"], "r") as f:
+            always_hv_genes = f.read().splitlines()
+
+    never_hv_genes = None
+    if "never_hv_genes_file" in kwargs and kwargs["never_hv_genes_file"] is not None:
+        with open(kwargs["never_hv_genes_file"], "r") as f:
+            never_hv_genes = f.read().splitlines()
+
+    # to avoid upsetting the scanpy function with unexpected keyword arguments
+    del kwargs["always_hv_genes_file"]
+    del kwargs["never_hv_genes_file"]
+
     sc.pp.highly_variable_genes(
         adata,
         min_mean=mean_limits[0],
@@ -30,4 +44,35 @@ def hvg(
         **kwargs,
     )
 
+    return switch_hvgs(adata, always_hv_genes, never_hv_genes)
+
+
+def switch_hvgs(adata, always_hv_genes=None, never_hv_genes=None):
+    """
+    Function to switch on/off highly variable genes based on a list of genes.
+
+    >>> adata = sc.datasets.pbmc3k()
+    >>> sc.pp.normalize_total(adata)
+    >>> sc.pp.log1p(adata)
+    >>> sc.pp.highly_variable_genes(adata)
+    >>> adata = switch_hvgs(adata, always_hv_genes=['MIR1302-10', 'FAM138A'], never_hv_genes=['ISG15', 'TNFRSF4'])
+    >>> adata.var.loc['ISG15'].highly_variable
+    False
+    >>> adata.var.loc['TNFRSF4'].highly_variable
+    False
+    >>> adata.var.loc['MIR1302-10'].highly_variable
+    True
+    >>> adata.var.loc['CPSF3L'].highly_variable
+    True
+    """
+    if always_hv_genes is not None:
+        adata.var.highly_variable = np.logical_or(
+            adata.var.highly_variable, adata.var_names.isin(always_hv_genes)
+        )
+
+    if never_hv_genes is not None:
+        adata.var.highly_variable = np.logical_and(
+            adata.var.highly_variable, ~adata.var_names.isin(never_hv_genes)
+        )
+
     return adata

From e52fe1e9d00bed20f2cc33a2464fcd50978620fb Mon Sep 17 00:00:00 2001
From: Pablo Moreno <pablo.moreno@astrazeneca.com>
Date: Mon, 19 Feb 2024 22:10:27 +0000
Subject: [PATCH 5/8] Changes for release

---
 README.md      |   2 +-
 scrublet.patch | 328 -------------------------------------------------
 setup.py       |   6 +-
 3 files changed, 5 insertions(+), 331 deletions(-)
 delete mode 100644 scrublet.patch

diff --git a/README.md b/README.md
index 9b38e7e5..34b5e512 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ A command-line interface for functions of the Scanpy suite, to facilitate flexib
 
 ## Install
 
-The recommended way of installing scanpy-scripts is via conda:
+The recommended way of using this package is through the latest container produced by Bioconda [here](https://quay.io/repository/biocontainers/scanpy-scripts?tab=tags). If you must, prefer installing scanpy-scripts is via conda:
 
 ```bash
 conda install scanpy-scripts
diff --git a/scrublet.patch b/scrublet.patch
deleted file mode 100644
index 5e7085bf..00000000
--- a/scrublet.patch
+++ /dev/null
@@ -1,328 +0,0 @@
-diff --git a/scanpy/external/pl.py b/scanpy/external/pl.py
-index 629b40cc..6eb61c48 100644
---- a/scanpy/external/pl.py
-+++ b/scanpy/external/pl.py
-@@ -340,6 +340,8 @@ def scrublet_score_distribution(
-     The histogram for simulated doublets is useful for determining the correct doublet
-     score threshold.
- 
-+    Scrublet must have been run previously with the input object.
-+
-     Parameters
-     ----------
-     adata
-@@ -372,40 +374,86 @@ def scrublet_score_distribution(
-         simulation separately for advanced usage.
-     """
- 
--    threshold = adata.uns['scrublet']['threshold']
--    fig, axs = plt.subplots(1, 2, figsize=figsize)
-+    if 'scrublet' not in adata.uns:
-+        raise ValueError(
-+            'Please run scrublet before trying to generate the scrublet plot.'
-+        )
- 
--    ax = axs[0]
--    ax.hist(
--        adata.obs['doublet_score'],
--        np.linspace(0, 1, 50),
--        color='gray',
--        linewidth=0,
--        density=True,
--    )
--    ax.set_yscale(scale_hist_obs)
--    yl = ax.get_ylim()
--    ax.set_ylim(yl)
--    ax.plot(threshold * np.ones(2), yl, c='black', linewidth=1)
--    ax.set_title('Observed transcriptomes')
--    ax.set_xlabel('Doublet score')
--    ax.set_ylabel('Prob. density')
--
--    ax = axs[1]
--    ax.hist(
--        adata.uns['scrublet']['doublet_scores_sim'],
--        np.linspace(0, 1, 50),
--        color='gray',
--        linewidth=0,
--        density=True,
--    )
--    ax.set_yscale(scale_hist_sim)
--    yl = ax.get_ylim()
--    ax.set_ylim(yl)
--    ax.plot(threshold * np.ones(2), yl, c='black', linewidth=1)
--    ax.set_title('Simulated doublets')
--    ax.set_xlabel('Doublet score')
--    ax.set_ylabel('Prob. density')
-+    # If batched_by is populated, then we know Scrublet was run over multiple batches
-+
-+    if 'batched_by' in adata.uns['scrublet']:
-+        batch_key = adata.uns['scrublet']['batched_by']
-+
-+        batches = np.unique(adata.obs[batch_key])
-+        adatas = [
-+            adata[
-+                adata.obs[batch_key] == batch,
-+            ]
-+            for batch in batches
-+        ]
-+        figsize = (figsize[0], figsize[1] * len(batches))
-+
-+    else:
-+        adatas = [adata]
-+
-+    fig, axs = plt.subplots(len(adatas), 2, figsize=figsize)
-+
-+    for idx, ad in enumerate(adatas):
-+
-+        # We'll need multiple rows if Scrublet was run in multiple batches
-+
-+        if 'batched_by' in adata.uns['scrublet']:
-+
-+            batch = batches[idx]
-+
-+            threshold = adata.uns['scrublet']['batches'][batch]['threshold']
-+            doublet_scores_sim = adata.uns['scrublet']['batches'][batch][
-+                'doublet_scores_sim'
-+            ]
-+            axis_lab_suffix = " (%s)" % batch
-+            obs_ax = axs[idx][0]
-+            sim_ax = axs[idx][1]
-+
-+        else:
-+            threshold = adata.uns['scrublet']['threshold']
-+            doublet_scores_sim = adata.uns['scrublet']['doublet_scores_sim']
-+            axis_lab_suffix = ''
-+            obs_ax = axs[0]
-+            sim_ax = axs[1]
-+
-+        # Make the observed transcriptomes plot
-+
-+        obs_ax.hist(
-+            ad.obs['doublet_score'],
-+            np.linspace(0, 1, 50),
-+            color='gray',
-+            linewidth=0,
-+            density=True,
-+        )
-+        obs_ax.set_yscale(scale_hist_obs)
-+        yl = obs_ax.get_ylim()
-+        obs_ax.set_ylim(yl)
-+        obs_ax.plot(threshold * np.ones(2), yl, c='black', linewidth=1)
-+        obs_ax.set_title('Observed transcriptomes%s' % axis_lab_suffix)
-+        obs_ax.set_xlabel('Doublet score')
-+        obs_ax.set_ylabel('Prob. density')
-+
-+        # Make the simulated tranascriptomes plot
-+
-+        sim_ax.hist(
-+            doublet_scores_sim,
-+            np.linspace(0, 1, 50),
-+            color='gray',
-+            linewidth=0,
-+            density=True,
-+        )
-+        sim_ax.set_yscale(scale_hist_sim)
-+        yl = sim_ax.get_ylim()
-+        sim_ax.set_ylim(yl)
-+        sim_ax.plot(threshold * np.ones(2), yl, c='black', linewidth=1)
-+        sim_ax.set_title('Simulated doublets%s' % axis_lab_suffix)
-+        sim_ax.set_xlabel('Doublet score')
-+        sim_ax.set_ylabel('Prob. density')
- 
-     fig.tight_layout()
- 
-diff --git a/scanpy/external/pp/_scrublet.py b/scanpy/external/pp/_scrublet.py
-index 7ebb7b5b..85682088 100644
---- a/scanpy/external/pp/_scrublet.py
-+++ b/scanpy/external/pp/_scrublet.py
-@@ -1,6 +1,7 @@
- from anndata import AnnData
- from typing import Optional
- import numpy as np
-+import pandas as pd
- from scipy import sparse
- 
- 
-@@ -12,6 +13,7 @@ from ...get import _get_obs_rep
- def scrublet(
-     adata: AnnData,
-     adata_sim: Optional[AnnData] = None,
-+    batch_key: str = None,
-     sim_doublet_ratio: float = 2.0,
-     expected_doublet_rate: float = 0.05,
-     stdev_doublet_rate: float = 0.02,
-@@ -60,6 +62,8 @@ def scrublet(
-         sc.external.pp.scrublet_simulate_doublets(), with same number of vars
-         as adata. This should have been built from adata_obs after
-         filtering genes and cells and selcting highly-variable genes.
-+    batch_key
-+        Optional `adata.obs` column name discriminating between batches.
-     sim_doublet_ratio
-         Number of doublets to simulate relative to the number of observed
-         transcriptomes.
-@@ -161,72 +165,114 @@ def scrublet(
- 
-     adata_obs = adata.copy()
- 
--    # With no adata_sim we assume the regular use case, starting with raw
--    # counts and simulating doublets
-+    def _run_scrublet(ad_obs, ad_sim=None):
- 
--    if not adata_sim:
-+        # With no adata_sim we assume the regular use case, starting with raw
-+        # counts and simulating doublets
- 
--        pp.filter_genes(adata_obs, min_cells=3)
--        pp.filter_cells(adata_obs, min_genes=3)
-+        if ad_sim is None:
- 
--        # Doublet simulation will be based on the un-normalised counts, but on the
--        # selection of genes following normalisation and variability filtering. So
--        # we need to save the raw and subset at the same time.
-+            pp.filter_genes(ad_obs, min_cells=3)
-+            pp.filter_cells(ad_obs, min_genes=3)
- 
--        adata_obs.layers['raw'] = adata_obs.X
--        pp.normalize_total(adata_obs)
-+            # Doublet simulation will be based on the un-normalised counts, but on the
-+            # selection of genes following normalisation and variability filtering. So
-+            # we need to save the raw and subset at the same time.
- 
--        # HVG process needs log'd data. If we're not using that downstream, then
--        # copy logged data to new object and subset original object based on the
--        # output.
-+            ad_obs.layers['raw'] = ad_obs.X.copy()
-+            pp.normalize_total(ad_obs)
- 
--        if log_transform:
--            pp.log1p(adata_obs)
--            pp.highly_variable_genes(adata_obs, subset=True)
--        else:
--            logged = pp.log1p(adata_obs, copy=True)
--            _ = pp.highly_variable_genes(logged)
--            adata_obs = adata_obs[:, logged.var['highly_variable']]
-+            # HVG process needs log'd data.
- 
--        # Simulate the doublets based on the raw expressions from the normalised
--        # and filtered object.
-+            logged = pp.log1p(ad_obs, copy=True)
-+            pp.highly_variable_genes(logged)
-+            ad_obs = ad_obs[:, logged.var['highly_variable']]
- 
--        adata_sim = scrublet_simulate_doublets(
--            adata_obs,
--            layer='raw',
--            sim_doublet_ratio=sim_doublet_ratio,
--            synthetic_doublet_umi_subsampling=synthetic_doublet_umi_subsampling,
-+            # Simulate the doublets based on the raw expressions from the normalised
-+            # and filtered object.
-+
-+            ad_sim = scrublet_simulate_doublets(
-+                ad_obs,
-+                layer='raw',
-+                sim_doublet_ratio=sim_doublet_ratio,
-+                synthetic_doublet_umi_subsampling=synthetic_doublet_umi_subsampling,
-+            )
-+
-+            # Now normalise simulated and observed in the same way
-+
-+            pp.normalize_total(ad_obs, target_sum=1e6)
-+            pp.normalize_total(ad_sim, target_sum=1e6)
-+
-+            if log_transform:
-+                pp.log1p(ad_obs)
-+                pp.log1p(ad_sim)
-+
-+        ad_obs = _scrublet_call_doublets(
-+            adata_obs=ad_obs,
-+            adata_sim=ad_sim,
-+            n_neighbors=n_neighbors,
-+            expected_doublet_rate=expected_doublet_rate,
-+            stdev_doublet_rate=stdev_doublet_rate,
-+            mean_center=mean_center,
-+            normalize_variance=normalize_variance,
-+            n_prin_comps=n_prin_comps,
-+            use_approx_neighbors=use_approx_neighbors,
-+            knn_dist_metric=knn_dist_metric,
-+            get_doublet_neighbor_parents=get_doublet_neighbor_parents,
-+            threshold=threshold,
-+            random_state=random_state,
-+            verbose=verbose,
-         )
- 
--        # Now normalise simulated and observed in the same way
-+        return {'obs': ad_obs.obs, 'uns': ad_obs.uns['scrublet']}
- 
--        pp.normalize_total(adata_obs, target_sum=1e6)
--        pp.normalize_total(adata_sim, target_sum=1e6)
-+    if batch_key is not None:
-+        if batch_key not in adata.obs.keys():
-+            raise ValueError(
-+                '`batch_key` must be a column of .obs in the input annData object.'
-+            )
- 
--    adata_obs = _scrublet_call_doublets(
--        adata_obs=adata_obs,
--        adata_sim=adata_sim,
--        n_neighbors=n_neighbors,
--        expected_doublet_rate=expected_doublet_rate,
--        stdev_doublet_rate=stdev_doublet_rate,
--        mean_center=mean_center,
--        normalize_variance=normalize_variance,
--        n_prin_comps=n_prin_comps,
--        use_approx_neighbors=use_approx_neighbors,
--        knn_dist_metric=knn_dist_metric,
--        get_doublet_neighbor_parents=get_doublet_neighbor_parents,
--        threshold=threshold,
--        random_state=random_state,
--        verbose=verbose,
--    )
-+        # Run Scrublet independently on batches and return just the
-+        # scrublet-relevant parts of the objects to add to the input object
- 
--    logg.info('    Scrublet finished', time=start)
-+        batches = np.unique(adata.obs[batch_key])
-+        scrubbed = [
-+            _run_scrublet(
-+                adata_obs[
-+                    adata_obs.obs[batch_key] == batch,
-+                ],
-+                adata_sim,
-+            )
-+            for batch in batches
-+        ]
-+        scrubbed_obs = pd.concat([scrub['obs'] for scrub in scrubbed])
-+
-+        # Now reset the obs to get the scrublet scores
-+
-+        adata.obs = scrubbed_obs.loc[adata.obs_names.values]
-+
-+        # Save the .uns from each batch separately
-+
-+        adata.uns['scrublet'] = {}
-+        adata.uns['scrublet']['batches'] = dict(
-+            zip(batches, [scrub['uns'] for scrub in scrubbed])
-+        )
- 
--    # Copy outcomes to input object from our processed version
-+        # Record that we've done batched analysis, so e.g. the plotting
-+        # function knows what to do.
- 
--    adata.obs['doublet_score'] = adata_obs.obs['doublet_score']
--    adata.obs['predicted_doublet'] = adata_obs.obs['predicted_doublet']
--    adata.uns['scrublet'] = adata_obs.uns['scrublet']
-+        adata.uns['scrublet']['batched_by'] = batch_key
-+
-+    else:
-+        scrubbed = _run_scrublet(adata_obs, adata_sim)
-+
-+        # Copy outcomes to input object from our processed version
-+
-+        adata.obs['doublet_score'] = scrubbed['obs']['doublet_score']
-+        adata.obs['predicted_doublet'] = scrubbed['obs']['predicted_doublet']
-+        adata.uns['scrublet'] = scrubbed['uns']
-+
-+    logg.info('    Scrublet finished', time=start)
- 
-     if copy:
-         return adata
diff --git a/setup.py b/setup.py
index 5d81c797..f544bfbc 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="scanpy-scripts",
-    version="1.1.9",
+    version="1.9.301",
     author="nh3",
     author_email="nh3@users.noreply.github.com",
     description="Scripts for using scanpy from the command line",
@@ -49,8 +49,10 @@
         "Click<8",
         # "umap-learn",
         "harmonypy>=0.0.5",
-        "bbknn>=1.5.0",
+        "bbknn>=1.5.0,<1.6.0",
         "mnnpy>=0.1.9.5",
+        "scipy<1.9.0",
+        "scikit-learn<1.3.0",
         "scrublet",
         # "scikit-misc",
         "fa2",

From 8b620990ec45806f5fb273998cbe1552e930f8fa Mon Sep 17 00:00:00 2001
From: Pablo Moreno <pablo.moreno@astrazeneca.com>
Date: Mon, 19 Feb 2024 22:13:13 +0000
Subject: [PATCH 6/8] Versioning suggestion

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 34b5e512..9f279dec 100644
--- a/README.md
+++ b/README.md
@@ -63,3 +63,7 @@ Commands:
   multiplet  Execute methods for multiplet removal.
   plot       Visualise data.
   ```
+
+  ## Versioning
+
+  Major and major versions will follow the scanpy versions. The first digit of the patch should follow the scanpy patch version as well, subsequent digits in the patch are reserved for changes in this repository.

From 1e6a6af3446962fddd0fca1e8b0d66d7bbfe058a Mon Sep 17 00:00:00 2001
From: Pablo Moreno <pcm32@users.noreply.github.com>
Date: Tue, 20 Feb 2024 14:15:47 +0000
Subject: [PATCH 7/8] Update setup.py

Co-authored-by: Pedro Madrigal <8195212+pmb59@users.noreply.github.com>
---
 setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/setup.py b/setup.py
index f544bfbc..b44ebb97 100644
--- a/setup.py
+++ b/setup.py
@@ -54,7 +54,6 @@
         "scipy<1.9.0",
         "scikit-learn<1.3.0",
         "scrublet",
-        # "scikit-misc",
         "fa2",
     ],
 )

From 3f18e414a6a66589950bea3257bd0bd0a68878fa Mon Sep 17 00:00:00 2001
From: Pablo Moreno <pcm32@users.noreply.github.com>
Date: Tue, 20 Feb 2024 14:16:10 +0000
Subject: [PATCH 8/8] Improve wording of README

Co-authored-by: Pedro Madrigal <8195212+pmb59@users.noreply.github.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9f279dec..57f7ec1c 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ A command-line interface for functions of the Scanpy suite, to facilitate flexib
 
 ## Install
 
-The recommended way of using this package is through the latest container produced by Bioconda [here](https://quay.io/repository/biocontainers/scanpy-scripts?tab=tags). If you must, prefer installing scanpy-scripts is via conda:
+The recommended way of using this package is through the latest container produced by Bioconda [here](https://quay.io/repository/biocontainers/scanpy-scripts?tab=tags). If you must, one can install scanpy-scripts via conda:
 
 ```bash
 conda install scanpy-scripts