Merge pull request #116 from sbslee/0.22.0-dev

sbslee · web-flow · commit f6ad1d30241e · 2023-12-10T15:29:37.000-08:00
0.22.0 dev
diff --git a/.readthedocs.yml b/.readthedocs.yml
@@ -5,6 +5,12 @@
 # Required
 version: 2
 
+# Set the OS, Python version and other tools you might need
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.7"
+
 # Build documentation in the docs/ directory with Sphinx
 sphinx:
   configuration: docs/conf.py
@@ -19,6 +25,5 @@ sphinx:
 
 # Optionally set the version of Python and requirements required to build your docs
 python:
-  version: 3.7
   install:
     - requirements: docs/requirements.txt
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,6 +1,13 @@
 Changelog
 *********
 
+0.22.0 (2023-12-11)
+-------------------
+
+* :issue:`100`: Add new method :meth:`sdk.utils.get_bundle_path` to allow the ``pypgx-bundle`` directory to be kept in a custom location instead of the user's home directory.
+* :issue:`114`: Fix bug in :meth:`api.core.get_recommendation` method where string ``'None'`` was treated as missing value by ``pandas.read_csv`` version 2.0 or higher.
+* :issue:`113`: Fix bug in :meth:`api.utils.estimate_phase_beagle` method where Beagle's expectation-maximization algorithm estimated a parameter value that was outside the permitted range.
+
 0.21.0 (2023-08-25)
 -------------------
 
diff --git a/README.rst b/README.rst
@@ -229,19 +229,29 @@ structural variant classifier files in PyPGx are moved to the
 (only those files are moved; other files such as ``allele-table.csv`` and
 ``variant-table.csv`` are intact). Therefore, the user must clone the
 ``pypgx-bundle`` repository with matching PyPGx version to their home
-directory in order for PyPGx to correctly access the moved files:
+directory in order for PyPGx to correctly access the moved files (replace
+``x.x.x`` with the version number of PyPGx you're using, e.g. ``0.18.0``):
 
 .. code-block:: text
 
    $ cd ~
-   $ git clone --branch 0.12.0 --depth 1 https://github.com/sbslee/pypgx-bundle
+   $ git clone --branch x.x.x --depth 1 https://github.com/sbslee/pypgx-bundle
 
 This is undoubtedly annoying, but absolutely necessary for portability
 reasons because PyPGx has been growing exponentially in file size due to the
 increasing number of genes supported and their variation complexity, to the
 point where it now exceeds upload size limit for PyPI (100 Mb). After removal
 of those files, the size of PyPGx has reduced from >100 Mb to <1 Mb.
 
+Starting with version 0.22.0, the ``pypgx-bundle`` directory no longer has
+to live in the home directory. To use a custom location, set the
+``PYPGX_BUNDLE`` environment variable to the path of the ``pypgx-bundle``
+directory:
+
+.. code-block:: text
+
+   $ export PYPGX_BUNDLE=/path/to/pypgx-bundle
+
 Structural variation detection
 ==============================
 
@@ -756,7 +766,7 @@ For getting help on the CLI:
        test-cnv-caller     Test CNV caller for target gene.
        train-cnv-caller    Train CNV caller for target gene.
    
-   optional arguments:
+   options:
      -h, --help            Show this help message and exit.
      -v, --version         Show the version number and exit.
 
diff --git a/docs/cli.rst b/docs/cli.rst
@@ -65,7 +65,7 @@ For getting help on the CLI:
        test-cnv-caller     Test CNV caller for target gene.
        train-cnv-caller    Train CNV caller for target gene.
    
-   optional arguments:
+   options:
      -h, --help            Show this help message and exit.
      -v, --version         Show the version number and exit.
 
@@ -409,7 +409,7 @@ estimate-phase-beagle
      -h, --help         Show this help message and exit.
      --panel PATH       VCF file (compressed or uncompressed) corresponding to a
                         reference haplotype panel. By default, the 1KGP panel in
-                        the ~/pypgx-bundle directory will be used.
+                        the pypgx-bundle directory will be used.
      --impute           Perform imputation of missing genotypes.
 
 filter-samples
@@ -700,7 +700,7 @@ predict-cnv
    Optional arguments:
      -h, --help         Show this help message and exit.
      --cnv-caller PATH  Archive file with the semantic type Model[CNV]. By
-                        default, a pre-trained CNV caller in the ~/pypgx-bundle
+                        default, a pre-trained CNV caller in the pypgx-bundle
                         directory will be used.
 
 prepare-depth-of-coverage
@@ -813,7 +813,7 @@ run-chip-pipeline
                            (choices: 'GRCh37', 'GRCh38').
      --panel PATH          VCF file corresponding to a reference haplotype panel
                            (compressed or uncompressed). By default, the 1KGP
-                           panel in the ~/pypgx-bundle directory will be used.
+                           panel in the pypgx-bundle directory will be used.
      --impute              Perform imputation of missing genotypes.
      --force               Overwrite output directory if it already exists.
      --samples TEXT [TEXT ...]
@@ -911,7 +911,7 @@ run-ngs-pipeline
                            (choices: 'GRCh37', 'GRCh38').
      --panel PATH          VCF file corresponding to a reference haplotype panel
                            (compressed or uncompressed). By default, the 1KGP panel
-                           in the ~/pypgx-bundle directory will be used.
+                           in the pypgx-bundle directory will be used.
      --force               Overwrite output directory if it already exists.
      --samples TEXT [TEXT ...]
                            Specify which samples should be included for analysis
@@ -926,7 +926,7 @@ run-ngs-pipeline
      --do-not-plot-allele-fraction
                            Do not plot allele fraction profile.
      --cnv-caller PATH     Archive file with the semantic type Model[CNV]. By
-                           default, a pre-trained CNV caller in the ~/pypgx-bundle
+                           default, a pre-trained CNV caller in the pypgx-bundle
                            directory will be used.
    
    [Example] To genotype the CYP3A5 gene, which does not have SV, from WGS data:
diff --git a/docs/create.py b/docs/create.py
@@ -256,19 +256,29 @@
 (only those files are moved; other files such as ``allele-table.csv`` and
 ``variant-table.csv`` are intact). Therefore, the user must clone the
 ``pypgx-bundle`` repository with matching PyPGx version to their home
-directory in order for PyPGx to correctly access the moved files:
+directory in order for PyPGx to correctly access the moved files (replace
+``x.x.x`` with the version number of PyPGx you're using, e.g. ``0.18.0``):
 
 .. code-block:: text
 
    $ cd ~
-   $ git clone --branch 0.12.0 --depth 1 https://github.com/sbslee/pypgx-bundle
+   $ git clone --branch x.x.x --depth 1 https://github.com/sbslee/pypgx-bundle
 
 This is undoubtedly annoying, but absolutely necessary for portability
 reasons because PyPGx has been growing exponentially in file size due to the
 increasing number of genes supported and their variation complexity, to the
 point where it now exceeds upload size limit for PyPI (100 Mb). After removal
 of those files, the size of PyPGx has reduced from >100 Mb to <1 Mb.
 
+Starting with version 0.22.0, the ``pypgx-bundle`` directory no longer has
+to live in the home directory. To use a custom location, set the
+``PYPGX_BUNDLE`` environment variable to the path of the ``pypgx-bundle``
+directory:
+
+.. code-block:: text
+
+   $ export PYPGX_BUNDLE=/path/to/pypgx-bundle
+
 Structural variation detection
 ==============================
 
diff --git a/docs/faq.rst b/docs/faq.rst
@@ -143,3 +143,24 @@ CYP2D6*21, PyPGx will first check which of the two haplotypes contains
 2851C>T and 4181G>C and then assign 2580_2581insC to that haplotype. Note
 that the phase-by-extension algorithm can handle multiallelic sites in
 addition to biallelic sites.
+
+Genotyping multiple genes
+=========================
+
+Many users have asked if it's possible to genotype multiple genes 
+simultaneously using a pipeline command (e.g. :command:`run-ngs-pipeline`). 
+The short answer is no; all the genotyping pipelines are designed to 
+investigate a single gene at a time. However, one can easily loop through the 
+target genes to achieve the same results:
+
+.. code-block:: text
+
+   for gene in `pypgx create-regions-bed --target-genes | awk '{print $4}'`
+   do
+     pypgx run-ngs-pipeline \
+     $gene \
+     grch37-$gene-pipeline \
+     --variants grch37-variants.vcf.gz \
+     --depth-of-coverage grch37-depth-of-coverage.zip \
+     --control-statistics grch37-control-statistics-VDR.zip
+   done
diff --git a/pypgx/api/core.py b/pypgx/api/core.py
@@ -504,7 +504,11 @@ def get_paralog(gene):
     >>> import pypgx
     >>> pypgx.get_paralog('CYP2D6')
     'CYP2D7'
+    >>> pypgx.get_paralog('CYP2D7')
+    'CYP2D6'
     >>> pypgx.get_paralog('CYP2B6')
+    'CYP2B7'
+    >>> pypgx.get_paralog('CYP2E1')
     ''
     """
     df = load_gene_table()
@@ -1286,7 +1290,7 @@ def load_recommendation_table():
     4  tacrolimus  CYP3A5                      Indeterminate  None       None                                               None
     """
     b = BytesIO(pkgutil.get_data(__name__, 'data/recommendation-table.csv'))
-    return pd.read_csv(b)
+    return pd.read_csv(b, na_filter=False)
 
 def load_variant_table():
     """
diff --git a/pypgx/api/pipeline.py b/pypgx/api/pipeline.py
@@ -32,7 +32,7 @@ def run_chip_pipeline(
         Reference genome assembly.
     panel : str, optional
         VCF file corresponding to a reference haplotype panel (compressed or
-        uncompressed). By default, the 1KGP panel in the ``~/pypgx-bundle``
+        uncompressed). By default, the 1KGP panel in the ``pypgx-bundle``
         directory will be used.
     impute : bool, default: False
         If True, perform imputation of missing genotypes.
@@ -166,7 +166,7 @@ def run_ngs_pipeline(
         Reference genome assembly.
     panel : str, optional
         VCF file corresponding to a reference haplotype panel (compressed or
-        uncompressed). By default, the 1KGP panel in the ``~/pypgx-bundle``
+        uncompressed). By default, the 1KGP panel in the ``pypgx-bundle``
         directory will be used.
     force : bool, default : False
         Overwrite output directory if it already exists.
@@ -184,7 +184,7 @@ def run_ngs_pipeline(
         Do not plot allele fraction profile.
     cnv_caller : str or pypgx.Archive, optional
         Archive file or object with the semantic type Model[CNV]. By default,
-        a pre-trained CNV caller in the ``~/pypgx-bundle`` directory will be
+        a pre-trained CNV caller in the ``pypgx-bundle`` directory will be
         used.
     """
     if not core.is_target_gene(gene):
diff --git a/pypgx/api/utils.py b/pypgx/api/utils.py
@@ -794,7 +794,7 @@ def estimate_phase_beagle(
         VCF's contig names.
     panel : str, optional
         VCF file corresponding to a reference haplotype panel (compressed or
-        uncompressed). By default, the 1KGP panel in the ``~/pypgx-bundle``
+        uncompressed). By default, the 1KGP panel in the ``pypgx-bundle``
         directory will be used.
     impute : bool, default: False
         If True, perform imputation of missing genotypes.
@@ -819,8 +819,7 @@ def estimate_phase_beagle(
     metadata['Program'] = 'Beagle'
 
     if panel is None:
-        home = os.path.expanduser('~')
-        panel = f'{home}/pypgx-bundle/1kgp/{assembly}/{gene}.vcf.gz'
+        panel = f'{sdk.get_bundle_path()}/1kgp/{assembly}/{gene}.vcf.gz'
 
     has_chr_prefix = pyvcf.has_chr_prefix(panel)
 
@@ -839,6 +838,31 @@ def estimate_phase_beagle(
     if metadata['Platform'] == 'Chip':
         vf1 = vf1.filter_gsa()
 
+    def run_beagle(vf1, em):  # em: 'true' or 'false', forwarded as Beagle's em= option
+        with tempfile.TemporaryDirectory() as t:
+            vf1.to_file(f'{t}/input.vcf')
+            command = [
+                'java', '-Xmx2g', '-jar', beagle,
+                f'gt={t}/input.vcf',
+                f'chrom={region}',
+                f'ref={panel}',
+                f'out={t}/output',
+                f'impute={str(impute).lower()}',
+                f'em={em}'
+            ]
+            subprocess.run(  # raises CalledProcessError; stderr is captured for the caller
+                command,
+                check=True,
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.PIPE
+            )
+            vf3 = pyvcf.VcfFrame.from_file(f'{t}/output.vcf.gz')
+            if common_samples:  # undo the temporary '_TEMP' renaming; must rebind vf3
+                vf3 = vf3.rename({f'{x}_TEMP': x for x in common_samples})
+            if has_chr_prefix:  # must rebind vf3, otherwise the change is discarded
+                vf3 = vf3.update_chr_prefix('remove')
+        return vf3
+
     # Beagle will throw an error if there is only one marker overlapping with
     # the reference panel in a given window. This typically occurs when the
     # input VCF has very few markers or only one marker. Therefore, these
@@ -863,39 +887,26 @@ def estimate_phase_beagle(
         common_samples = list(set(vf1.samples) & set(vf2.samples))
         if common_samples:
             vf1 = vf1.rename({x: f'{x}_TEMP' for x in common_samples})
-        with tempfile.TemporaryDirectory() as t:
-            vf1.to_file(f'{t}/input.vcf')
-            command = [
-                'java', '-Xmx2g', '-jar', beagle,
-                f'gt={t}/input.vcf',
-                f'chrom={region}',
-                f'ref={panel}',
-                f'out={t}/output',
-                f'impute={str(impute).lower()}'
-            ]
-            try:
-                subprocess.run(
-                    command,
-                    check=True,
-                    stdout=subprocess.DEVNULL,
-                    stderr=subprocess.PIPE
-                )
-                vf3 = pyvcf.VcfFrame.from_file(f'{t}/output.vcf.gz')
-                if common_samples:
-                    vf3 = vf3.rename({f'{x}_TEMP': x for x in common_samples})
-                if has_chr_prefix:
-                    vf3 = vf3.update_chr_prefix('remove')
+
+        try:
+            vf3 = run_beagle(vf1, em='true')
+        except subprocess.CalledProcessError as e:
+            message = e.stderr.decode()
             # Beagle may throw an error even when multiple overlapping markers 
             # exist because they are too distant from each other -- that is, 
             # located in separate haplotype windows.
-            except subprocess.CalledProcessError as e:
-                message = e.stderr.decode()
-                if "Window has only one position" in message:
-                    warnings.warn("Beagle: Window has only one position")
-                    vf3 = pyvcf.VcfFrame([], vf1.df[0:0])
-                else:
-                    print(message)
-                    raise e
+            if "Window has only one position" in message:
+                warnings.warn("Beagle: Window has only one position")
+                vf3 = pyvcf.VcfFrame([], vf1.df[0:0])
+            # Beagle will throw an error if the expectation-maximization 
+            # algorithm estimates a parameter value outside the permitted 
+            # range. When this happens, we skip the expectation-maximization.
+            elif "IllegalArgumentException: 1.0" in message:
+                warnings.warn("Beagle: Expectation-maximization skipped")
+                vf3 = run_beagle(vf1, em='false')
+            else:
+                print(message)
+                raise e
 
     return sdk.Archive(metadata, vf3)
 
@@ -1203,7 +1214,7 @@ def predict_cnv(copy_number, cnv_caller=None):
         Archive file or object with the semantic type CovFrame[CopyNumber].
     cnv_caller : str or pypgx.Archive, optional
         Archive file or object with the semantic type Model[CNV]. By default,
-        a pre-trained CNV caller in the ``~/pypgx-bundle`` directory will be
+        a pre-trained CNV caller in the ``pypgx-bundle`` directory will be
         used.
 
     Returns
@@ -1218,8 +1229,7 @@ def predict_cnv(copy_number, cnv_caller=None):
 
     gene = copy_number.metadata['Gene']
     assembly = copy_number.metadata['Assembly']
-    home = os.path.expanduser('~')
-    model_file = f'{home}/pypgx-bundle/cnv/{assembly}/{gene}.zip'
+    model_file = f'{sdk.get_bundle_path()}/cnv/{assembly}/{gene}.zip'
 
     if cnv_caller is None:
         cnv_caller = sdk.Archive.from_file(model_file)
diff --git a/pypgx/cli/estimate_phase_beagle.py b/pypgx/cli/estimate_phase_beagle.py
@@ -41,7 +41,7 @@ def create_parser(subparsers):
         help=
 """VCF file (compressed or uncompressed) corresponding to a
 reference haplotype panel. By default, the 1KGP panel in
-the ~/pypgx-bundle directory will be used."""
+the pypgx-bundle directory will be used."""
     )
     parser.add_argument(
         '--impute',
diff --git a/pypgx/cli/predict_cnv.py b/pypgx/cli/predict_cnv.py
@@ -39,7 +39,7 @@ def create_parser(subparsers):
         metavar='PATH',
         help=
 """Archive file with the semantic type Model[CNV]. By
-default, a pre-trained CNV caller in the ~/pypgx-bundle
+default, a pre-trained CNV caller in the pypgx-bundle
 directory will be used."""
     )
 
diff --git a/pypgx/cli/run_chip_pipeline.py b/pypgx/cli/run_chip_pipeline.py
@@ -57,7 +57,7 @@ def create_parser(subparsers):
         help=
 """VCF file corresponding to a reference haplotype panel
 (compressed or uncompressed). By default, the 1KGP
-panel in the ~/pypgx-bundle directory will be used."""
+panel in the pypgx-bundle directory will be used."""
     )
     parser.add_argument(
         '--impute',
diff --git a/pypgx/cli/run_ngs_pipeline.py b/pypgx/cli/run_ngs_pipeline.py
diff --git a/pypgx/sdk/__init__.py b/pypgx/sdk/__init__.py
diff --git a/pypgx/sdk/utils.py b/pypgx/sdk/utils.py
diff --git a/pypgx/version.py b/pypgx/version.py

Original file line number	Diff line number	Diff line change
`@@ -41,7 +41,7 @@ def create_parser(subparsers):`
`41`	`41`	`help=`
`42`	`42`	`"""VCF file (compressed or uncompressed) corresponding to a`
`43`	`43`	`reference haplotype panel. By default, the 1KGP panel in`
`44`		`-the ~/pypgx-bundle directory will be used."""`
	`44`	`+the pypgx-bundle directory will be used."""`
`45`	`45`	`)`
`46`	`46`	`parser.add_argument(`
`47`	`47`	`'--impute',`
Original file line number	Diff line number	Diff line change
`@@ -39,7 +39,7 @@ def create_parser(subparsers):`
`39`	`39`	`metavar='PATH',`
`40`	`40`	`help=`
`41`	`41`	`"""Archive file with the semantic type Model[CNV]. By`
`42`		`-default, a pre-trained CNV caller in the ~/pypgx-bundle`
	`42`	`+default, a pre-trained CNV caller in the pypgx-bundle`
`43`	`43`	`directory will be used."""`
`44`	`44`	`)`
`45`	`45`
Original file line number	Diff line number	Diff line change
`@@ -57,7 +57,7 @@ def create_parser(subparsers):`
`57`	`57`	`help=`
`58`	`58`	`"""VCF file corresponding to a reference haplotype panel`
`59`	`59`	`(compressed or uncompressed). By default, the 1KGP`
`60`		`-panel in the ~/pypgx-bundle directory will be used."""`
	`60`	`+panel in the pypgx-bundle directory will be used."""`
`61`	`61`	`)`
`62`	`62`	`parser.add_argument(`
`63`	`63`	`'--impute',`