diff --git a/docs/conf.py b/docs/conf.py index 48b41634..1e8d725a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -62,7 +62,7 @@ # The short X.Y version. version = u'0.7' # The full version, including alpha/beta/rc tags. -release = u'0.7.0' +release = u'0.7.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 48972404..71e1fdbc 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -237,39 +237,20 @@ Testing multiple hypothesis on the same dataset increases the chance of receivin However, GPSEA simplifies the application of an appropriate multiple testing correction. For general use, we recommend using a combination -of a *Phenotype MTC filter* (:class:`~gpsea.analysis.mtc_filter.PhenotypeMtcFilter`) with a *multiple testing correction*. -Phenotype MTC filter chooses the HPO terms to test according to several heuristics, which +of a *phenotype MT filter* (:class:`~gpsea.analysis.mtc_filter.PhenotypeMtcFilter`) with a *multiple testing correction*. +Phenotype MT filter chooses the HPO terms to test according to several heuristics, which reduce the multiple testing burden and focus the analysis -on the most interesting terms (see :ref:`HPO MTC filter <hpo-mtc-filter-strategy>` for more info). +on the most interesting terms (see :ref:`HPO MT filter <hpo-mtc-filter-strategy>` for more info). Then the multiple testing correction, such as Bonferroni or Benjamini-Hochberg, is used to control the family-wise error rate or the false discovery rate. See :ref:`mtc` for more information. 
-In this example, we will use a combination of the HPO MTC filter (:class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`) -with Benjamini-Hochberg procedure (``mtc_correction='fdr_bh'``) -with a false discovery control level at (``mtc_alpha=0.05``): +>>> from gpsea.analysis.pcats import configure_hpo_term_analysis +>>> analysis = configure_hpo_term_analysis(hpo) ->>> from gpsea.analysis.mtc_filter import HpoMtcFilter ->>> mtc_filter = HpoMtcFilter.default_filter(hpo) ->>> mtc_correction = 'fdr_bh' ->>> mtc_alpha = 0.05 - -Choosing the statistical procedure for assessment of association between genotype and phenotype -groups is the last missing piece of the analysis. We will use Fisher Exact Test: - ->>> from gpsea.analysis.pcats.stats import FisherExactTest ->>> count_statistic = FisherExactTest() - -and we finalize the analysis setup by putting all components together -into :class:`~gpsea.analysis.pcats.HpoTermAnalysis`: - ->>> from gpsea.analysis.pcats import HpoTermAnalysis ->>> analysis = HpoTermAnalysis( -... count_statistic=count_statistic, -... mtc_filter=mtc_filter, -... mtc_correction=mtc_correction, -... mtc_alpha=mtc_alpha, -... ) +:func:`~gpsea.analysis.pcats.configure_hpo_term_analysis` configures the analysis +that uses HPO MTC filter (:class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`) for selecting HPO terms of interest, +Fisher Exact test for computing nominal p values, and Benjamini-Hochberg for multiple testing correction. Now we can perform the analysis and investigate the results. diff --git a/docs/user-guide/analyses/phenotype-groups.rst b/docs/user-guide/analyses/phenotype-groups.rst index 86b72f59..d9a7b811 100644 --- a/docs/user-guide/analyses/phenotype-groups.rst +++ b/docs/user-guide/analyses/phenotype-groups.rst @@ -167,59 +167,84 @@ The function finds 369 HPO terms that annotate at least one individual, including the *indirect* annotations whose presence is implied by the :ref:`true-path-rule`. 
-Statistical test ----------------- +Statistical analysis +-------------------- We will use :ref:`fisher-exact-test` to test the association between genotype and phenotype groups, as described previously. ->>> from gpsea.analysis.pcats.stats import FisherExactTest ->>> count_statistic = FisherExactTest() +In the case of this cohort, we can test association between having a frameshift variant and one of 369 HPO terms. +However, testing multiple hypotheses on the same dataset increases the risk of finding +a significant association solely by chance. +GPSEA uses a two-pronged strategy to reduce the number of tests and, therefore, mitigate this risk: +a phenotype multiple testing (MT) filter and multiple testing correction (MTC). -FET will compute a p value for each genotype phenotype group. +Phenotype MT filter selects a (sub)set of HPO terms for testing, +for instance only the user-selected terms (see :class:`~gpsea.analysis.mtc_filter.SpecifyTermsStrategy`) +or the terms selected by :class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`. +Multiple testing correction then adjusts the nominal p values for the increased risk +of false positive G/P associations. +The available MTC procedures are listed in the :ref:`mtc-correction-procedures` section. -Multiple testing correction ---------------------------- +We must pick one of these to perform genotype-phenotype analysis. -In the case of this cohort, we could test association between having a frameshift variant and one of 369 HPO terms. -However, testing multiple hypotheses on the same dataset increases the risk of finding a significant association -by chance. -GPSEA uses a two-pronged strategy to mitigate this risk - a phenotype MTC filter and multiple testing correction. -.. note:: +Default analysis +^^^^^^^^^^^^^^^^ + +We recommend using HPO MT filter (:class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`) as a phenotype MT filter +and Benjamini-Hochberg for multiple testing correction. 
+The default analysis can be configured with the :func:`~gpsea.analysis.pcats.configure_hpo_term_analysis` convenience function. + +>>> from gpsea.analysis.pcats import configure_hpo_term_analysis +>>> analysis = configure_hpo_term_analysis(hpo) - See the :ref:`mtc` section for more info on multiple testing procedures. -Here we will use a combination of the HPO MTC filter (:class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`) -with Benjamini-Hochberg procedure (``mtc_correction='fdr_bh'``) -with a false discovery control level set to `0.05` (``mtc_alpha=0.05``): +Custom analysis +^^^^^^^^^^^^^^^ + +If the defaults do not work, we can configure the analysis manually. +First, we choose a phenotype MT filter (e.g. :class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`): >>> from gpsea.analysis.mtc_filter import HpoMtcFilter ->>> mtc_filter = HpoMtcFilter.default_filter(hpo, term_frequency_threshold=0.2) ->>> mtc_correction = 'fdr_bh' ->>> mtc_alpha = 0.05 +>>> mtc_filter = HpoMtcFilter.default_filter(hpo, term_frequency_threshold=.2) + +.. note:: + See the :ref:`mtc-filters` section for more info on the available MT filters. -Final analysis -------------- +then a statistical test (e.g. Fisher Exact test): + +>>> from gpsea.analysis.pcats.stats import FisherExactTest +>>> count_statistic = FisherExactTest() + +.. note:: + + See the :mod:`gpsea.analysis.pcats.stats` module for the available statistical tests + (TL;DR, just Fisher Exact test at this time). + +and we finalize the setup by choosing an MTC procedure +(e.g. `fdr_bh` for Benjamini-Hochberg) along with the MTC alpha: + +>>> mtc_correction = 'fdr_bh' +>>> mtc_alpha = 0.05 -We finalize the analysis setup by putting all components together -into :class:`~gpsea.analysis.pcats.HpoTermAnalysis`: +The final :class:`~gpsea.analysis.pcats.HpoTermAnalysis` is created as: >>> from gpsea.analysis.pcats import HpoTermAnalysis >>> analysis = HpoTermAnalysis( ... count_statistic=count_statistic, ... mtc_filter=mtc_filter, -... 
mtc_correction=mtc_correction, -... mtc_alpha=mtc_alpha, +... mtc_correction='fdr_bh', +... mtc_alpha=0.05, ... ) Analysis ======== -We can now execute the analysis: +We can now test associations between the genotype groups and the HPO terms: >>> result = analysis.compare_genotype_vs_phenotypes( ... cohort=cohort, @@ -232,8 +257,8 @@ We can now execute the analysis: 24 -Thanks to phenotype MTC filter, we only tested 24 out of 369 terms. -We can learn more by showing the MTC filter report: +Thanks to phenotype MT filter, we only tested 24 out of 369 terms. +We can learn more by showing the MT filter report: >>> from gpsea.view import MtcStatsViewer >>> mtc_viewer = MtcStatsViewer() diff --git a/docs/user-guide/mtc.rst b/docs/user-guide/mtc.rst index ffae4ded..b0fb478a 100644 --- a/docs/user-guide/mtc.rst +++ b/docs/user-guide/mtc.rst @@ -36,8 +36,9 @@ Therefore, unless we take into account the fact that multiple statistical tests it is likely that we will obtain one or more false-positive results. GPSEA offers two approaches to mitigate this problem: multiple-testing correction (MTC) procedures -and MTC filters to choose the terms to be tested. +and MT filters to choose the terms to be tested. +.. _mtc-correction-procedures: Multiple-testing correction procedures ====================================== @@ -106,7 +107,7 @@ when creating an instance of :class:`~gpsea.analysis.pcats.HpoTermAnalysis`: .. _mtc-filters: -MTC filters: Choosing which terms to test +MT filters: Choosing which terms to test ========================================= We can reduce the overall MTC burden by choosing which terms to test. @@ -117,18 +118,7 @@ may "survive" the multiple-testing correction. In the context of GPSEA, we represent the concept of phenotype filtering by :class:`~gpsea.analysis.mtc_filter.PhenotypeMtcFilter`. 
-The filter must be chosen before the :class:`~gpsea.analysis.pcats.MultiPhenotypeAnalysis`, -such as :class:`~gpsea.analysis.pcats.HpoTermAnalysis`, is run: - ->>> from gpsea.analysis.pcats import HpoTermAnalysis ->>> analysis = HpoTermAnalysis() # doctest: +ELLIPSIS -Traceback (most recent call last): - ... -TypeError: HpoTermAnalysis.__init__() missing 2 required positional arguments: 'count_statistic' and 'mtc_filter' - -Note the missing `mtc_filter` option. - -We describe the three filtering strategies in the following sections. +We provide three filtering strategies. .. _use-all-terms-strategy: @@ -136,9 +126,9 @@ We describe the three filtering strategies in the following sections. Test all terms -------------- -The first MTC filtering strategy is the simplest - do not apply any filtering at all. -This will result in testing all terms. We do not recommend this strategy, -but it can be useful to disable MTC filtering. +The first MT filtering strategy is the simplest - do not apply any filtering at all. +This will result in testing all terms and we do not recommend this strategy, +but it can be used to disable MT filtering. The strategy is implemented in :class:`~gpsea.analysis.mtc_filter.UseAllTermsMtcFilter`. @@ -171,10 +161,10 @@ we pass an iterable (e.g. a tuple) with these two terms as an argument: .. _hpo-mtc-filter-strategy: -HPO MTC filter strategy +HPO MT filter strategy ----------------------- -Last, the HPO MTC strategy involves making several domain judgments to take advantage of the HPO structure. +The HPO MT strategy involves making several domain judgments and takes advantage of the HPO structure. 
The strategy needs access to HPO: >>> import hpotk diff --git a/docs/user-guide/predicates/devries.rst b/docs/user-guide/predicates/devries.rst index 37989c86..84bbd948 100644 --- a/docs/user-guide/predicates/devries.rst +++ b/docs/user-guide/predicates/devries.rst @@ -15,7 +15,7 @@ Statistical significance of a difference in the De Vries score between groups ca determined using the Mann-Whitney-U test. We refer to `Feenstra et al. (2011) `_ for -the original description of the adjusted De Vries score. Here we offer a version of the +the original description of the adjusted De Vries score. Here we offer an adapted version of the score that leverages the structure of the Human Phenotype Ontology to assess the phenotype. @@ -113,38 +113,54 @@ is 2 because the same individual cannot have both tall and short stature or both Facial dysmorphic features ~~~~~~~~~~~~~~~~~~~~~~~~~~ -This section assigns two points if two or more anomalies are identified in the following -categories: hypertelorism, nasal anomalies and ear anomalies. Our implementation of this feature counts the total -number of terms or descendents of the following HPO terms. +This section assigns two points if two or more facial dysmorphisms are identified. In contrast to the list of anomalies described +in the original 2011 publication of the DeVries score, we leverage the structure of the HPO to include many more HPO terms that +denote various kinds of facial dysmorphism (e.g., `Abnormality of globe location `_ instead of just +`Hypertelorism (HP:0000316) `_). + +Our implementation of this feature counts the total number of terms or descendents of the following HPO terms. Up to one point is given +for each of the categories. 
+----------------------------------------------------------------------------------------------------------+-----------+ | HPO term | Score | +==========================================================================================================+===========+ -| `Hypertelorism (HP:0000316) `_ | 1 | +| `Abnormality of globe location (HP:0000316) `_ | 0 or 1 | ++----------------------------------------------------------------------------------------------------------+-----------+ +| `Abnormal lip morphology (HP:0000159) `_ | 0 or 1 | ++----------------------------------------------------------------------------------------------------------+-----------+ +| `Abnormal facial shape (HP:0001999) `_ | 0 or 1 | ++----------------------------------------------------------------------------------------------------------+-----------+ +| `Abnormal midface morphology (HP:0000309) `_ | 0 or 1 | +----------------------------------------------------------------------------------------------------------+-----------+ -| `Abnormal external nose morphology (HP:0010938) `_ | 1 each | +| `Abnormal forehead morphology (HP:0000290) `_ | 0 or 1 | +----------------------------------------------------------------------------------------------------------+-----------+ -| `Abnormal pinna morphology (HP:0000377) `_ | 1 each | +| `Abnormal chin morphology (HP:0000306) `_ | 0 or 1 | ++----------------------------------------------------------------------------------------------------------+-----------+ +| `Abnormal external nose morphology (HP:0010938) `_ | 0 or 1 | ++----------------------------------------------------------------------------------------------------------+-----------+ +| `Abnormal pinna morphology (HP:0000377) `_ | 0 or 1 | +----------------------------------------------------------------------------------------------------------+-----------+ -If two or more terms are found, the score is 2, otherwise a score of zero is assigned. 
+If items from two or more categories are found, the score is 2, otherwise a score of zero is assigned. Non-facial dysmorphism and congenital abnormalities ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -One point is assigned for either the -corresponding HPO terms or any of their descendents up to a maximum of two points. +One point is assigned for either the corresponding HPO terms or any of their descendents up to a maximum of two points. +A maximum of one point is assigned for each of the following categories. +----------------------------------------------------------------------------------------------------------+-----------+ | HPO term | Score | +==========================================================================================================+===========+ -| `Abnormal hand morphology (HP:0005922) `_ | 1 each | +| `Abnormal hand morphology (HP:0005922) `_ | 0 or 1 | +----------------------------------------------------------------------------------------------------------+-----------+ -| `Abnormal heart morphology (HP:0001627) `_ | 1 each | +| `Abnormal heart morphology (HP:0001627) `_ | 0 or 1 | +----------------------------------------------------------------------------------------------------------+-----------+ -| `Hypospadias (HP:0000047) `_ | 1 | +| `Abnormal external genitalia morphology (HP:0000811) `_ | 0 or 1 | +----------------------------------------------------------------------------------------------------------+-----------+ +The score for this section can thus be 0, 1, or 2. + Final score ~~~~~~~~~~~ diff --git a/src/gpsea/__init__.py b/src/gpsea/__init__.py index cddd77e8..4b512d92 100644 --- a/src/gpsea/__init__.py +++ b/src/gpsea/__init__.py @@ -2,4 +2,4 @@ GPSEA is a library for analyzing genotype-phenotype correlations in cohorts of rare disease patients. 
""" -__version__ = "0.7.0" +__version__ = "0.7.1" diff --git a/src/gpsea/analysis/_base.py b/src/gpsea/analysis/_base.py index c132f795..4bfe4824 100644 --- a/src/gpsea/analysis/_base.py +++ b/src/gpsea/analysis/_base.py @@ -3,6 +3,7 @@ import os import typing +import numpy as np import pandas as pd from .predicate.phenotype import PhenotypePolyPredicate, P @@ -205,6 +206,48 @@ def corrected_pvals(self) -> typing.Optional[typing.Sequence[float]]: The sequence includes a `NaN` value for each phenotype that was *not* tested. """ return self._corrected_pvals + + def n_significant_for_alpha( + self, + alpha: float = .05, + ) -> typing.Optional[int]: + """ + Get the count of the corrected p values with the value being less than or equal to `alpha`. + + :param alpha: a `float` with significance level. + """ + if self.corrected_pvals is None: + return None + else: + return sum(p_val <= alpha for p_val in self.corrected_pvals) + + def significant_phenotype_indices( + self, + alpha: float = .05, + pval_kind: typing.Literal["corrected", "nominal"] = "corrected", + ) -> typing.Optional[typing.Sequence[int]]: + """ + Get the indices of phenotypes that attain significance for provided `alpha`. 
+ """ + if pval_kind == "corrected": + if self.corrected_pvals is None: + vals = None + else: + vals = np.array(self.corrected_pvals) + elif pval_kind == "nominal": + vals = np.array(self.pvals) + else: + raise ValueError(f"Unsupported `pval_kind` value {pval_kind}") + + if vals is None: + return None + + not_na = ~np.isnan(vals) + significant = vals <= alpha + selected = not_na & significant + + return tuple(int(idx) for idx in np.argsort(vals) if selected[idx]) + @property def total_tests(self) -> int: diff --git a/src/gpsea/analysis/pcats/__init__.py b/src/gpsea/analysis/pcats/__init__.py index 59eceb97..51042892 100644 --- a/src/gpsea/analysis/pcats/__init__.py +++ b/src/gpsea/analysis/pcats/__init__.py @@ -16,16 +16,20 @@ The results are provided as :class:`MultiPhenotypeAnalysisResult` (or more specific :class:`HpoTermAnalysisResult` for :class:`HpoTermAnalysis`). + +Use :func:`configure_hpo_term_analysis` to configure the HPO term analysis with the default parameters. """ from ._impl import MultiPhenotypeAnalysis, MultiPhenotypeAnalysisResult from ._impl import DiseaseAnalysis from ._impl import HpoTermAnalysis, HpoTermAnalysisResult from ._impl import apply_predicates_on_patients +from ._config import configure_hpo_term_analysis __all__ = [ 'MultiPhenotypeAnalysis', 'MultiPhenotypeAnalysisResult', 'DiseaseAnalysis', 'HpoTermAnalysis', 'HpoTermAnalysisResult', 'apply_predicates_on_patients', + 'configure_hpo_term_analysis', ] diff --git a/src/gpsea/analysis/pcats/_config.py b/src/gpsea/analysis/pcats/_config.py new file mode 100644 index 00000000..2a06fc3e --- /dev/null +++ b/src/gpsea/analysis/pcats/_config.py @@ -0,0 +1,29 @@ +import typing + +import hpotk + +from ..mtc_filter import HpoMtcFilter +from ._impl import HpoTermAnalysis +from .stats import CountStatistic, FisherExactTest + + +def configure_hpo_term_analysis( + hpo: hpotk.MinimalOntology, + count_statistic: CountStatistic = FisherExactTest(), + mtc_correction: typing.Optional[str] = "fdr_bh", + 
mtc_alpha: float = 0.05, +) -> HpoTermAnalysis: + """ + Configure HPO term analysis with default parameters. + + The default analysis will pre-filter HPO terms with :class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`, + then compute nominal p values using `count_statistic` (default Fisher exact test), + and apply multiple testing correction (default Benjamini/Hochberg (`fdr_bh`)) + with target `mtc_alpha` (default `0.05`). + """ + return HpoTermAnalysis( + mtc_filter=HpoMtcFilter.default_filter(hpo), + count_statistic=count_statistic, + mtc_correction=mtc_correction, + mtc_alpha=mtc_alpha, + ) diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py index 7c75706c..620d404f 100644 --- a/src/gpsea/analysis/pcats/_impl.py +++ b/src/gpsea/analysis/pcats/_impl.py @@ -329,6 +329,12 @@ def mtc_filter_results(self) -> typing.Sequence[PhenotypeMtcResult]: """ return self._mtc_filter_results + def n_filtered_out(self) -> int: + """ + Get the number of phenotype terms that were filtered out by the MTC filter. 
+ """ + return sum(result.is_filtered_out() for result in self.mtc_filter_results) + def __eq__(self, other): return isinstance(other, HpoTermAnalysisResult) \ and super(MultiPhenotypeAnalysisResult, self).__eq__(other) \ diff --git a/src/gpsea/analysis/predicate/genotype/_variant.py b/src/gpsea/analysis/predicate/genotype/_variant.py index c725f24f..4b24daef 100644 --- a/src/gpsea/analysis/predicate/genotype/_variant.py +++ b/src/gpsea/analysis/predicate/genotype/_variant.py @@ -57,7 +57,7 @@ def all(predicates: typing.Iterable[VariantPredicate]) -> VariantPredicate: >>> genes = ('SURF1', 'SURF2',) >>> predicate = VariantPredicates.all(VariantPredicates.gene(g) for g in genes) - >>> predicate.get_question() + >>> predicate.description '(affects SURF1 AND affects SURF2)' Args: @@ -88,7 +88,7 @@ def any(predicates: typing.Iterable[VariantPredicate]) -> VariantPredicate: >>> tx_id = 'NM_123456.7' >>> effects = (VariantEffect.MISSENSE_VARIANT, VariantEffect.STOP_GAINED,) >>> predicate = VariantPredicates.any(VariantPredicates.variant_effect(e, tx_id) for e in effects) - >>> predicate.get_question() + >>> predicate.description '(MISSENSE_VARIANT on NM_123456.7 OR STOP_GAINED on NM_123456.7)' Args: @@ -117,7 +117,7 @@ def variant_effect( >>> from gpsea.model import VariantEffect >>> from gpsea.analysis.predicate.genotype import VariantPredicates >>> predicate = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id='NM_123.4') - >>> predicate.get_question() + >>> predicate.description 'MISSENSE_VARIANT on NM_123.4' Args: @@ -278,7 +278,7 @@ def structural_type( >>> from gpsea.analysis.predicate.genotype import VariantPredicates >>> predicate = VariantPredicates.structural_type('SO:1000029') - >>> predicate.get_question() + >>> predicate.description 'structural type is SO:1000029' Args: @@ -301,7 +301,7 @@ def variant_class( >>> from gpsea.model import VariantClass >>> from gpsea.analysis.predicate.genotype import VariantPredicates >>> predicate = 
VariantPredicates.variant_class(VariantClass.DEL) - >>> predicate.get_question() + >>> predicate.description 'variant class is DEL' Args: @@ -359,7 +359,7 @@ def change_length( >>> from gpsea.analysis.predicate.genotype import VariantPredicates >>> predicate = VariantPredicates.change_length('<=', -10) - >>> predicate.get_question() + >>> predicate.description 'change length <= -10' Args: @@ -391,7 +391,7 @@ def is_structural_deletion( >>> from gpsea.analysis.predicate.genotype import VariantPredicates >>> predicate = VariantPredicates.is_structural_deletion(-20) - >>> predicate.get_question() + >>> predicate.description '(structural type is SO:1000029 OR (variant class is DEL AND change length <= -20))' Args: diff --git a/src/gpsea/analysis/pscore/_hpo.py b/src/gpsea/analysis/pscore/_hpo.py index 6187b0ea..2f12d288 100644 --- a/src/gpsea/analysis/pscore/_hpo.py +++ b/src/gpsea/analysis/pscore/_hpo.py @@ -166,7 +166,7 @@ def _developmental_delay_score( ) -> float: """ Calculate the dev delay component of the score - + Args: observed_term_ids: terms observed in patient @@ -177,32 +177,12 @@ def _developmental_delay_score( for t in observed_term_ids: if t in self._gdd_tids: return self._gdd_tids[t] - + # Intellectual disability for t in observed_term_ids: if t in self._idd_tids: return self._idd_tids[t] - - return 0 - - def _term_or_descendant( - self, - target_tid: str, - observed_term_ids: typing.Iterable[str], - ) -> int: - """ - Args: - target_tid: term of interest - observed_term_ids: all terms observed in patient - Returns: - 1 if the term or any descendant is present in the patient, otherwise 0 - """ - for term_id in observed_term_ids: - if term_id == target_tid \ - or any(ancestor == target_tid for ancestor in self._hpo.graph.get_ancestors(term_id)): - return 1 - return 0 def _term_or_descendant_count( @@ -216,14 +196,12 @@ def _term_or_descendant_count( observed_term_ids: all terms observed in patient Returns: - the total count of the terms equal to or 
descending from the target_tid + 1 if at least one term is equal to or descending from the target_tid, otherwise 0 """ - total_count = 0 for term_id in observed_term_ids: - for desc_tid in self._hpo.graph.get_ancestors(term_id, include_source=True): - if desc_tid.value == target_tid: - total_count += 1 - return total_count + if term_id == target_tid or self._hpo.graph.is_descendant_of(term_id, target_tid): + return 1 + return 0 def _postnatal_growth_score( self, @@ -231,7 +209,7 @@ def _postnatal_growth_score( ) -> int: """ Calculate the postnatal growth component of the score. - + Args: observed_term_ids: terms observed in patient @@ -243,7 +221,7 @@ def _postnatal_growth_score( tall_stature = 'HP:0000098' total_count = 0 for tid in (microcephaly, short_stature, macrocephaly, tall_stature): - total_count += self._term_or_descendant(tid, observed_term_ids) + total_count += self._term_or_descendant_count(tid, observed_term_ids) if total_count > 2: raise ValueError(f"Inconsistent annotations for postnatal growth score {total_count}: {observed_term_ids}") return total_count @@ -264,14 +242,22 @@ def _facial_dysmorphism_score( Returns: facial dysmorphism score (between 0 and 2) """ - hypertelorism = 'HP:0000316' + globe_location = 'HP:0100886' # include Hypertelorism and others + lip = 'HP:0000159' # Abnormal lip morphology HP:0000159 external_nose = 'HP:0010938' pinna_morphology = 'HP:0000377' + facial_shape = 'HP:0001999' # Abnormal facial shape + midface = 'HP:0000309' # Abnormal midface morphology + chin = 'HP:0000306' # Abnormality of the chin - # No need to inspect descendants since Hypertelorism has none. 
- total_count = 1 if hypertelorism in observed_term_ids else 0 + total_count = self._term_or_descendant_count(target_tid=globe_location, observed_term_ids=observed_term_ids) + total_count += self._term_or_descendant_count(target_tid=lip, observed_term_ids=observed_term_ids) total_count += self._term_or_descendant_count(target_tid=external_nose, observed_term_ids=observed_term_ids) total_count += self._term_or_descendant_count(target_tid=pinna_morphology, observed_term_ids=observed_term_ids) + total_count += self._term_or_descendant_count(target_tid=facial_shape, observed_term_ids=observed_term_ids) + total_count += self._term_or_descendant_count(target_tid=midface, observed_term_ids=observed_term_ids) + total_count += self._term_or_descendant_count(target_tid=chin, observed_term_ids=observed_term_ids) + if total_count > 1: return 2 else: @@ -284,24 +270,20 @@ def _congenital_score( """ Non-facial dysmorphism and congenital abnormalities component. One point is assigned for either the corresponding HPO terms or any of their descendents up to a maximum of 2. 
- + Args: observed_term_ids: terms observed in patient Returns: Non-facial dysmorphism and congenital abnormalities score (between 0 and 2) """ - hypospadias = 'HP:0000047' + abn_external_genitalia = 'HP:0000811' # Abnormal external genitalia abnormal_hand_morphology = 'HP:0005922' abnormal_heart_morphology = 'HP:0001627' - # total_count = len([t for t in observed_term_ids if t == hypospadias]) - total_count = self._term_or_descendant_count( - target_tid=hypospadias, observed_term_ids=observed_term_ids, - ) - total_count += self._term_or_descendant_count(target_tid=abnormal_hand_morphology, - observed_term_ids=observed_term_ids) - total_count += self._term_or_descendant_count(target_tid=abnormal_heart_morphology, - observed_term_ids=observed_term_ids) + + total_count = self._term_or_descendant_count(target_tid=abn_external_genitalia, observed_term_ids=observed_term_ids,) + total_count += self._term_or_descendant_count(target_tid=abnormal_hand_morphology, observed_term_ids=observed_term_ids) + total_count += self._term_or_descendant_count(target_tid=abnormal_heart_morphology, observed_term_ids=observed_term_ids) return min(2, total_count) def _prenatal_growth_score( @@ -328,7 +310,7 @@ def _prenatal_growth_score( def score(self, patient: Patient) -> float: """ Calculate score based on list of strings with term identifiers or observed HPO terms. 
- + Args: patient: list of strings with term identifiers or observed HPO terms @@ -342,5 +324,5 @@ def score(self, patient: Patient) -> float: facial_score = self._facial_dysmorphism_score(observed_term_ids) congen_score = self._congenital_score(observed_term_ids) prenatal_score = self._prenatal_growth_score(observed_term_ids) - + return delay_score + growth_score + facial_score + congen_score + prenatal_score diff --git a/src/gpsea/model/_cohort.py b/src/gpsea/model/_cohort.py index e6eecf3b..55898bb8 100644 --- a/src/gpsea/model/_cohort.py +++ b/src/gpsea/model/_cohort.py @@ -13,6 +13,12 @@ from ._variant import Variant, VariantInfo +I = typing.TypeVar('I', bound=hpotk.model.Identified) +""" +Anything that extends `Identified` (e.g. `Disease`, `Phenotype`, `Measurement`). +""" + + class Status(enum.Enum): UNKNOWN = 0 ALIVE = 1 @@ -162,6 +168,16 @@ def phenotypes(self) -> typing.Sequence[Phenotype]: Get the phenotypes observed and excluded in the patient. """ return self._phenotypes + + def phenotype_by_id( + self, + term_id: typing.Union[str, hpotk.TermId], + ) -> typing.Optional[Phenotype]: + """ + Get a phenotype with an identifier or `None` if the individual has no such phenotype. + """ + term_id = Patient._check_id(term_id) + return Patient._find_first_by_id(term_id, self.phenotypes) @property def measurements(self) -> typing.Sequence[Measurement]: @@ -181,18 +197,8 @@ def measurement_by_id( representing the term ID of a measurement (e.g. `LOINC:2986-8` for *Testosterone[Mass/Vol]*). :returns: the corresponding :class:`Measurement` or `None` if not found in the patient. 
""" - if isinstance(term_id, str): - pass - elif isinstance(term_id, hpotk.TermId): - term_id = term_id.value - else: - raise ValueError(f'`term_id` must be a `str` or `hpotk.TermId` but was {type(term_id)}') - - for m in self._measurements: - if m.identifier.value == term_id: - return m - - return None + term_id = Patient._check_id(term_id) + return Patient._find_first_by_id(term_id, self.measurements) @property def diseases(self) -> typing.Sequence[Disease]: @@ -201,6 +207,16 @@ def diseases(self) -> typing.Sequence[Disease]: """ return self._diseases + def disease_by_id( + self, + term_id: typing.Union[str, hpotk.TermId], + ) -> typing.Optional[Disease]: + """ + Get a disease with an identifier or `None` if the individual has no such disease. + """ + term_id = Patient._check_id(term_id) + return Patient._find_first_by_id(term_id, self.diseases) + @property def variants(self) -> typing.Sequence[Variant]: """ @@ -232,6 +248,28 @@ def excluded_diseases(self) -> typing.Iterator[Disease]: """ return filter(lambda d: not d.is_present, self._diseases) + @staticmethod + def _check_id( + term_id: typing.Union[str, hpotk.TermId], + ) -> hpotk.TermId: + if isinstance(term_id, str): + return hpotk.TermId.from_curie(term_id) + elif isinstance(term_id, hpotk.TermId): + return term_id + else: + raise ValueError(f'`term_id` must be a `str` or `hpotk.TermId` but was {type(term_id)}') + + @staticmethod + def _find_first_by_id( + term_id: hpotk.TermId, + items: typing.Iterable[I], + ) -> typing.Optional[I]: + for m in items: + if m.identifier == term_id: + return m + + return None + def __str__(self) -> str: return (f"Patient(" f"labels:{self._labels}, " @@ -319,6 +357,16 @@ def all_phenotypes(self) -> typing.Set[Phenotype]: Get a set of all phenotypes (observed or excluded) in the cohort members. 
""" return set(itertools.chain(phenotype for patient in self._members for phenotype in patient.phenotypes)) + + def count_distinct_hpo_terms(self) -> int: + """ + Get count of distinct HPO terms (either in present or excluded state) seen in the cohort members. + """ + terms = set( + itertools.chain(phenotype.identifier for patient in self._members for phenotype in patient.phenotypes) + ) + return len(terms) + def all_measurements(self) -> typing.Set[Measurement]: """ @@ -332,6 +380,14 @@ def all_diseases(self) -> typing.Set[Disease]: """ return set(itertools.chain(disease for patient in self._members for disease in patient.diseases)) + def count_with_disease_onset(self) -> int: + """ + Get the count of individuals with recorded disease onset. + """ + return self._count_individuals_with_condition( + lambda i: any(d.onset is not None for d in i.diseases), + ) + def all_variants(self) -> typing.Set[Variant]: """ Get a set of all variants observed in the cohort members. @@ -483,6 +539,65 @@ def get_variant_by_key(self, variant_key) -> Variant: else: raise ValueError(f"Variant key {variant_key} not found in cohort.") + def count_males(self) -> int: + """ + Get the number of males in the cohort. + """ + return self._count_individuals_with_sex(Sex.MALE) + + def count_females(self) -> int: + """ + Get the number of females in the cohort. + """ + return self._count_individuals_with_sex(Sex.FEMALE) + + def count_unknown_sex(self) -> int: + """ + Get the number of individuals with unknown sex in the cohort. + """ + return self._count_individuals_with_sex(Sex.UNKNOWN_SEX) + + def _count_individuals_with_sex(self, sex: Sex) -> int: + return self._count_individuals_with_condition(lambda i: i.sex == sex) + + def count_alive(self) -> int: + """ + Get the number of individuals reported to be alive at the time of last encounter. 
+ """ + return self._count_individuals_with_condition( + lambda i: i.vital_status is not None and i.vital_status.is_alive + ) + + def count_deceased(self) -> int: + """ + Get the number of individuals reported to be deceased. + """ + return self._count_individuals_with_condition( + lambda i: i.vital_status is not None and i.vital_status.is_deceased + ) + + def count_unknown_vital_status(self) -> int: + """ + Get the number of individuals with unknown or no reported vital status. + """ + return self._count_individuals_with_condition( + lambda i: i.vital_status is None or i.vital_status.is_unknown + ) + + def count_with_age_of_last_encounter(self) -> int: + """ + Get the number of individuals with a known age of last encounter. + """ + return self._count_individuals_with_condition( + lambda i: i.age is not None + ) + + def _count_individuals_with_condition( + self, + predicate: typing.Callable[[Patient], bool], + ) -> int: + return sum(predicate(individual) for individual in self._members) + def __eq__(self, other): return isinstance(other, Cohort) and self._members == other._members diff --git a/src/gpsea/model/_protein.py b/src/gpsea/model/_protein.py index 2fe55684..6db36e09 100644 --- a/src/gpsea/model/_protein.py +++ b/src/gpsea/model/_protein.py @@ -118,21 +118,28 @@ class FeatureType(enum.Enum): A region of interest that cannot be described in other subsections. """ + ZINC_FINGER = enum.auto() + """ + A zinc finger is a small, functional, independently folded domain that coordinates one or more zinc ions to stabilize its structure through cysteine and/or histidine residues.
+ """ + @staticmethod def from_string(category: str) -> "FeatureType": - cat_lover = category.lower() - if cat_lover == "repeat": + cat_lower = category.lower() + if cat_lower == "repeat": return FeatureType.REGION - elif cat_lover == "motif": + elif cat_lower == "motif": return FeatureType.MOTIF - elif cat_lover == "domain": + elif cat_lower == "domain": return FeatureType.DOMAIN - elif cat_lover == "region": + elif cat_lower == "region": return FeatureType.REGION - elif cat_lover == "coiled coil": + elif cat_lower == "coiled coil": return FeatureType.REGION - elif cat_lover == "compositional bias": + elif cat_lower == "compositional bias": return FeatureType.COMPOSITIONAL_BIAS + elif cat_lower == "zinc finger": + return FeatureType.ZINC_FINGER else: raise ValueError(f'Unrecognized protein feature type: "{category}"') @@ -361,19 +368,16 @@ def from_uniprot_json( regions = list() for feature in data["features"]: - try: - region_name = feature["description"] - locus = feature["location"] - region_start = int(locus["start"]["value"]) - 1 # convert to 0-based coordinates - region_end = int(locus["end"]["value"]) - feature_type = FeatureType.from_string(feature["type"]) - finfo = FeatureInfo( - name=region_name, region=Region(start=region_start, end=region_end) - ) - pfeature = ProteinFeature.create(info=finfo, feature_type=feature_type) - regions.append(pfeature) - except Exception as feature_exception: - print(f"Could not parse feature: {str(feature_exception)} (skipping)") + region_name = feature["description"] + locus = feature["location"] + region_start = int(locus["start"]["value"]) - 1 # convert to 0-based coordinates + region_end = int(locus["end"]["value"]) + feature_type = FeatureType.from_string(feature["type"]) + finfo = FeatureInfo( + name=region_name, region=Region(start=region_start, end=region_end) + ) + pfeature = ProteinFeature.create(info=finfo, feature_type=feature_type) + regions.append(pfeature) return ProteinMetadata( protein_id=protein_id, diff 
--git a/src/gpsea/model/_temporal.py b/src/gpsea/model/_temporal.py index 2aede84e..32329eb1 100644 --- a/src/gpsea/model/_temporal.py +++ b/src/gpsea/model/_temporal.py @@ -92,6 +92,23 @@ def gestational( total = weeks * Age.DAYS_IN_WEEK + days return Age(days=float(total), timeline=Timeline.GESTATIONAL) + @staticmethod + def last_menstrual_period() -> "Age": + """ + Age of an individual at last menstrual period. + """ + return LAST_MENSTRUAL_PERIOD + + @staticmethod + def gestational_days(days: typing.Union[int, float]) -> "Age": + """ + Create a gestational age corresponding to the number of `days` + since the last menstrual period. + """ + if isinstance(days, int): + days = float(days) + return Age(days=days, timeline=Timeline.GESTATIONAL) + @staticmethod def birth() -> "Age": """ @@ -100,8 +117,14 @@ def birth() -> "Age": return BIRTH @staticmethod - def postnatal_days(days: int) -> "Age": - return Age(days=float(days), timeline=Timeline.POSTNATAL) + def postnatal_days(days: typing.Union[int, float]) -> "Age": + """ + Create a postnatal age corresponding to the number of `days` + since birth. 
+ """ + if isinstance(days, int): + days = float(days) + return Age(days=days, timeline=Timeline.POSTNATAL) @staticmethod def postnatal_years( @@ -274,3 +297,4 @@ def __str__(self) -> str: BIRTH = Age(days=0.0, timeline=Timeline.POSTNATAL) +LAST_MENSTRUAL_PERIOD = Age(days=0.0, timeline=Timeline.GESTATIONAL) diff --git a/src/gpsea/preprocessing/__init__.py b/src/gpsea/preprocessing/__init__.py index 3c85d3e6..7cfb2797 100644 --- a/src/gpsea/preprocessing/__init__.py +++ b/src/gpsea/preprocessing/__init__.py @@ -6,7 +6,7 @@ from ._config import configure_default_protein_metadata_service, configure_protein_metadata_service from ._generic import DefaultImpreciseSvFunctionalAnnotator from ._patient import PatientCreator, CohortCreator -from ._phenopacket import PhenopacketVariantCoordinateFinder, PhenopacketPatientCreator +from ._phenopacket import PhenopacketVariantCoordinateFinder, PhenopacketPatientCreator, PhenopacketOntologyTermOnsetParser from ._protein import ProteinAnnotationCache, ProtCachingMetadataService from ._uniprot import UniprotProteinMetadataService from ._variant import VariantAnnotationCache, VarCachingFunctionalAnnotator @@ -18,7 +18,7 @@ 'configure_default_protein_metadata_service', 'configure_protein_metadata_service', 'VariantCoordinateFinder', 'FunctionalAnnotator', 'ImpreciseSvFunctionalAnnotator', 'ProteinMetadataService', 'PatientCreator', 'CohortCreator', - 'PhenopacketVariantCoordinateFinder', 'PhenopacketPatientCreator', + 'PhenopacketVariantCoordinateFinder', 'PhenopacketPatientCreator', 'PhenopacketOntologyTermOnsetParser', 'load_phenopacket_folder', 'load_phenopacket_files', 'load_phenopackets', 'PreprocessingValidationResult', 'TranscriptCoordinateService', 'GeneCoordinateService', diff --git a/src/gpsea/preprocessing/_config.py b/src/gpsea/preprocessing/_config.py index b69ac7e9..e8b99d5c 100644 --- a/src/gpsea/preprocessing/_config.py +++ b/src/gpsea/preprocessing/_config.py @@ -24,7 +24,7 @@ from ._api import FunctionalAnnotator, 
PreprocessingValidationResult from ._generic import DefaultImpreciseSvFunctionalAnnotator from ._patient import CohortCreator -from ._phenopacket import PhenopacketPatientCreator +from ._phenopacket import PhenopacketPatientCreator, PhenopacketOntologyTermOnsetParser from ._protein import ( ProteinMetadataService, @@ -44,6 +44,7 @@ def configure_caching_cohort_creator( genome_build: str = "GRCh38.p13", validation_runner: typing.Optional[ValidationRunner] = None, cache_dir: typing.Optional[str] = None, + include_ontology_class_onsets: bool = True, variant_fallback: str = "VEP", timeout: float = 30.0, ) -> CohortCreator[Phenopacket]: @@ -58,6 +59,8 @@ def configure_caching_cohort_creator( :param cache_dir: path to the folder where we will cache the results fetched from the remote APIs or `None` if the cache location should be determined as described in :func:`~gpsea.config.get_cache_dir_path`. In any case, the directory will be created if it does not exist (including non-existing parents). + :param include_ontology_class_onsets: `True` if onsets in the ontology class format + (e.g. `HP:0003621` for Juvenile onset) should be included (default `True`). :param variant_fallback: the fallback variant annotator to use if we cannot find the annotation locally. Choose from ``{'VEP'}`` (just one fallback implementation is available at the moment). 
:param timeout: timeout in seconds for the REST APIs @@ -73,6 +76,7 @@ def configure_caching_cohort_creator( build, cache_dir, timeout ) hgvs_annotator = VVHgvsVariantCoordinateFinder(build) + term_onset_parser = PhenopacketOntologyTermOnsetParser.default_parser() if include_ontology_class_onsets else None pc = PhenopacketPatientCreator( hpo=hpo, validator=validator, @@ -80,6 +84,7 @@ def configure_caching_cohort_creator( functional_annotator=functional_annotator, imprecise_sv_functional_annotator=imprecise_sv_functional_annotator, hgvs_coordinate_finder=hgvs_annotator, + term_onset_parser=term_onset_parser, ) return CohortCreator(pc) @@ -89,6 +94,7 @@ def configure_cohort_creator( hpo: hpotk.MinimalOntology, genome_build: str = "GRCh38.p13", validation_runner: typing.Optional[ValidationRunner] = None, + include_ontology_class_onsets: bool = True, variant_fallback: str = "VEP", timeout: float = 30.0, ) -> CohortCreator[Phenopacket]: @@ -102,6 +108,8 @@ def configure_cohort_creator( :param validation_runner: an instance of the validation runner. if the data should be cached in `.cache` folder in the current working directory. In any case, the directory will be created if it does not exist (including non-existing parents). + :param include_ontology_class_onsets: `True` if onsets in the ontology class format + (e.g. `HP:0003621` for Juvenile onset) should be included (default `True`). :param variant_fallback: the fallback variant annotator to use if we cannot find the annotation locally. Choose from ``{'VEP'}`` (just one fallback implementation is available at the moment). 
:param timeout: timeout in seconds for the VEP API @@ -116,6 +124,7 @@ def configure_cohort_creator( timeout=timeout, ) hgvs_annotator = VVHgvsVariantCoordinateFinder(build) + term_onset_parser = PhenopacketOntologyTermOnsetParser.default_parser() if include_ontology_class_onsets else None pc = PhenopacketPatientCreator( hpo=hpo, validator=validator, @@ -123,6 +132,7 @@ def configure_cohort_creator( functional_annotator=functional_annotator, imprecise_sv_functional_annotator=imprecise_sv_functional_annotator, hgvs_coordinate_finder=hgvs_annotator, + term_onset_parser=term_onset_parser, ) return CohortCreator(pc) diff --git a/src/gpsea/preprocessing/_phenopacket.py b/src/gpsea/preprocessing/_phenopacket.py index 3c7486c6..c7d90fb8 100644 --- a/src/gpsea/preprocessing/_phenopacket.py +++ b/src/gpsea/preprocessing/_phenopacket.py @@ -10,7 +10,7 @@ import phenopackets.schema.v2.core.individual_pb2 as ppi from phenopackets.schema.v2.phenopackets_pb2 import Phenopacket -from phenopackets.schema.v2.core.base_pb2 import TimeElement as PPTimeElement +from phenopackets.schema.v2.core.base_pb2 import TimeElement as PPTimeElement, OntologyClass as PPOntologyClass from phenopackets.schema.v2.core.phenotypic_feature_pb2 import PhenotypicFeature as PPPhenotypicFeature from phenopackets.schema.v2.core.disease_pb2 import Disease as PPDisease from phenopackets.schema.v2.core.measurement_pb2 import Measurement as PPMeasurement @@ -226,6 +226,99 @@ def _looks_like_large_sv( return structural_type is not None and gene_context is not None +class PhenopacketOntologyTermOnsetParser: + """ + Parser for mapping an onset formatted as an ontology class to the corresponding :class:`~gpsea.model.Age`. + + Each HPO onset includes start and end bounds (e.g. 29th day to 16th year for Pediatric onset) of the onset range + and the onset is mapped into the midpoint of the range. 
+ + Use `default_parser` to create the parser for parsing current HPO + or provide the curie -> :class:`~gpsea.model.Age` mapping via `__init__`. + """ + + @staticmethod + def default_parser() -> "PhenopacketOntologyTermOnsetParser": + # These ranges are horribly general. + # Assuming 40 weeks as birth date and 80 years as age of death. + weeks_at_birth = 40 + age_at_death = 80 + term_id_to_range={ + 'HP:0030674': (Age.last_menstrual_period(), Age.gestational(weeks=weeks_at_birth)), # Antenatal onset + 'HP:0011460': (Age.last_menstrual_period(), Age.gestational(weeks=11)), # Embryonal onset + 'HP:0011461': (Age.gestational(weeks=11), Age.gestational(weeks=weeks_at_birth)), # Fetal onset + 'HP:0034199': (Age.gestational(weeks=11), Age.gestational(weeks=14)), # Late first trimester onset + 'HP:0034198': (Age.gestational(weeks=14), Age.gestational(weeks=28)), # Second trimester onset + 'HP:0034197': (Age.gestational(weeks=28), Age.gestational(weeks=weeks_at_birth)), # Third trimester onset + + 'HP:0003577': (Age.birth(), Age.birth()), # Congenital onset + 'HP:0003623': (Age.birth(), Age.postnatal_days(29)), # Neonatal onset + + 'HP:0410280': (Age.postnatal_days(29), Age.postnatal_years(16)), # Pediatric onset + 'HP:0003593': (Age.postnatal_days(29), Age.postnatal_years(1)), # Infantile onset + 'HP:0011463': (Age.postnatal_years(1), Age.postnatal_years(5)), # Childhood onset + 'HP:0003621': (Age.postnatal_years(5), Age.postnatal_years(16)), # Juvenile onset + + 'HP:0003581': (Age.postnatal_years(16), Age.postnatal_years(age_at_death)), # Adult onset + 'HP:0011462': (Age.postnatal_years(16), Age.postnatal_years(40)), # Young adult onset + 'HP:0025708': (Age.postnatal_years(16), Age.postnatal_years(19)), # Early young adult onset + 'HP:0025709': (Age.postnatal_years(19), Age.postnatal_years(25)), # Intermediate young adult onset + 'HP:0025710': (Age.postnatal_years(25), Age.postnatal_years(40)), # Late young adult onset + + 'HP:0003596': (Age.postnatal_years(40), 
Age.postnatal_years(60)), # Middle age onset + 'HP:0003584': (Age.postnatal_years(60), Age.postnatal_years(age_at_death)), # Late onset + } + return PhenopacketOntologyTermOnsetParser( + term_id_to_age={ + curie: PhenopacketOntologyTermOnsetParser._median_age(start, end) + for curie, (start, end) in term_id_to_range.items() + }, + ) + + @staticmethod + def _median_age(left: Age, right: Age) -> Age: + # Assuming right is at or after left + days = left.days + ((right.days - left.days) / 2) + if left.is_gestational and right.is_gestational: + return Age.gestational_days(days=days) + elif left.is_postnatal and right.is_postnatal: + return Age.postnatal_days(days=days) + else: + raise ValueError(f'`left` and `right` must be on the same timeline, but left={left.timeline}, right={right.timeline}`') + + def __init__( + self, + term_id_to_age: typing.Mapping[str, Age], + ): + """ + Create the onset parser from term to age mapping. + + :param term_id_to_age: a mapping from HPO curie (e.g. `HP:0410280`) to the corresponding :class:`~gpsea.model.Age`. + """ + self._term_id_to_age = dict(term_id_to_age) + + def process( + self, + ontology_class: PPOntologyClass, + notepad: Notepad, + ) -> typing.Optional[Age]: + curie = ontology_class.id + if curie.startswith('HP:'): + age = self._term_id_to_age.get(curie, None) + if age is None: + notepad.add_warning( + f'Unknown onset term {curie}', + solution='Use a term from HPO\'s Onset [HP:0003674] module', + ) + return age + else: + notepad.add_warning( + f'Unsupported ontology class {curie}', + solution='Use a term from HPO\'s Onset [HP:0003674] module', + ) + return None + + class PhenopacketPatientCreator(PatientCreator[Phenopacket]): """ `PhenopacketPatientCreator` transforms `Phenopacket` into :class:`~gpsea.model.Patient`. 
@@ -247,6 +340,7 @@ def __init__( functional_annotator: FunctionalAnnotator, imprecise_sv_functional_annotator: ImpreciseSvFunctionalAnnotator, hgvs_coordinate_finder: VariantCoordinateFinder[str], + term_onset_parser: typing.Optional[PhenopacketOntologyTermOnsetParser] = None, ): self._logger = logging.getLogger(__name__) # Violates DI, but it is specific to this class, so I'll leave it "as is". @@ -255,8 +349,12 @@ def __init__( ) self._gt_parser = PhenopacketGenotypeParser() self._validator = validate_instance(validator, ValidationRunner, 'validator') + if term_onset_parser is not None: + assert isinstance(term_onset_parser, PhenopacketOntologyTermOnsetParser) + self._term_onset_parser = term_onset_parser self._phenotype_creator = PhenopacketPhenotypicFeatureCreator( hpo=validate_instance(hpo, hpotk.MinimalOntology, 'hpo'), + term_onset_parser=term_onset_parser, ) self._functional_annotator = validate_instance( functional_annotator, FunctionalAnnotator, "functional_annotator" @@ -405,8 +503,9 @@ def _add_diseases( term_id = hpotk.TermId.from_curie(dis.term.id) if dis.HasField("onset"): - onset = PhenopacketPatientCreator._parse_time_element( + onset = parse_onset_element( time_element=dis.onset, + term_onset_parser=self._term_onset_parser, notepad=ith_disease_subsection, ) else: @@ -423,22 +522,6 @@ def _add_diseases( ) return final_diseases - - @staticmethod - def _parse_time_element( - time_element: PPTimeElement, - notepad: Notepad, - ) -> typing.Optional[Age]: - case = time_element.WhichOneof("element") - if case == "gestational_age": - age = time_element.gestational_age - return Age.gestational(weeks=age.weeks, days=age.days) - elif case == "age": - age = time_element.age - return Age.from_iso8601_period(value=age.iso8601duration) - else: - notepad.add_warning(f"`time_element` is in currently unsupported format `{case}`") - return None def _add_measurements( self, @@ -508,7 +591,7 @@ def _extract_age( ) -> typing.Optional[Age]: if 
individual.HasField("time_at_last_encounter"): tale = individual.time_at_last_encounter - return PhenopacketPatientCreator._parse_time_element(tale, notepad) + return parse_age_element(tale, notepad) return None @staticmethod @@ -530,7 +613,7 @@ def _extract_vital_status( status = Status.UNKNOWN if vital_status.HasField("time_of_death"): - age_of_death = PhenopacketPatientCreator._parse_time_element( + age_of_death = parse_age_element( time_element=vital_status.time_of_death, notepad=notepad, ) @@ -746,8 +829,10 @@ class PhenopacketPhenotypicFeatureCreator: def __init__( self, hpo: hpotk.MinimalOntology, + term_onset_parser: typing.Optional[PhenopacketOntologyTermOnsetParser], ): self._hpo = hpo + self._term_onset_parser = term_onset_parser def process( self, @@ -796,8 +881,9 @@ def process( ) if pf.HasField("onset"): - onset = PhenopacketPatientCreator._parse_time_element( + onset = parse_onset_element( time_element=pf.onset, + term_onset_parser=self._term_onset_parser, notepad=notepad, ) else: @@ -808,3 +894,44 @@ def process( is_observed=not pf.excluded, onset=onset, ) + +def parse_onset_element( + time_element: PPTimeElement, + term_onset_parser: typing.Optional[PhenopacketOntologyTermOnsetParser], + notepad: Notepad, +) -> typing.Optional[Age]: + """ + We allow to use `GestationalAge`, `Age` or `OntologyClass` as onset. + """ + case = time_element.WhichOneof("element") + if case == "ontology_class": + if term_onset_parser is None: + return None + else: + return term_onset_parser.process( + ontology_class=time_element.ontology_class, + notepad=notepad, + ) + else: + return parse_age_element( + time_element=time_element, + notepad=notepad, + ) + +def parse_age_element( + time_element: PPTimeElement, + notepad: Notepad, +) -> typing.Optional[Age]: + """ + We allow to use `GestationalAge` or `Age` as age. 
+ """ + case = time_element.WhichOneof("element") + if case == "gestational_age": + age = time_element.gestational_age + return Age.gestational(weeks=age.weeks, days=age.days) + elif case == "age": + age = time_element.age + return Age.from_iso8601_period(value=age.iso8601duration) + else: + notepad.add_warning(f"`time_element` is in currently unsupported format `{case}`") + return None diff --git a/src/gpsea/preprocessing/_test__phenopacket.py b/src/gpsea/preprocessing/_test__phenopacket.py new file mode 100644 index 00000000..54418b9c --- /dev/null +++ b/src/gpsea/preprocessing/_test__phenopacket.py @@ -0,0 +1,44 @@ +import pytest + +from stairval.notepad import create_notepad +from phenopackets.schema.v2.core.base_pb2 import OntologyClass + +from ._phenopacket import PhenopacketOntologyTermOnsetParser + +class TestPhenopacketOntologyTermOnsetParser: + + @pytest.fixture(scope='class') + def parser(self) -> PhenopacketOntologyTermOnsetParser: + return PhenopacketOntologyTermOnsetParser.default_parser() + + @pytest.mark.parametrize( + 'curie, days, is_postnatal', + [ + ('HP:0030674', 140., False), # Antenatal onset + ('HP:0011460', 38.5, False), # Embryonal onset + + ('HP:0003577', 0., True), # Congenital onset + ('HP:0003623', 14.5, True), # Neonatal onset + ('HP:0410280', 2936.5, True), # Pediatric onset + ('HP:0011462', 10227.0, True), # Young adult onset + + ('HP:0003584', 25567.5, True), # Late onset + ] + ) + def test_process( + self, + curie: str, + days: float, + is_postnatal: bool, + parser: PhenopacketOntologyTermOnsetParser, + ): + notepad = create_notepad(label='Top') + oc = OntologyClass(id=curie, label='Whatever') + + age = parser.process(oc, notepad) + + assert age is not None + assert age.is_postnatal == is_postnatal + assert age.days == pytest.approx(days) + + assert not notepad.has_errors_or_warnings(include_subsections=True) diff --git a/src/gpsea/preprocessing/_uniprot.py b/src/gpsea/preprocessing/_uniprot.py index ccdf5ed7..03ee6663 100644 --- 
a/src/gpsea/preprocessing/_uniprot.py +++ b/src/gpsea/preprocessing/_uniprot.py @@ -110,7 +110,9 @@ def annotate(self, protein_id: str) -> ProteinMetadata: Args: protein_id (str): A protein ID Returns: - Sequence[ProteinMetadata]: A sequence of ProteinMetadata objects, or an empty sequence if no data was found. + ProteinMetadata: A :class:`~gpsea.model.ProteinMetadata` corresponding to the input `protein_id`. + Raises: + ValueError: in case of issues with `protein_id`, I/O issues, or parsing the REST response. """ if not isinstance(protein_id, str): raise ValueError(f'Protein ID must be a str but it was {type(protein_id)}') diff --git a/src/gpsea/view/_base.py b/src/gpsea/view/_base.py new file mode 100644 index 00000000..f26e34a3 --- /dev/null +++ b/src/gpsea/view/_base.py @@ -0,0 +1,34 @@ +import abc + +from jinja2 import Environment, PackageLoader + + +def pluralize(count: int, stem: str) -> str: + if isinstance(count, int): + if count == 1: + return f"{count} {stem}" + else: + if stem.endswith("s"): + return f"{count} {stem}es" + else: + return f"{count} {stem}s" + else: + raise ValueError(f"{count} must be an `int`!") + + +def was_were(count: int) -> str: + if isinstance(count, int): + if count == 1: + return "1 was" + else: + return f"{count} were" + else: + raise ValueError(f"{count} must be an `int`!") + + +class BaseViewer(metaclass=abc.ABCMeta): + + def __init__(self): + self._environment = Environment(loader=PackageLoader("gpsea.view", "templates")) + self._environment.filters["pluralize"] = pluralize + self._environment.filters["was_were"] = was_were diff --git a/src/gpsea/view/_cohort.py b/src/gpsea/view/_cohort.py index 2cd6d404..557e94ad 100644 --- a/src/gpsea/view/_cohort.py +++ b/src/gpsea/view/_cohort.py @@ -2,26 +2,26 @@ from collections import namedtuple, defaultdict -from hpotk import MinimalOntology -from jinja2 import Environment, PackageLoader +import hpotk from gpsea.model import Cohort, Sex +from ._base import BaseViewer from ._report 
import GpseaReport, HtmlGpseaReport from ._formatter import VariantFormatter ToDisplay = namedtuple('ToDisplay', ['hgvs_cdna', 'hgvsp', 'variant_effects']) -class CohortViewer: +class CohortViewer(BaseViewer): """ `CohortViewer` summarizes the most salient :class:`~gpsea.model.Cohort` aspects into an HTML report. """ def __init__( - self, - hpo: MinimalOntology, - top_phenotype_count: int = 10, - top_variant_count: int = 10, + self, + hpo: hpotk.MinimalOntology, + top_phenotype_count: int = 10, + top_variant_count: int = 10, ): """ Args: @@ -29,11 +29,11 @@ def __init__( top_phenotype_count(int): Maximum number of HPO terms to display in the HTML table (default: 10) top_variant_count(int): Maximum number of variants to display in the HTML table (default: 10) """ + super().__init__() self._hpo = hpo self._top_phenotype_count = top_phenotype_count self._top_variant_count = top_variant_count - environment = Environment(loader=PackageLoader('gpsea.view', 'templates')) - self._cohort_template = environment.get_template("cohort.html") + self._cohort_template = self._environment.get_template("cohort.html") def process( self, @@ -62,35 +62,18 @@ def _prepare_context( ) -> typing.Mapping[str, typing.Any]: hpo_counts = list() - for hpo in cohort.list_present_phenotypes(top=self._top_phenotype_count): - hpo_id = hpo[0] - individual_count = hpo[1] - hpo_label = "n/a" - if hpo_id in self._hpo: - hpo_label = self._hpo.get_term_name(hpo_id) + for term_id, count in cohort.list_present_phenotypes(top=self._top_phenotype_count): + label = self._hpo.get_term_name(term_id) + if label is None: + label = 'N/A' + hpo_counts.append( { - "HPO": hpo_label, - "ID": hpo_id, - "Count": individual_count, + "label": label, + "term_id": term_id, + "count": count, } ) - # Show the same max number of measuresments as phenotypes - measurement_counts = list() - for msmt in cohort.list_measurements(top=self._top_phenotype_count): - msmt_id = msmt[0] - individual_count = msmt[1] - msmt_label = "n/a" - 
#if hpo_id in self._hpo: - # hpo_label = self._hpo.get_term_name(hpo_id) - msmt_label = "need a way to retrieve measuremnt label" - measurement_counts.append( - { - "Assay": msmt_label, - "ID": msmt_id, - "Count": individual_count, - }) - n_measurements = len(measurement_counts) variant_counts = list() variant_to_display_d = CohortViewer._get_variant_description(cohort, transcript_id) @@ -98,87 +81,68 @@ def _prepare_context( # get HGVS or human readable variant if variant_key in variant_to_display_d: display = variant_to_display_d[variant_key] - hgvs_cdna = display.hgvs_cdna - protein_name = display.hgvsp + hgvsc = display.hgvs_cdna + hgvsp = display.hgvsp effects = '' if display.variant_effects is None else ", ".join(display.variant_effects) else: - display = variant_key - hgvs_cdna = '' - protein_name = '' + hgvsc = '' + hgvsp = '' effects = '' variant_counts.append( { - "variant": variant_key, - "variant_name": hgvs_cdna, - "protein_name": protein_name, - "variant_effects": effects, - "Count": count, + "count": count, + "key": variant_key, + "hgvsc": hgvsc, + "hgvsp": hgvsp, + "effects": effects, } ) disease_counts = list() for disease_id, disease_count in cohort.list_all_diseases(): - disease_name = "Unknown" + label = "Unknown" for disease in cohort.all_diseases(): if disease.identifier.value == disease_id: - disease_name = disease.name + label = disease.name + break + disease_counts.append( { "disease_id": disease_id, - "disease_name": disease_name, + "label": label, "count": disease_count, } ) - n_diseases = len(disease_counts) - - var_effects_list = list() - var_effects_d = dict() - if transcript_id is not None: - has_transcript = True - data_by_tx = cohort.variant_effect_count_by_tx(tx_id=transcript_id) - # e.g., data structure - # -- {'effect}': 'FRAMESHIFT_VARIANT', 'count': 175}, - # -- {'effect}': 'STOP_GAINED', 'count': 67}, - for tx_id, counter in data_by_tx.items(): + variant_effects = list() + has_transcript = transcript_id is not None + if 
has_transcript: + var_effects_d = dict() + for tx_id, counter in cohort.variant_effect_count_by_tx(tx_id=transcript_id).items(): + # e.g., data structure + # -- {'effect': 'FRAMESHIFT_VARIANT', 'count': 175}, + # -- {'effect': 'STOP_GAINED', 'count': 67}, if tx_id == transcript_id: for effect, count in counter.items(): var_effects_d[effect] = count + break total = sum(var_effects_d.values()) # Sort in descending order based on counts - sorted_counts_desc = dict(sorted(var_effects_d.items(), key=lambda item: item[1], reverse=True)) - for effect, count in sorted_counts_desc.items(): - percent = f"{round(count / total * 100)}%" - var_effects_list.append({"effect": effect, "count": count, "percent": percent}) + for effect, count in sorted(var_effects_d.items(), key=lambda item: item[1], reverse=True): + variant_effects.append( + { + "effect": effect, + "count": count, + "percent": round(count / total * 100), + } + ) else: - has_transcript = False transcript_id = "MANE transcript ID" - n_male = 0 - n_female = 0 - n_unknown_sex = 0 - n_deceased = 0 - n_alive = 0 - n_has_age_at_last_encounter = 0 n_has_disease_onset = 0 hpo_id_to_has_cnset_count_d = defaultdict(int) for pat in cohort.all_patients: - if pat.sex == Sex.MALE: - n_male += 1 - elif pat.sex == Sex.FEMALE: - n_female += 1 - else: - n_unknown_sex += 1 - patient_not_deceased = True - if pat.vital_status is not None: - if pat.vital_status.is_deceased: - n_deceased += 1 - patient_not_deceased = False - if patient_not_deceased: - n_alive += 1 - if pat.age is not None: - n_has_age_at_last_encounter += 1 diseases = pat.diseases for d in diseases: if d.onset is not None: @@ -191,47 +155,34 @@ def _prepare_context( terms_and_ancestors_with_onset_information.update(ancs) for hpo_id in terms_and_ancestors_with_onset_information: hpo_id_to_has_cnset_count_d[hpo_id] += 1 + # When we get here, we want to present the counts of HPO terms that have onset information - n_individuals = len(cohort.all_patients) - onset_threshold 
= int(0.2 * n_individuals) # only show terms with a decent amount of information - has_onset_information = list() - for hpo_id, count in hpo_id_to_has_cnset_count_d.items(): - if count < onset_threshold: - continue - label = self._hpo.get_term_name(hpo_id) - if label is not None: - display_label = f"{label} ({hpo_id})" - else: - display_label = hpo_id - has_onset_information.append({"HPO": display_label, "count": count}) - n_has_onset_info = len(has_onset_information) + # onset_threshold = int(0.2 * len(cohort)) # only show terms with a decent amount of information + # has_onset_information = list() + # for hpo_id, count in hpo_id_to_has_cnset_count_d.items(): + # if count < onset_threshold: + # continue + # label = self._hpo.get_term_name(hpo_id) + # if label is not None: + # display_label = f"{label} ({hpo_id})" + # else: + # display_label = hpo_id + # has_onset_information.append( + # {"HPO": display_label, "count": count}) + # n_has_onset_info = len(has_onset_information) # The following dictionary is used by the Jinja2 HTML template return { - "n_individuals": n_individuals, - "n_male": n_male, - "n_female": n_female, - "n_unknown_sex": n_unknown_sex, - "n_deceased": n_deceased, - "n_alive": n_alive, - "n_has_disease_onset": n_has_disease_onset, - "n_has_age_at_last_encounter": n_has_age_at_last_encounter, - "has_onset_information": has_onset_information, - "n_has_onset_info": n_has_onset_info, - "n_excluded": cohort.get_excluded_count(), - "total_hpo_count": len(cohort.all_phenotypes()), + "cohort": cohort, + "transcript_id": transcript_id, "top_hpo_count": self._top_phenotype_count, "hpo_counts": hpo_counts, - "total_measurement_count": n_measurements, - "measurement_counts": measurement_counts, - "unique_variant_count": len(cohort.all_variant_infos()), "top_var_count": self._top_variant_count, "var_counts": variant_counts, - "n_diseases": n_diseases, + # "n_diseases": n_diseases, "disease_counts": disease_counts, "has_transcript": has_transcript, - 
"var_effects_list": var_effects_list, - "transcript_id": transcript_id, + "variant_effects": variant_effects, } @staticmethod diff --git a/src/gpsea/view/templates/cohort.html b/src/gpsea/view/templates/cohort.html index fdd45d0a..d566d63f 100644 --- a/src/gpsea/view/templates/cohort.html +++ b/src/gpsea/view/templates/cohort.html @@ -1,9 +1,10 @@ + Cohort -