Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions hail_search/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ def family_entries_field(self) -> str:
}[self]

@property
def passes_inheritance_field(self) -> str:
def failed_family_sample_field(self) -> str:
return {
SampleType.WES: 'wes_passes_inheritance',
SampleType.WGS: 'wgs_passes_inheritance',
SampleType.WES: 'wes_failed_family_sample_guids',
SampleType.WGS: 'wgs_failed_family_sample_guids',
}[self]

@property
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.130-bea04d9c79b5
Created at 2024/10/02 14:46:35
Written with version 0.2.132-678e1f52b999
Created at 2024/10/28 16:21:30
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
47 changes: 27 additions & 20 deletions hail_search/queries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,7 @@ def _filter_single_entries_table(self, ht, project_families, inheritance_filter=
ht, sorted_family_sample_data = self._add_entry_sample_families(ht, project_families, is_merged_ht)
ht = self._filter_quality(ht, quality_filter, **kwargs)
ht, ch_ht = self._filter_inheritance(
ht, None, inheritance_filter, sorted_family_sample_data,
ht, None, inheritance_filter, sorted_family_sample_data, self._annotate_families_inheritance
)
ht = self._apply_entry_filters(ht)
ch_ht = self._apply_entry_filters(ch_ht)
Expand Down Expand Up @@ -573,7 +573,7 @@ def _get_sample_type(cls, family_index, ht_globals):
return ht_globals.sample_type

def _filter_inheritance(
self, ht, comp_het_ht, inheritance_filter, sorted_family_sample_data,
self, ht, comp_het_ht, inheritance_filter, sorted_family_sample_data, annotate_func,
annotation='family_entries', entries_ht_field='family_entries'
):
any_valid_entry = lambda x: self.GENOTYPE_QUERY_MAP[HAS_ALT](x.GT)
Expand All @@ -584,12 +584,12 @@ def _filter_inheritance(
any_valid_entry = lambda x: prev_any_valid_entry(x) & (x.affected_id == AFFECTED_ID)

ht = ht.annotate(**{
annotation: ht[entries_ht_field].map(
entries_ht_field: ht[entries_ht_field].map(
lambda entries: hl.or_missing(entries.any(any_valid_entry), entries)
)})

if self._has_comp_het_search:
comp_het_ht = self._annotate_families_inheritance(
comp_het_ht = annotate_func(
comp_het_ht if comp_het_ht is not None else ht, COMPOUND_HET, inheritance_filter,
sorted_family_sample_data, annotation, entries_ht_field
)
Expand All @@ -598,7 +598,7 @@ def _filter_inheritance(
# No sample-specific inheritance filtering needed
sorted_family_sample_data = []

ht = None if self._inheritance_mode == COMPOUND_HET else self._annotate_families_inheritance(
ht = None if self._inheritance_mode == COMPOUND_HET else annotate_func(
ht, self._inheritance_mode, inheritance_filter, sorted_family_sample_data,
annotation, entries_ht_field
)
Expand All @@ -609,6 +609,27 @@ def _annotate_families_inheritance(
self, ht, inheritance_mode, inheritance_filter, sorted_family_sample_data,
annotation, entries_ht_field,
):
entry_indices_by_gt = self._get_entry_indices_by_gt_map(
inheritance_filter, inheritance_mode, sorted_family_sample_data
)

for genotype, entry_indices in entry_indices_by_gt.items():
if not entry_indices:
continue
entry_indices = hl.dict(entry_indices)
ht = ht.annotate(**{
annotation: hl.enumerate(ht[entries_ht_field]).starmap(
lambda family_i, family_samples: hl.or_missing(
~entry_indices.contains(family_i) | entry_indices[family_i].all(
lambda sample_i: self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_i].GT)
), family_samples,
),
)
})

return ht

def _get_entry_indices_by_gt_map(self, inheritance_filter, inheritance_mode, sorted_family_sample_data):
individual_genotype_filter = (inheritance_filter or {}).get('genotype')

# Create a mapping of genotypes to check against a list of samples for a family
Expand All @@ -630,21 +651,7 @@ def _annotate_families_inheritance(
]
self.max_unaffected_samples = max(family_unaffected_counts) if family_unaffected_counts else 0

for genotype, entry_indices in entry_indices_by_gt.items():
if not entry_indices:
continue
entry_indices = hl.dict(entry_indices)
ht = ht.annotate(**{
annotation: hl.enumerate(ht[entries_ht_field]).starmap(
lambda family_i, family_samples: hl.or_missing(
~entry_indices.contains(family_i) | entry_indices[family_i].all(
lambda sample_i: self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_i].GT)
), family_samples,
),
)
})

return ht
return entry_indices_by_gt

def _get_family_passes_quality_filter(self, quality_filter, ht, **kwargs):
quality_filter = quality_filter or {}
Expand Down
97 changes: 76 additions & 21 deletions hail_search/queries/mito.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,34 +209,55 @@ def _filter_entries_ht_both_sample_types(
for sample_type, sorted_family_sample_data in sample_types:
ht, ch_ht = self._filter_inheritance(
ht, ch_ht, inheritance_filter, sorted_family_sample_data,
annotation=sample_type.passes_inheritance_field, entries_ht_field=sample_type.family_entries_field
annotate_func=self._annotate_failed_family_samples_inheritance,
annotation=sample_type.failed_family_sample_field, entries_ht_field=sample_type.family_entries_field,
)
for family_idx, samples in enumerate(sorted_family_sample_data):
family_guid = samples[0]['familyGuid']
family_guid_idx_map[family_guid][sample_type.value] = family_idx

family_idx_map = hl.dict(family_guid_idx_map)
ht = self._apply_multi_sample_type_entry_filters(ht, family_idx_map)
ch_ht = self._apply_multi_sample_type_entry_filters(ch_ht, family_idx_map)
family_guid_idx_map = hl.dict(family_guid_idx_map)
ht = self._apply_multi_sample_type_entry_filters(ht, family_guid_idx_map)
ch_ht = self._apply_multi_sample_type_entry_filters(ch_ht, family_guid_idx_map)
return ht, ch_ht

def _annotate_failed_family_samples_inheritance(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels incredibly similar to _annotate_families_inheritance. Rather than making a whole separate function for this, you could make a much more tightly scoped conditional helper to pass into _annotate_families_inheritance, perhaps just for the lambda function applied to the hl.enumerate(ht[entries_ht_field]).starmap(

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I implemented this, but because I don't want the single sample type families that come originally through the mito class code path to call the mito 'family_passes_inheritance_filter' function, I'm still passing a family_passes_inheritance_filter function to _filter_inheritance instead of using python inheritance here. Do you know if there's a cleaner way to do this?

self, ht, inheritance_mode, inheritance_filter, sorted_family_sample_data, annotation, entries_ht_field
):
entry_indices_by_gt = self._get_entry_indices_by_gt_map(
inheritance_filter, inheritance_mode, sorted_family_sample_data
)

if ht is None:
return ht

# Initialize empty array
ht = ht.annotate(**{annotation: ht[entries_ht_field].map(lambda x: hl.empty_array(hl.tstr))})

# Add failed genotype samples
for genotype, entry_indices in entry_indices_by_gt.items():
if not entry_indices:
continue

entry_indices = hl.dict(entry_indices)
ht = ht.annotate(**{annotation: hl.enumerate(ht[entries_ht_field]).starmap(
lambda family_idx, entries: hl.bind(
lambda failed_samples: ht[annotation][family_idx].extend(failed_samples),
entry_indices.get(family_idx).filter(
lambda sample_idx: ~self.GENOTYPE_QUERY_MAP[genotype](entries[sample_idx].GT)
).map(lambda sample_idx: entries[sample_idx]['sampleId'])
)
)})

return ht

def _apply_multi_sample_type_entry_filters(self, ht, family_idx_map):
if ht is None:
return ht

# Keep family from both sample types if either passes quality AND inheritance
for sample_type in SampleType:
ht = ht.annotate(**{
sample_type.family_entries_field: hl.enumerate(ht[sample_type.family_entries_field]).starmap(
lambda i, family_samples: hl.or_missing(
hl.bind(
lambda other_sample_type_idx: (
self._family_has_valid_sample_type_entries(ht, sample_type, i) |
self._family_has_valid_sample_type_entries(ht, sample_type.other_sample_type, other_sample_type_idx)
),
family_idx_map.get(hl.coalesce(family_samples)[0]['familyGuid']).get(sample_type.other_sample_type.value),
), family_samples)
)})
ht = self._apply_quality_entry_filters(ht, sample_type, family_idx_map)
ht = self._apply_inheritance_entry_filters(ht, sample_type, family_idx_map)

# Merge family entries and filters from both sample types
ht = ht.transmute(
Expand All @@ -252,15 +273,49 @@ def _apply_multi_sample_type_entry_filters(self, ht, family_idx_map):
# Filter out families with no valid entries in either sample type
return ht.filter(ht.family_entries.any(hl.is_defined))

def _apply_quality_entry_filters(self, ht, sample_type, family_idx_map):
return ht.annotate(**{
sample_type.family_entries_field: hl.enumerate(ht[sample_type.family_entries_field]).starmap(
lambda i, family_samples: hl.or_missing(
hl.bind(lambda other_sample_type_idx: (
self._family_has_valid_quality(ht, sample_type, i) |
self._family_has_valid_quality(ht, sample_type.other_sample_type, other_sample_type_idx)
), family_idx_map.get(hl.coalesce(family_samples)[0]['familyGuid']).get(sample_type.other_sample_type.value),
), family_samples)
)})

@staticmethod
def _family_has_valid_sample_type_entries(ht, sample_type, sample_type_family_idx):
# Note: This logic does not sufficiently handle case 2 here https://docs.google.com/presentation/d/1hqDV8ulhviUcR5C4PtNUqkCLXKDsc6pccgFVlFmWUAU/edit?usp=sharing
# and will need to be changed to support it - https://github.com/broadinstitute/seqr/issues/4403
def _family_has_valid_quality(ht, sample_type, sample_type_family_idx):
return (
hl.is_defined(sample_type_family_idx) &
hl.is_defined(ht[sample_type.passes_quality_field][sample_type_family_idx]) &
hl.is_defined(ht[sample_type.passes_inheritance_field][sample_type_family_idx])
hl.is_defined(ht[sample_type.passes_quality_field][sample_type_family_idx])
)

@staticmethod
def _apply_inheritance_entry_filters(ht, sample_type, family_idx_map):
ht = ht.annotate(
**{sample_type.family_entries_field: hl.enumerate(ht[sample_type.family_entries_field]).starmap(
lambda family_idx, family_samples: hl.or_missing(
hl.bind(lambda other_sample_type_family_idx: (
hl.bind(
lambda other_sample_type_pass_samples, sample_type_pass_samples: (
ht[sample_type.failed_family_sample_field][family_idx].all(
other_sample_type_pass_samples.contains
) & ht[sample_type.other_sample_type.failed_family_sample_field][other_sample_type_family_idx].all(
sample_type_pass_samples.contains
)),
Copy link
Contributor Author

@jklugherz jklugherz Oct 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these binds are a little wild, but the logic is that for all families, keep family if all of its samples pass this check:
if sample failed in one sample type, it must pass in the other - and if a sample for that family failed in the other sample type, it must pass in the current sample type

ht[sample_type.other_sample_type.family_entries_field][other_sample_type_family_idx].filter(
lambda s: ~ht[sample_type.other_sample_type.failed_family_sample_field][other_sample_type_family_idx].contains(s['sampleId'])
).map(lambda s: s['sampleId']),
ht[sample_type.other_sample_type.family_entries_field][family_idx].filter(
lambda s: ~ht[sample_type.failed_family_sample_field][family_idx].contains(s['sampleId'])
).map(lambda s: s['sampleId']),
)
), family_idx_map.get(hl.coalesce(family_samples)[0]['familyGuid']).get(sample_type.other_sample_type.value)
), family_samples)
)}
)
return ht

def _get_sample_genotype(self, samples, r=None, include_genotype_overrides=False, select_fields=None, **kwargs):
if not self._has_both_sample_types:
Expand Down
5 changes: 3 additions & 2 deletions hail_search/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,10 +388,11 @@ async def test_both_sample_types_search(self):
[VARIANT2_BOTH_SAMPLE_TYPES], sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA_MISSING_PARENTAL_WGS,
inheritance_mode=inheritance_mode, **COMP_HET_ALL_PASS_FILTERS, intervals=[variant2_interval]
)
# Genome passes quality and inheritance exome fails inheritance (parental data shows variant is inherited).
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still think a comment here explaining whats being tested is helpful. Maybe something like "Variant 2 fails inheritance when parental data is present"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated!

# Genome passes quality and inheritance but exome fails inheritance (parental data shows variant is inherited).
# Variant is excluded from search results.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this is testing the case where its not returned when theres no valid WGS parental data, but can we also test that it IS returned when we include the parental data for WGS and its overridden?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this case is covered by an above test -

    # Variant1 in family_2 is de novo in exome but maternally inherited in genome.
    # Genome passes quality and inheritance, show genotypes for both sample types.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The search criteria changes between these 2 test cases. We really need back-to-back tests that the exact same search returns different results when the paternal data is present/absent. Instead of running a dominant and recessive search each filtered to variant 1 and variant 2, it would probably be better to not use an interval filter at all and test that dominant and recessive search return the expected combo of variants with and without paternal data

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did this^!

inheritance_mode = 'de_novo'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this line is unneeded

await self._assert_expected_search(
[VARIANT2_BOTH_SAMPLE_TYPES], sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA_MISSING_PARENTAL_WGS,
[], sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA_MISSING_PARENTAL_WGS,
inheritance_mode=inheritance_mode, intervals=[variant2_interval]
)

Expand Down
2 changes: 1 addition & 1 deletion hail_search/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@
VARIANT1_BOTH_SAMPLE_TYPES['genotypes'] = {
'I000004_hg00731': [
genotypes['I000004_hg00731'],
{**genotypes['I000004_hg00731'], 'sampleType': 'WGS'}
{**genotypes['I000004_hg00731'], 'numAlt': 2, 'sampleType': 'WGS'}
],
'I000005_hg00732': [
genotypes['I000005_hg00732'],
Expand Down
Loading