Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

handle wes/wgs inheritance edge case #4440

Open
wants to merge 16 commits into
base: dev
Choose a base branch
from
6 changes: 3 additions & 3 deletions hail_search/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ def family_entries_field(self) -> str:
}[self]

@property
def passes_inheritance_field(self) -> str:
def failed_family_sample_field(self) -> str:
return {
SampleType.WES: 'wes_passes_inheritance',
SampleType.WGS: 'wgs_passes_inheritance',
SampleType.WES: 'wes_failed_family_sample_guids',
SampleType.WGS: 'wgs_failed_family_sample_guids',
}[self]

@property
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.130-bea04d9c79b5
Created at 2024/10/02 14:46:35
Written with version 0.2.132-678e1f52b999
Created at 2024/10/28 16:21:30
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
56 changes: 34 additions & 22 deletions hail_search/queries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,9 +472,7 @@ def _apply_entry_filters(ht):
def _filter_single_entries_table(self, ht, project_families, inheritance_filter=None, quality_filter=None, is_merged_ht=False, **kwargs):
ht, sorted_family_sample_data = self._add_entry_sample_families(ht, project_families, is_merged_ht)
ht = self._filter_quality(ht, quality_filter, **kwargs)
ht, ch_ht = self._filter_inheritance(
ht, None, inheritance_filter, sorted_family_sample_data,
)
ht, ch_ht = self._filter_inheritance(ht, None, inheritance_filter, sorted_family_sample_data)
ht = self._apply_entry_filters(ht)
ch_ht = self._apply_entry_filters(ch_ht)

Expand Down Expand Up @@ -574,7 +572,7 @@ def _get_sample_type(cls, family_index, ht_globals):

def _filter_inheritance(
self, ht, comp_het_ht, inheritance_filter, sorted_family_sample_data,
annotation='family_entries', entries_ht_field='family_entries'
annotation='family_entries', entries_ht_field='family_entries', **kwargs
):
any_valid_entry = lambda x: self.GENOTYPE_QUERY_MAP[HAS_ALT](x.GT)

Expand All @@ -584,14 +582,14 @@ def _filter_inheritance(
any_valid_entry = lambda x: prev_any_valid_entry(x) & (x.affected_id == AFFECTED_ID)

ht = ht.annotate(**{
annotation: ht[entries_ht_field].map(
entries_ht_field: ht[entries_ht_field].map(
lambda entries: hl.or_missing(entries.any(any_valid_entry), entries)
)})

if self._has_comp_het_search:
comp_het_ht = self._annotate_families_inheritance(
comp_het_ht if comp_het_ht is not None else ht, COMPOUND_HET, inheritance_filter,
sorted_family_sample_data, annotation, entries_ht_field
sorted_family_sample_data, annotation, entries_ht_field, **kwargs
)

if is_any_affected or not (inheritance_filter or self._inheritance_mode):
Expand All @@ -600,15 +598,37 @@ def _filter_inheritance(

ht = None if self._inheritance_mode == COMPOUND_HET else self._annotate_families_inheritance(
ht, self._inheritance_mode, inheritance_filter, sorted_family_sample_data,
annotation, entries_ht_field
annotation, entries_ht_field, **kwargs
)

return ht, comp_het_ht

def _annotate_families_inheritance(
self, ht, inheritance_mode, inheritance_filter, sorted_family_sample_data,
annotation, entries_ht_field,
annotation, entries_ht_field, family_passes_inheritance_filter = None
):
if not family_passes_inheritance_filter:
family_passes_inheritance_filter = self._get_family_passes_inheritance_filter

entry_indices_by_gt = self._get_entry_indices_by_gt_map(
inheritance_filter, inheritance_mode, sorted_family_sample_data
)

for genotype, entry_indices in entry_indices_by_gt.items():
if not entry_indices:
continue
entry_indices = hl.dict(entry_indices)
ht = ht.annotate(**{
annotation: hl.enumerate(ht[entries_ht_field]).starmap(
lambda family_idx, family_samples: family_passes_inheritance_filter(
entry_indices, family_idx, genotype, family_samples, ht, annotation
)
)
})

return ht

def _get_entry_indices_by_gt_map(self, inheritance_filter, inheritance_mode, sorted_family_sample_data):
individual_genotype_filter = (inheritance_filter or {}).get('genotype')

# Create a mapping of genotypes to check against a list of samples for a family
Expand All @@ -630,21 +650,13 @@ def _annotate_families_inheritance(
]
self.max_unaffected_samples = max(family_unaffected_counts) if family_unaffected_counts else 0

for genotype, entry_indices in entry_indices_by_gt.items():
if not entry_indices:
continue
entry_indices = hl.dict(entry_indices)
ht = ht.annotate(**{
annotation: hl.enumerate(ht[entries_ht_field]).starmap(
lambda family_i, family_samples: hl.or_missing(
~entry_indices.contains(family_i) | entry_indices[family_i].all(
lambda sample_i: self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_i].GT)
), family_samples,
),
)
})
return entry_indices_by_gt

return ht
def _get_family_passes_inheritance_filter(self, entry_indices, family_idx, genotype, family_samples, *args):
return hl.or_missing(
~entry_indices.contains(family_idx) | entry_indices[family_idx].all(
lambda sample_i: self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_i].GT)
), family_samples)

def _get_family_passes_quality_filter(self, quality_filter, ht, **kwargs):
quality_filter = quality_filter or {}
Expand Down
67 changes: 49 additions & 18 deletions hail_search/queries/mito.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,37 +205,61 @@ def _filter_entries_ht_both_sample_types(
)

ch_ht = None
family_guid_idx_map = defaultdict(dict)
family_idx_map = defaultdict(dict)
for sample_type, sorted_family_sample_data in sample_types:
ht = self._annotate_empty_failed_inheritance(ht, sample_type)
ch_ht = self._annotate_empty_failed_inheritance(ch_ht, sample_type)

ht, ch_ht = self._filter_inheritance(
ht, ch_ht, inheritance_filter, sorted_family_sample_data,
annotation=sample_type.passes_inheritance_field, entries_ht_field=sample_type.family_entries_field
annotation=sample_type.failed_family_sample_field, entries_ht_field=sample_type.family_entries_field,
family_passes_inheritance_filter=self._get_family_passes_inheritance_filter_both_sample_types
)
for family_idx, samples in enumerate(sorted_family_sample_data):
family_guid = samples[0]['familyGuid']
family_guid_idx_map[family_guid][sample_type.value] = family_idx
family_idx_map[family_guid][sample_type.value] = family_idx

family_idx_map = hl.dict(family_guid_idx_map)
family_idx_map = hl.dict(family_idx_map)
ht = self._apply_multi_sample_type_entry_filters(ht, family_idx_map)
ch_ht = self._apply_multi_sample_type_entry_filters(ch_ht, family_idx_map)
return ht, ch_ht

@staticmethod
def _annotate_empty_failed_inheritance(ht, sample_type):
if ht is None:
return ht

return ht.annotate(**{
sample_type.failed_family_sample_field: ht[sample_type.family_entries_field].map(
lambda x: hl.empty_array(hl.tstr)
)})

def _get_family_passes_inheritance_filter_both_sample_types(
self, entry_indices, family_idx, genotype, family_samples, ht, annotation
):
return hl.bind(
lambda failed_samples: ht[annotation][family_idx].extend(failed_samples),
entry_indices.get(family_idx).filter(
lambda sample_idx: ~self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_idx].GT)
).map(lambda sample_idx: family_samples[sample_idx]['sampleId'])
)
jklugherz marked this conversation as resolved.
Show resolved Hide resolved

def _apply_multi_sample_type_entry_filters(self, ht, family_idx_map):
if ht is None:
return ht

# Keep family from both sample types if either passes quality AND inheritance
for sample_type in SampleType:
ht = ht.annotate(**{
sample_type.family_entries_field: hl.enumerate(ht[sample_type.family_entries_field]).starmap(
lambda i, family_samples: hl.or_missing(
hl.bind(
lambda other_sample_type_idx: (
self._family_has_valid_sample_type_entries(ht, sample_type, i) |
self._family_has_valid_sample_type_entries(ht, sample_type.other_sample_type, other_sample_type_idx)
),
family_idx_map.get(hl.coalesce(family_samples)[0]['familyGuid']).get(sample_type.other_sample_type.value),
), family_samples)
lambda family_idx, family_samples: hl.or_missing(
hl.bind(lambda other_sample_type_family_idx: ((
self._family_has_valid_quality(ht, sample_type, family_idx) |
self._family_has_valid_quality(ht, sample_type.other_sample_type, other_sample_type_family_idx)
) &
self._family_has_valid_inheritance(ht, sample_type, family_idx, other_sample_type_family_idx) &
self._family_has_valid_inheritance(ht, sample_type.other_sample_type, other_sample_type_family_idx, family_idx)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Commenting for posterity that I'm not 100% sure that this is the final logic we want, because it would allow us to return variants that pass quality and fail inheritance in one sample type and fail quality and pass inheritance in the other, meaning theres no sample type that clearly passes both inheritance and quality. However, I think we maybe do want to return these, the logic gets kind of confusing and I can't quite be sure these would not be helpful. I think being overly permissive here is better, if the analysts are seeing a bunch of cases where they ultimately think that the returned variants are not helpful and should be filtered out we can always go back later and make this a stricter criteria, so we should leave this as is for now

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes sense, we may want to consider quality and inheritance passing together and handle it differently (the & seems like it could be too simple) but it's not clear to me what the logic/change would be. I agree that trying this out and getting feedback from analysts before we do that is the way to go.

), family_idx_map.get(hl.coalesce(family_samples)[0]['familyGuid']).get(sample_type.other_sample_type.value),
), family_samples)
)})

# Merge family entries and filters from both sample types
Expand All @@ -253,13 +277,20 @@ def _apply_multi_sample_type_entry_filters(self, ht, family_idx_map):
return ht.filter(ht.family_entries.any(hl.is_defined))

@staticmethod
def _family_has_valid_sample_type_entries(ht, sample_type, sample_type_family_idx):
# Note: This logic does not sufficiently handle case 2 here https://docs.google.com/presentation/d/1hqDV8ulhviUcR5C4PtNUqkCLXKDsc6pccgFVlFmWUAU/edit?usp=sharing
# and will need to be changed to support it - https://github.com/broadinstitute/seqr/issues/4403
def _family_has_valid_quality(ht, sample_type, sample_type_family_idx):
return (
hl.is_defined(sample_type_family_idx) &
hl.is_defined(ht[sample_type.passes_quality_field][sample_type_family_idx]) &
hl.is_defined(ht[sample_type.passes_inheritance_field][sample_type_family_idx])
hl.is_defined(ht[sample_type.passes_quality_field][sample_type_family_idx])
)

@staticmethod
def _family_has_valid_inheritance(ht, sample_type, family_idx, other_sample_type_family_idx):
return hl.bind(
lambda other_sample_type_pass_samples: (
ht[sample_type.failed_family_sample_field][family_idx].all(other_sample_type_pass_samples.contains)
jklugherz marked this conversation as resolved.
Show resolved Hide resolved
), ht[sample_type.other_sample_type.family_entries_field][other_sample_type_family_idx].filter(
lambda s: ~ht[sample_type.other_sample_type.failed_family_sample_field][other_sample_type_family_idx].contains(s['sampleId'])
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

string comparisons in hail are substantially slower than integer comparisons. family_entries_field and failed_family_sample_field for a given sample type should be the same length and the same order so you should be able to rewrite this more performantly as

hl.enumerate(ht[sample_type.other_sample_type.family_entries_field][other_sample_type_family_idx]).starmap(
  lambda i, s: hl.or_missing(
    hl.is_missing(ht[sample_type.other_sample_type.failed_family_sample_field][other_sample_type_family_idx][i]),
    s['sampleId'],
  )
)

Copy link
Contributor Author

@jklugherz jklugherz Oct 31, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this won't work since family_entries_field and failed_family_sample_field for a given sample type are not necessarily the same length. ht[sample_type.other_sample_type.failed_family_sample_field][other_sample_type_family_idx] is a list of sample_ids of variable length - it only contains failed sample IDs - so the [i] access won't work. I originally had failed_family_sample_field as an array of failing indices but switched it to an array of failing sample IDs because it was easier for me to come up with a solution to compare the same samples from different sample types.

Maybe a better way to structure failed_family_sample_field is an array of booleans that matches the shape of family_entries_field where the value at each sample index is true if failed or false if pass. It's just more complicated to compare samples individually that way - I'd need to reintroduce that map of families to samples to sample types to indices inside of their respective family_entries field.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd need to reintroduce that map of families to samples to sample types to indices inside of their respective family_entries field.

if failed_family_sample_field is a list of booleans then you can easily get the list of passing sample ids from the main family_entries field without maintaining an entry map, and then your logic for using other_sample_type_pass_samples would not need to change

hl.enumerate(ht[family_entries_field][family_idx]).starmap(
  lambda sample_idx, sample: hl.or_missing(
    ~ht[failed_family_sample_field][family_idx][sample_idx]),
    sample['sampleId'],
  )
).filter(hl.is_defined)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, I refactored the passes_inheritance_field to be a list of booleans corresponding to the samples in family entries, ^ and used this more performant code to get a list of passing sample IDs

).map(lambda s: s['sampleId']),
)

def _get_sample_genotype(self, samples, r=None, include_genotype_overrides=False, select_fields=None, **kwargs):
Expand Down
5 changes: 3 additions & 2 deletions hail_search/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,10 +388,11 @@ async def test_both_sample_types_search(self):
[VARIANT2_BOTH_SAMPLE_TYPES], sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA_MISSING_PARENTAL_WGS,
inheritance_mode=inheritance_mode, **COMP_HET_ALL_PASS_FILTERS, intervals=[variant2_interval]
)
# Genome passes quality and inheritance exome fails inheritance (parental data shows variant is inherited).
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still think a comment here explaining whats being tested is helpful. Maybe something like "Variant 2 fails inheritance when parental data is present"

# Genome passes quality and inheritance but exome fails inheritance (parental data shows variant is inherited).
# Variant is excluded from search results.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this is testing the case where its not returned when theres no valid WGS parental data, but can we also test that it IS returned when we include the parental data for WGS and its overridden?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this case is covered by an above test -

    # Variant1 in family_2 is de novo in exome but maternally inherited in genome.
    # Genome passes quality and inheritance, show genotypes for both sample types.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The search criteria changes between these 2 test cases. We really need back-to-back tests that the exact same search returns different results when the paternal data is present/absent. Instead of running a dominant and recessive search each filtered to variant 1 and variant 2, it would probably be better to not use an interval filter at all and test that dominant and recessive search return the expected combo of variants with and without paternal data

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did this^!

inheritance_mode = 'de_novo'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this line is unneeded

await self._assert_expected_search(
[VARIANT2_BOTH_SAMPLE_TYPES], sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA_MISSING_PARENTAL_WGS,
[], sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA_MISSING_PARENTAL_WGS,
inheritance_mode=inheritance_mode, intervals=[variant2_interval]
)

Expand Down
2 changes: 1 addition & 1 deletion hail_search/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@
VARIANT1_BOTH_SAMPLE_TYPES['genotypes'] = {
'I000004_hg00731': [
genotypes['I000004_hg00731'],
{**genotypes['I000004_hg00731'], 'sampleType': 'WGS'}
{**genotypes['I000004_hg00731'], 'numAlt': 2, 'sampleType': 'WGS'}
],
'I000005_hg00732': [
genotypes['I000005_hg00732'],
Expand Down
Loading