Skip to content

Commit

Permalink
another pass
Browse files Browse the repository at this point in the history
  • Loading branch information
jklugherz committed Oct 28, 2024
1 parent 8d9d25d commit 877abbc
Show file tree
Hide file tree
Showing 18 changed files with 70 additions and 79 deletions.
11 changes: 2 additions & 9 deletions hail_search/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,8 @@ def family_entries_field(self) -> str:
@property
def failed_family_sample_field(self) -> str:
return {
SampleType.WES: 'wes_failed_family_sample_indices',
SampleType.WGS: 'wgs_failed_family_sample_indices',
}[self]

@property
def passes_inheritance_field(self) -> str:
return {
SampleType.WES: 'wes_passes_inheritance',
SampleType.WGS: 'wgs_passes_inheritance',
SampleType.WES: 'wes_failed_family_sample_guids',
SampleType.WGS: 'wgs_failed_family_sample_guids',
}[self]

@property
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.130-bea04d9c79b5
Created at 2024/10/02 14:46:35
Written with version 0.2.132-678e1f52b999
Created at 2024/10/28 16:21:30
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
52 changes: 29 additions & 23 deletions hail_search/queries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,8 +472,8 @@ def _apply_entry_filters(ht):
def _filter_single_entries_table(self, ht, project_families, inheritance_filter=None, quality_filter=None, is_merged_ht=False, **kwargs):
ht, sorted_family_sample_data = self._add_entry_sample_families(ht, project_families, is_merged_ht)
ht = self._filter_quality(ht, quality_filter, **kwargs)
ht, ch_ht, _, _ = self._filter_inheritance(
ht, None, inheritance_filter, sorted_family_sample_data,
ht, ch_ht = self._filter_inheritance(
ht, None, inheritance_filter, sorted_family_sample_data, self._annotate_families_inheritance
)
ht = self._apply_entry_filters(ht)
ch_ht = self._apply_entry_filters(ch_ht)
Expand Down Expand Up @@ -573,7 +573,7 @@ def _get_sample_type(cls, family_index, ht_globals):
return ht_globals.sample_type

def _filter_inheritance(
self, ht, comp_het_ht, inheritance_filter, sorted_family_sample_data,
self, ht, comp_het_ht, inheritance_filter, sorted_family_sample_data, annotate_func,
annotation='family_entries', entries_ht_field='family_entries'
):
any_valid_entry = lambda x: self.GENOTYPE_QUERY_MAP[HAS_ALT](x.GT)
Expand All @@ -584,13 +584,12 @@ def _filter_inheritance(
any_valid_entry = lambda x: prev_any_valid_entry(x) & (x.affected_id == AFFECTED_ID)

ht = ht.annotate(**{
annotation: ht[entries_ht_field].map(
entries_ht_field: ht[entries_ht_field].map(
lambda entries: hl.or_missing(entries.any(any_valid_entry), entries)
)})

ch_ht_entry_indices_by_gt = None
if self._has_comp_het_search:
comp_het_ht, ch_ht_entry_indices_by_gt = self._annotate_families_inheritance(
comp_het_ht = annotate_func(
comp_het_ht if comp_het_ht is not None else ht, COMPOUND_HET, inheritance_filter,
sorted_family_sample_data, annotation, entries_ht_field
)
Expand All @@ -599,17 +598,38 @@ def _filter_inheritance(
# No sample-specific inheritance filtering needed
sorted_family_sample_data = []

ht, ht_entry_indices_by_gt = (None, None) if self._inheritance_mode == COMPOUND_HET else self._annotate_families_inheritance(
ht = None if self._inheritance_mode == COMPOUND_HET else annotate_func(
ht, self._inheritance_mode, inheritance_filter, sorted_family_sample_data,
annotation, entries_ht_field
)

return ht, comp_het_ht, ht_entry_indices_by_gt, ch_ht_entry_indices_by_gt
return ht, comp_het_ht

def _annotate_families_inheritance(
    self, ht, inheritance_mode, inheritance_filter, sorted_family_sample_data,
    annotation, entries_ht_field,
):
    """Null out families whose samples do not match the required genotypes.

    The ``genotype -> {family index: [sample indices]}`` mapping is obtained
    from ``_get_entry_indices_by_gt_map``. For every genotype whose mapping is
    non-empty, ``annotation`` is set to a per-family array built from
    ``ht[entries_ht_field]``: a family's samples are kept when the family has
    no entry in the mapping, or when the ``GT`` of every listed sample
    satisfies ``GENOTYPE_QUERY_MAP[genotype]``; otherwise that family's
    element becomes missing.

    Returns the annotated table.
    """
    indices_by_genotype = self._get_entry_indices_by_gt_map(
        inheritance_filter, inheritance_mode, sorted_family_sample_data
    )

    for genotype, family_sample_indices in indices_by_genotype.items():
        # Genotypes with no samples to check impose no constraint.
        if family_sample_indices:
            required_indices = hl.dict(family_sample_indices)

            def keep_passing_family(family_i, family_samples):
                def sample_matches(sample_i):
                    return self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_i].GT)

                is_unconstrained = ~required_indices.contains(family_i)
                all_match = required_indices[family_i].all(sample_matches)
                return hl.or_missing(is_unconstrained | all_match, family_samples)

            # Each pass reads from `entries_ht_field` and writes `annotation`,
            # so filters from successive genotypes only compound when both
            # names refer to the same field.
            families = hl.enumerate(ht[entries_ht_field])
            ht = ht.annotate(**{annotation: families.starmap(keep_passing_family)})

    return ht

def _get_entry_indices_by_gt_map(self, inheritance_filter, inheritance_mode, sorted_family_sample_data):
individual_genotype_filter = (inheritance_filter or {}).get('genotype')

# Create a mapping of genotypes to check against a list of samples for a family
Expand All @@ -631,21 +651,7 @@ def _annotate_families_inheritance(
]
self.max_unaffected_samples = max(family_unaffected_counts) if family_unaffected_counts else 0

for genotype, entry_indices in entry_indices_by_gt.items():
if not entry_indices:
continue
entry_indices = hl.dict(entry_indices)
ht = ht.annotate(**{
annotation: hl.enumerate(ht[entries_ht_field]).starmap(
lambda family_i, family_samples: hl.or_missing(
~entry_indices.contains(family_i) | entry_indices[family_i].all(
lambda sample_i: self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_i].GT)
), family_samples,
),
)
})

return ht, entry_indices_by_gt
return entry_indices_by_gt

def _get_family_passes_quality_filter(self, quality_filter, ht, **kwargs):
quality_filter = quality_filter or {}
Expand Down
80 changes: 36 additions & 44 deletions hail_search/queries/mito.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,41 +206,33 @@ def _filter_entries_ht_both_sample_types(

ch_ht = None
family_guid_idx_map = defaultdict(dict)
family_sample_idx_map = defaultdict(lambda: defaultdict(dict))
for sample_type, sorted_family_sample_data in sample_types:
ht, ch_ht, ht_idx_by_gt_map, ch_idx_by_gt_map = self._filter_inheritance(
ht, ch_ht = self._filter_inheritance(
ht, ch_ht, inheritance_filter, sorted_family_sample_data,
annotation=sample_type.passes_inheritance_field, entries_ht_field=sample_type.family_entries_field
annotate_func=self._annotate_failed_family_samples_inheritance,
annotation=sample_type.failed_family_sample_field, entries_ht_field=sample_type.family_entries_field,
)
ht = self._annotate_failed_family_samples_inheritance(
ht, ht_idx_by_gt_map,
annotation=sample_type.failed_family_sample_field, entries_ht_field=sample_type.family_entries_field
)
ch_ht = self._annotate_failed_family_samples_inheritance(
ch_ht, ch_idx_by_gt_map,
annotation=sample_type.failed_family_sample_field, entries_ht_field=sample_type.family_entries_field
)

for family_idx, samples in enumerate(sorted_family_sample_data):
family_guid = samples[0]['familyGuid']
family_guid_idx_map[family_guid][sample_type.value] = family_idx
for sample_idx, sample in enumerate(samples):
family_sample_idx_map[family_guid][sample['sampleId']][sample_type.value] = sample_idx

family_guid_idx_map = hl.dict(family_guid_idx_map)
family_sample_idx_map = hl.dict(family_sample_idx_map)
ht = self._apply_multi_sample_type_entry_filters(ht, family_guid_idx_map, family_sample_idx_map)
ch_ht = self._apply_multi_sample_type_entry_filters(ch_ht, family_guid_idx_map, family_sample_idx_map)
ht = self._apply_multi_sample_type_entry_filters(ht, family_guid_idx_map)
ch_ht = self._apply_multi_sample_type_entry_filters(ch_ht, family_guid_idx_map)
return ht, ch_ht

def _annotate_failed_family_samples_inheritance(
self, ht, entry_indices_by_gt, annotation, entries_ht_field,
self, ht, inheritance_mode, inheritance_filter, sorted_family_sample_data, annotation, entries_ht_field
):
entry_indices_by_gt = self._get_entry_indices_by_gt_map(
inheritance_filter, inheritance_mode, sorted_family_sample_data
)

if ht is None:
return ht

# Initialize empty array
ht = ht.annotate(**{annotation: ht[entries_ht_field].map(lambda x: hl.empty_array(hl.tint32))})
ht = ht.annotate(**{annotation: ht[entries_ht_field].map(lambda x: hl.empty_array(hl.tstr))})

# Add failed genotype samples
for genotype, entry_indices in entry_indices_by_gt.items():
Expand All @@ -252,23 +244,21 @@ def _annotate_failed_family_samples_inheritance(
lambda family_idx, entries: hl.bind(
lambda failed_samples: ht[annotation][family_idx].extend(failed_samples),
entry_indices.get(family_idx).filter(
lambda sample_i: ~self.GENOTYPE_QUERY_MAP[genotype](entries[sample_i].GT)
)
lambda sample_idx: ~self.GENOTYPE_QUERY_MAP[genotype](entries[sample_idx].GT)
).map(lambda sample_idx: entries[sample_idx]['sampleId'])
)
)})

return ht

def _apply_multi_sample_type_entry_filters(self, ht, family_idx_map, sample_idx_map):
def _apply_multi_sample_type_entry_filters(self, ht, family_idx_map):
if ht is None:
return ht

# Keep family from both sample types if either passes quality AND inheritance
for sample_type in SampleType:
ht = self._apply_quality_entry_filters(ht, sample_type, family_idx_map)
# TODO - Since each sample type is processed separately, wgs with 1 sample will not be filtered out if it passes in wes (even though another sample failed in wes)
# and the coalesce below keeps that sample even though the family was filtered out in wes. This is a limitation of the current implementation.
ht = self._apply_inheritance_entry_filters(ht, sample_type, family_idx_map, sample_idx_map)
ht = self._apply_inheritance_entry_filters(ht, sample_type, family_idx_map)

# Merge family entries and filters from both sample types
ht = ht.transmute(
Expand Down Expand Up @@ -302,28 +292,30 @@ def _family_has_valid_quality(ht, sample_type, sample_type_family_idx):
hl.is_defined(ht[sample_type.passes_quality_field][sample_type_family_idx])
)

def _apply_inheritance_entry_filters(self, ht, sample_type, family_idx_map, sample_idx_map):
@staticmethod
def _apply_inheritance_entry_filters(ht, sample_type, family_idx_map):
ht = ht.annotate(
**{sample_type.family_entries_field: hl.enumerate(ht[sample_type.family_entries_field]).starmap(
lambda family_idx, family_samples: hl.or_missing( # Keep a family if
hl.all(hl.enumerate(family_samples).starmap( # For each sample in the family,
lambda sample_idx, sample: hl.bind(lambda other_sample_type_indices: ( # Get the sample and family index of the sample in the other sample type family_entries
hl.if_else(
hl.is_defined(sample_idx) & hl.is_defined(other_sample_type_indices[1]), # If samples are present for both sample types,
( # Keep the family entries if family passes inheritance in either sample type.
hl.is_defined(ht[sample_type.passes_inheritance_field][family_idx]) |
hl.is_defined(ht[sample_type.other_sample_type.passes_inheritance_field][other_sample_type_indices[0]])
), # Else, if sample is in only one sample type, check if that sample did not fail inheritance in either sample type
self._family_sample_has_valid_inheritance(ht, sample_type, family_idx, sample_idx) |
self._family_sample_has_valid_inheritance(ht, sample_type.other_sample_type, other_sample_type_indices[0], other_sample_type_indices[1])
)
),(
family_idx_map.get(hl.coalesce(sample)['familyGuid']).get(sample_type.other_sample_type.value),
sample_idx_map.get(hl.coalesce(sample)['familyGuid']).get(hl.coalesce(sample)['sampleId']).get(sample_type.other_sample_type.value)),
lambda family_idx, family_samples: hl.or_missing(
hl.bind(lambda other_sample_type_family_idx: (
hl.bind(
lambda other_sample_type_pass_samples, sample_type_pass_samples: (
ht[sample_type.failed_family_sample_field][family_idx].all(
other_sample_type_pass_samples.contains
) & ht[sample_type.other_sample_type.failed_family_sample_field][other_sample_type_family_idx].all(
sample_type_pass_samples.contains
)),
ht[sample_type.other_sample_type.family_entries_field][other_sample_type_family_idx].filter(
lambda s: ~ht[sample_type.other_sample_type.failed_family_sample_field][other_sample_type_family_idx].contains(s['sampleId'])
).map(lambda s: s['sampleId']),
ht[sample_type.other_sample_type.family_entries_field][family_idx].filter(
lambda s: ~ht[sample_type.failed_family_sample_field][family_idx].contains(s['sampleId'])
).map(lambda s: s['sampleId']),
)
)), family_samples)
)
})
), family_idx_map.get(hl.coalesce(family_samples)[0]['familyGuid']).get(sample_type.other_sample_type.value)
), family_samples)
)}
)
return ht

@staticmethod
Expand Down
2 changes: 1 addition & 1 deletion hail_search/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@
VARIANT1_BOTH_SAMPLE_TYPES['genotypes'] = {
'I000004_hg00731': [
genotypes['I000004_hg00731'],
{**genotypes['I000004_hg00731'], 'sampleType': 'WGS'}
{**genotypes['I000004_hg00731'], 'numAlt': 2, 'sampleType': 'WGS'}
],
'I000005_hg00732': [
genotypes['I000005_hg00732'],
Expand Down

0 comments on commit 877abbc

Please sign in to comment.