Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev #4446

Merged
merged 47 commits into from
Oct 22, 2024
Merged

Dev #4446

Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
474bdbc
note locations to update
hanars Oct 10, 2024
4e2ac75
clean up
hanars Oct 10, 2024
b54a3de
Merge branch 'dev' of https://github.com/broadinstitute/seqr into sex…
hanars Oct 10, 2024
79ca3bb
update sex options
hanars Oct 10, 2024
dfcdece
add test case
hanars Oct 10, 2024
1958931
Merge branch 'hail-search-sex-meatdata' of https://github.com/broadin…
hanars Oct 10, 2024
c51ea06
add aneuploidy data to loading pedigrees
hanars Oct 10, 2024
fd8d556
report sex aneuploidy
hanars Oct 10, 2024
3362fc8
fix sex search determination
hanars Oct 10, 2024
87afc69
better test updates
hanars Oct 10, 2024
0b2aa5c
fix test fixture
hanars Oct 11, 2024
27234f8
handle sex detail in summary data
hanars Oct 11, 2024
cceadfa
clean up
hanars Oct 11, 2024
2167e7b
support aneuploidy in pedigree upload
hanars Oct 11, 2024
eaf500e
fix sex options ui display
hanars Oct 11, 2024
e47a0f4
show sex on individual row
hanars Oct 11, 2024
6a127f4
fix aneuploidy pedigree display
hanars Oct 11, 2024
c036abf
fix parent lookup
hanars Oct 11, 2024
aedd058
fix parent lookup
hanars Oct 11, 2024
90c8497
show warning for sex chromosome variant aneuploidy
hanars Oct 11, 2024
423c95a
fix matchmaker sex
hanars Oct 11, 2024
b6d9f42
remove unneeded comp het filter
hanars Oct 15, 2024
764376e
Merge branch 'dev' of https://github.com/broadinstitute/seqr into sex…
hanars Oct 15, 2024
5b732c7
undo failed logic negation
hanars Oct 15, 2024
4275809
Merge branch 'dev' of https://github.com/broadinstitute/seqr into com…
hanars Oct 15, 2024
2d8beff
Bump cookie and express in /ui
dependabot[bot] Oct 15, 2024
c4e3b10
Merge pull request #4432 from broadinstitute/dependabot/npm_and_yarn/…
hanars Oct 16, 2024
491cc63
fix: requirements-dev.txt to reduce vulnerabilities
snyk-bot Oct 16, 2024
b58445a
conditionally show dataset type selector for local installs
hanars Oct 16, 2024
f3d1c34
Merge pull request #4434 from broadinstitute/snyk-fix-b03e0aca476f710…
hanars Oct 16, 2024
e2e5250
add skip validation option
hanars Oct 16, 2024
6854c64
support updated load data fields
hanars Oct 16, 2024
ebc3ab2
do not allow triggering non-snv indel pipeline
hanars Oct 16, 2024
de7f5d2
Revert "do not allow triggering non-snv indel pipeline"
hanars Oct 16, 2024
7e6f249
update unit tests
hanars Oct 16, 2024
73c95bc
Merge branch 'dev' of https://github.com/broadinstitute/seqr into loc…
hanars Oct 16, 2024
3626293
include variant type in metadata report
hanars Oct 16, 2024
1829ea2
update variant type logic
hanars Oct 16, 2024
fecc47d
Merge pull request #4435 from broadinstitute/local-load-data-ui-update
hanars Oct 16, 2024
50d80d2
update tests
hanars Oct 16, 2024
06af308
test SNV
hanars Oct 16, 2024
1ccd6cb
update gregor import
hanars Oct 16, 2024
714bf8e
Merge pull request #4436 from broadinstitute/gregor-variant-type
hanars Oct 16, 2024
b537b39
Merge pull request #4430 from broadinstitute/comp-het-min-affected
hanars Oct 16, 2024
b86c70f
get cached gene counts for comp hets
hanars Oct 21, 2024
b1db641
Merge pull request #4442 from broadinstitute/cached-comp-hets-gene-agg
hanars Oct 21, 2024
fd3b005
Merge pull request #4418 from broadinstitute/sex-aneuploidy
hanars Oct 22, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 5 additions & 20 deletions hail_search/queries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -624,43 +624,28 @@ def _annotate_families_inheritance(
if genotype:
entry_indices_by_gt[genotype][family_index].append(sample_index)

min_unaffected = None
if inheritance_mode == COMPOUND_HET:
family_unaffected_counts = [
len(i) for i in entry_indices_by_gt[INHERITANCE_FILTERS[COMPOUND_HET][UNAFFECTED_ID]].values()
]
self.max_unaffected_samples = max(family_unaffected_counts) if family_unaffected_counts else 0
if self.max_unaffected_samples > 1:
min_unaffected = min(family_unaffected_counts)

for genotype, entry_indices in entry_indices_by_gt.items():
if not entry_indices:
continue
entry_indices = hl.dict(entry_indices)
ht = ht.annotate(**{
annotation: hl.enumerate(ht[entries_ht_field]).starmap(
lambda i, family_samples: self._valid_genotype_family_entries(
family_samples, entry_indices.get(i), genotype, min_unaffected
)
lambda family_i, family_samples: hl.or_missing(
~entry_indices.contains(family_i) | entry_indices[family_i].all(
lambda sample_i: self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_i].GT)
), family_samples,
),
)
})

return ht

@classmethod
def _valid_genotype_family_entries(cls, entries: list, genotype_entry_indices, genotype: str, min_unaffected: int):
is_valid = hl.is_missing(genotype_entry_indices) | genotype_entry_indices.all(
lambda i: cls.GENOTYPE_QUERY_MAP[genotype](entries[i].GT)
)
if min_unaffected is not None and genotype == HAS_REF:
unaffected_filter = genotype_entry_indices.any(
lambda i: cls.GENOTYPE_QUERY_MAP[REF_REF](entries[i].GT)
)
if min_unaffected < 2:
unaffected_filter |= genotype_entry_indices.size() < 2
is_valid &= unaffected_filter
return hl.or_missing(is_valid, entries)

def _get_family_passes_quality_filter(self, quality_filter, ht, **kwargs):
quality_filter = quality_filter or {}

Expand Down
5 changes: 4 additions & 1 deletion matchmaker/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@

class MatchmakerSubmission(ModelWithGUID):

SEX_LOOKUP = {Individual.SEX_MALE: 'MALE', Individual.SEX_FEMALE: 'FEMALE'}
SEX_LOOKUP = {
**{sex: 'MALE' for sex in Individual.MALE_SEXES},
**{sex: 'FEMALE' for sex in Individual.FEMALE_SEXES},
}

individual = models.OneToOneField(Individual, on_delete=models.PROTECT)

Expand Down
4 changes: 2 additions & 2 deletions seqr/fixtures/1kg_project.json
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,7 @@
"individual_id": "NA19675_1",
"mother_id": 3,
"father_id": 2,
"sex": "M",
"sex": "XXY",
"affected": "A",
"display_name": "",
"notes": "",
Expand Down Expand Up @@ -536,7 +536,7 @@
"individual_id": "HG00731",
"mother_id": 6,
"father_id": 5,
"sex": "F",
"sex": "X0",
"affected": "A",
"proband_relationship": "S",
"display_name": "HG00731_a",
Expand Down
8 changes: 4 additions & 4 deletions seqr/fixtures/report_variants.json
Original file line number Diff line number Diff line change
Expand Up @@ -105,9 +105,9 @@
"last_modified_date": "2018-05-31T16:36:02.805Z",
"xpos": 19001912632,
"xpos_end": 19001912632,
"ref": "GC",
"alt": "TT",
"variant_id": "19-1912632-GC-TT",
"ref": "G",
"alt": "C",
"variant_id": "19-1912632-G-C",
"saved_variant_json": {
"pos": 1912632,
"end": 1912632,
Expand All @@ -116,7 +116,7 @@
"genotypes": {
"I000004_hg00731": {"numAlt": 1}
},
"variantId": "19-1912632-GC-TT",
"variantId": "19-1912632-G-C",
"chrom": "19",
"mainTranscriptId": "ENST00000371839",
"transcripts": {
Expand Down
18 changes: 18 additions & 0 deletions seqr/migrations/0076_alter_individual_sex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 4.2.13 on 2024-10-10 20:51
#
# Widens Individual.sex to support sex-chromosome aneuploidies in addition to
# the original M/F/U codes. max_length is raised from 1 to 3 so the new
# three-character codes ('XXY', 'XYY', 'XXX') fit; 'X0' uses a zero, not the
# letter O.

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        # Must run after the primary_biosample alteration in migration 0075.
        ('seqr', '0075_alter_individual_primary_biosample'),
    ]

    operations = [
        migrations.AlterField(
            model_name='individual',
            name='sex',
            # Choices mirror Individual.SEX_CHOICES in seqr/models.py: the
            # original single-letter codes plus male (XXY, XYY) and female
            # (XXX, X0) aneuploidies, which display as their own code.
            field=models.CharField(choices=[('M', 'Male'), ('F', 'Female'), ('U', 'Unknown'), ('XXY', 'XXY'), ('XYY', 'XYY'), ('XXX', 'XXX'), ('X0', 'X0')], default='U', max_length=3),
        ),
    ]
8 changes: 7 additions & 1 deletion seqr/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,10 +447,16 @@ class Individual(ModelWithGUID):
SEX_MALE = 'M'
SEX_FEMALE = 'F'
SEX_UNKNOWN = 'U'
FEMALE_ANEUPLOIDIES = ['XXX', 'X0']
MALE_ANEUPLOIDIES = ['XXY', 'XYY']
FEMALE_SEXES = [SEX_FEMALE] + FEMALE_ANEUPLOIDIES
MALE_SEXES = [SEX_MALE] + MALE_ANEUPLOIDIES
SEX_CHOICES = (
(SEX_MALE, 'Male'),
('F', 'Female'),
('U', 'Unknown'),
*[(sex, sex) for sex in MALE_ANEUPLOIDIES],
*[(sex, sex) for sex in FEMALE_ANEUPLOIDIES],
)

AFFECTED_STATUS_AFFECTED = 'A'
Expand Down Expand Up @@ -603,7 +609,7 @@ class Individual(ModelWithGUID):
mother = models.ForeignKey('seqr.Individual', null=True, blank=True, on_delete=models.SET_NULL, related_name='maternal_children')
father = models.ForeignKey('seqr.Individual', null=True, blank=True, on_delete=models.SET_NULL, related_name='paternal_children')

sex = models.CharField(max_length=1, choices=SEX_CHOICES, default='U')
sex = models.CharField(max_length=3, choices=SEX_CHOICES, default='U')
affected = models.CharField(max_length=1, choices=AFFECTED_STATUS_CHOICES, default=AFFECTED_STATUS_UNKNOWN)

# TODO once sample and individual ids are fully decoupled no reason to maintain this field
Expand Down
4 changes: 3 additions & 1 deletion seqr/utils/search/add_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def format_email(sample_summary, project_link, num_new_samples):

def prepare_data_loading_request(projects: list[Project], sample_type: str, dataset_type: str, genome_version: str,
data_path: str, user: User, pedigree_dir: str, raise_pedigree_error: bool = False,
individual_ids: list[str] = None):
individual_ids: list[str] = None, skip_validation: bool = False):
project_guids = sorted([p.guid for p in projects])
variables = {
'projects_to_run': project_guids,
Expand All @@ -125,6 +125,8 @@ def prepare_data_loading_request(projects: list[Project], sample_type: str, data
'dataset_type': _dag_dataset_type(sample_type, dataset_type),
'reference_genome': GENOME_VERSION_LOOKUP[genome_version],
}
if skip_validation:
variables['skip_validation'] = True
file_path = _get_pedigree_path(pedigree_dir, genome_version, sample_type, dataset_type)
_upload_data_loading_files(projects, user, file_path, individual_ids, raise_pedigree_error)
return variables, file_path
Expand Down
2 changes: 1 addition & 1 deletion seqr/utils/search/hail_search_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def _get_sample_data(samples, inheritance_filter=None, inheritance_mode=None, **
affected=F('individual__affected'),
)
if inheritance_mode == X_LINKED_RECESSIVE:
sample_values['is_male'] = Case(When(individual__sex=Individual.SEX_MALE, then=True), default=False)
sample_values['is_male'] = Case(When(individual__sex__in=Individual.MALE_SEXES, then=True), default=False)
sample_data = samples.order_by('guid').values('individual__individual_id', 'dataset_type', 'sample_type', **sample_values)

custom_affected = (inheritance_filter or {}).pop('affected', None)
Expand Down
2 changes: 1 addition & 1 deletion seqr/utils/search/search_utils_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -550,7 +550,7 @@ def test_get_variant_query_gene_counts(self, mock_call):
def test_cached_get_variant_query_gene_counts(self):
super(HailSearchUtilsTests, self).test_cached_get_variant_query_gene_counts()

self.set_cache({'all_results': PARSED_COMPOUND_HET_VARIANTS_MULTI_PROJECT + [SV_VARIANT1], 'total_results': 3})
self.set_cache({'all_results': [PARSED_COMPOUND_HET_VARIANTS_MULTI_PROJECT] + [SV_VARIANT1], 'total_results': 2})
gene_counts = get_variant_query_gene_counts(self.results_model, self.user)
self.assertDictEqual(gene_counts, {
'ENSG00000135953': {'total': 2, 'families': {'F000003_3': 2, 'F000011_11': 2}},
Expand Down
7 changes: 6 additions & 1 deletion seqr/utils/search/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,12 @@ def get_variant_query_gene_counts(search_model, user):

def _get_gene_aggs_for_cached_variants(previous_search_results):
gene_aggs = defaultdict(lambda: {'total': 0, 'families': defaultdict(int)})
for var in previous_search_results['all_results']:
# ES caches compound hets separately from main results, hail search caches everything together
flattened_variants = backend_specific_call(
lambda results: results,
lambda results: [v for variants in results for v in (variants if isinstance(variants, list) else [variants])],
)(previous_search_results['all_results'])
for var in flattened_variants:
# ES only reports breakdown for main transcript gene only, hail backend reports for all genes
gene_ids = backend_specific_call(
lambda variant_transcripts: next((
Expand Down
2 changes: 1 addition & 1 deletion seqr/views/apis/anvil_workspace_api_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -730,7 +730,7 @@ def _assert_valid_operation(self, project, test_add_data=True):
['R0001_1kg', 'F000001_1', '1', 'NA19675_1', 'NA19678', '', 'F'],
['R0001_1kg', 'F000001_1', '1', 'NA19678', '', '', 'M'],
['R0001_1kg', 'F000001_1', '1', 'NA19679', '', '', 'F'],
['R0001_1kg', 'F000002_2', '2', 'HG00731', 'HG00732', 'HG00733', 'F'],
['R0001_1kg', 'F000002_2', '2', 'HG00731', 'HG00732', 'HG00733', 'X0'],
['R0001_1kg', 'F000002_2', '2', 'HG00732', '', '', 'M'],
['R0001_1kg', 'F000002_2', '2', 'HG00733', '', '', 'F'],
['R0001_1kg', 'F000003_3', '3', 'NA20870', '', '', 'M'],
Expand Down
11 changes: 7 additions & 4 deletions seqr/views/apis/data_manager_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,8 +448,9 @@ def load_phenotype_prioritization_data(request):
@pm_or_data_manager_required
def validate_callset(request):
request_json = json.loads(request.body)
dataset_type = request_json.get('datasetType', Sample.DATASET_TYPE_VARIANT_CALLS)
validate_vcf_exists(
_callset_path(request_json), request.user, allowed_exts=DATA_TYPE_FILE_EXTS.get(request_json['datasetType']),
_callset_path(request_json), request.user, allowed_exts=DATA_TYPE_FILE_EXTS.get(dataset_type),
path_name=request_json['filePath'],
)
return create_json_response({'success': True})
Expand Down Expand Up @@ -516,7 +517,8 @@ def _fetch_airtable_loadable_project_samples(user, dataset_type, sample_type):
def load_data(request):
request_json = json.loads(request.body)
sample_type = request_json['sampleType']
dataset_type = request_json['datasetType']
dataset_type = request_json.get('datasetType', Sample.DATASET_TYPE_VARIANT_CALLS)
skip_validation = request_json.get('skipValidation', False)
projects = [json.loads(project) for project in request_json['projects']]
project_samples = {p['projectGuid']: p.get('sampleIds') for p in projects}

Expand All @@ -528,17 +530,18 @@ def load_data(request):
loading_args = (
project_models, sample_type, dataset_type, request_json['genomeVersion'], _callset_path(request_json),
)
loading_kwargs = {'user': request.user, 'skip_validation': skip_validation}
if AirtableSession.is_airtable_enabled():
individual_ids = _get_valid_project_samples(project_samples, dataset_type, sample_type, request.user)
success_message = f'*{request.user.email}* triggered loading internal {sample_type} {dataset_type} data for {len(projects)} projects'
error_message = f'ERROR triggering internal {sample_type} {dataset_type} loading'
trigger_airflow_data_loading(
*loading_args, user=request.user, success_message=success_message, error_message=error_message,
*loading_args, **loading_kwargs, success_message=success_message, error_message=error_message,
success_slack_channel=SEQR_SLACK_LOADING_NOTIFICATION_CHANNEL, is_internal=True, individual_ids=individual_ids,
)
else:
request_json, _ = prepare_data_loading_request(
*loading_args, user=request.user, pedigree_dir=LOADING_DATASETS_DIR, raise_pedigree_error=True,
*loading_args, **loading_kwargs, pedigree_dir=LOADING_DATASETS_DIR, raise_pedigree_error=True,
)
response = requests.post(f'{PIPELINE_RUNNER_SERVER}/loading_pipeline_enqueue', json=request_json, timeout=60)
if response.status_code == 409:
Expand Down
26 changes: 17 additions & 9 deletions seqr/views/apis/data_manager_api_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,10 +390,10 @@

PEDIGREE_HEADER = ['Project_GUID', 'Family_GUID', 'Family_ID', 'Individual_ID', 'Paternal_ID', 'Maternal_ID', 'Sex']
EXPECTED_PEDIGREE_ROWS = [
['R0001_1kg', 'F000001_1', '1', 'NA19675_1', 'NA19678', 'NA19679', 'M'],
['R0001_1kg', 'F000001_1', '1', 'NA19675_1', 'NA19678', 'NA19679', 'XXY'],
['R0001_1kg', 'F000001_1', '1', 'NA19678', '', '', 'M'],
['R0001_1kg', 'F000001_1', '1', 'NA19679', '', '', 'F'],
['R0001_1kg', 'F000002_2', '2', 'HG00731', 'HG00732', 'HG00733', 'F'],
['R0001_1kg', 'F000002_2', '2', 'HG00731', 'HG00732', 'HG00733', 'X0'],
]

PROJECT_OPTION = {
Expand Down Expand Up @@ -1433,7 +1433,7 @@ def test_validate_callset(self, mock_subprocess, mock_glob, mock_os_isfile):
mock_subprocess.return_value.communicate.return_value = (
b'', b'CommandException: One or more URLs matched no objects.',
)
body = {'filePath': f'{self.CALLSET_DIR}/sharded_vcf/part0*.vcf', 'datasetType': 'SNV_INDEL'}
body = {'filePath': f'{self.CALLSET_DIR}/sharded_vcf/part0*.vcf'}
response = self.client.post(url, content_type='application/json', data=json.dumps(body))
self.assertEqual(response.status_code, 400)
self.assertListEqual(
Expand Down Expand Up @@ -1503,7 +1503,7 @@ def test_load_data(self, mock_temp_dir, mock_open, mock_mkdir):
mock_temp_dir.return_value.__enter__.return_value = '/mock/tmp'
body = {'filePath': f'{self.CALLSET_DIR}/mito_callset.mt', 'datasetType': 'MITO', 'sampleType': 'WES', 'genomeVersion': '38', 'projects': [
json.dumps(option) for option in self.PROJECT_OPTIONS + [{'projectGuid': 'R0005_not_project'}]
]}
], 'skipValidation': True}
response = self.client.post(url, content_type='application/json', data=json.dumps(body))
self.assertEqual(response.status_code, 400)
self.assertDictEqual(response.json(), {'error': 'The following projects are invalid: R0005_not_project'})
Expand All @@ -1516,7 +1516,7 @@ def test_load_data(self, mock_temp_dir, mock_open, mock_mkdir):
)
self.assertDictEqual(response.json(), {'success': True})

self._assert_expected_load_data_requests(sample_type='WES')
self._assert_expected_load_data_requests(sample_type='WES', skip_validation=True)
self._has_expected_ped_files(mock_open, mock_mkdir, 'MITO', sample_type='WES')

dag_json = {
Expand All @@ -1528,6 +1528,7 @@ def test_load_data(self, mock_temp_dir, mock_open, mock_mkdir):
'sample_type': 'WES',
'dataset_type': 'MITO',
'reference_genome': 'GRCh38',
'skip_validation': True,
}
self._assert_success_notification(dag_json)

Expand All @@ -1538,14 +1539,17 @@ def test_load_data(self, mock_temp_dir, mock_open, mock_mkdir):
responses.calls.reset()
self.reset_logs()

del body['skipValidation']
del dag_json['skip_validation']
body.update({'datasetType': 'SV', 'filePath': f'{self.CALLSET_DIR}/sv_callset.vcf'})
self._trigger_error(url, body, dag_json, mock_open, mock_mkdir)

responses.add(responses.POST, PIPELINE_RUNNER_URL)
responses.calls.reset()
mock_open.reset_mock()
mock_mkdir.reset_mock()
body.update({'datasetType': 'SNV_INDEL', 'sampleType': 'WGS', 'projects': [json.dumps(self.PROJECT_OPTION)]})
body.update({'sampleType': 'WGS', 'projects': [json.dumps(self.PROJECT_OPTION)]})
del body['datasetType']
response = self.client.post(url, content_type='application/json', data=json.dumps(body))
self._test_load_single_project(mock_open, mock_mkdir, response, url=url, body=body)

Expand Down Expand Up @@ -1638,18 +1642,21 @@ def _add_file_iter(self, stdout):
def _assert_expected_get_projects_requests(self):
self.assertEqual(len(responses.calls), 0)

def _assert_expected_load_data_requests(self, dataset_type='MITO', sample_type='WGS', trigger_error=False, skip_project=False):
def _assert_expected_load_data_requests(self, dataset_type='MITO', sample_type='WGS', trigger_error=False, skip_project=False, skip_validation=False):
self.assertEqual(len(responses.calls), 1)
projects = [PROJECT_GUID, NON_ANALYST_PROJECT_GUID]
if skip_project:
projects = projects[1:]
self.assertDictEqual(json.loads(responses.calls[0].request.body), {
body = {
'projects_to_run': projects,
'callset_path': '/local_datasets/sv_callset.vcf' if trigger_error else '/local_datasets/mito_callset.mt',
'sample_type': sample_type,
'dataset_type': dataset_type,
'reference_genome': 'GRCh38',
})
}
if skip_validation:
body['skip_validation'] = True
self.assertDictEqual(json.loads(responses.calls[0].request.body), body)

@staticmethod
def _local_pedigree_path(dataset_type, sample_type):
Expand Down Expand Up @@ -1786,6 +1793,7 @@ def _get_dag_variable_overrides(*args, **kwargs):
'sample_source': 'Broad_Internal',
'sample_type': 'WES',
'dataset_type': 'MITO',
'skip_validation': True,
}

def _assert_expected_load_data_requests(self, dataset_type='MITO', **kwargs):
Expand Down
8 changes: 1 addition & 7 deletions seqr/views/apis/individual_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,6 @@
from seqr.views.utils.individual_utils import delete_individuals, add_or_update_individuals_and_families
from seqr.views.utils.variant_utils import bulk_create_tagged_variants

_SEX_TO_EXPORTED_VALUE = dict(Individual.SEX_LOOKUP)
_SEX_TO_EXPORTED_VALUE['U'] = ''

__AFFECTED_TO_EXPORTED_VALUE = dict(Individual.AFFECTED_STATUS_LOOKUP)
__AFFECTED_TO_EXPORTED_VALUE['U'] = ''


@login_and_policies_required
def update_individual_handler(request, individual_guid):
Expand Down Expand Up @@ -875,7 +869,7 @@ def import_gregor_metadata(request, project_guid):
genes = set()
for row in _iter_metadata_table(
metadata_files_path, FINDINGS_TABLE, request.user,
lambda r: r['participant_id'] in participant_individual_map and r['variant_type'] == 'SNV/INDEL',
lambda r: r['participant_id'] in participant_individual_map and r['variant_type'] in {'SNV/INDEL', 'SNV', 'INDEL'},
):
individual = participant_individual_map[row['participant_id']]
variant_id = '-'.join([row[col] for col in ['chrom', 'pos', 'ref', 'alt']])
Expand Down
Loading
Loading