🔨 update PR suggestions

📝 update docs
kids-first · Feb 29, 2024 · 4af1b30 · 4af1b30
1 parent 590443a
commit 4af1b30
Show file tree

Hide file tree

Showing 2 changed files with 88 additions and 85 deletions.
diff --git a/docs/DIFF_STUDY_CLINICAL.md b/docs/DIFF_STUDY_CLINICAL.md
@@ -30,77 +30,86 @@ For the patient and sample views, each file respectively has:
  - A list, one per line, per ID, per attribute, of what would change if the data were loaded
  - A list of IDs that would be removed from the portal, if any
  - A list of IDs that would be added if any
- - A summary of the number of changes of each attribute type
+ - A summary of the number of changes of each attribute type (printed to STDOUT, not written to the file)
 
 ### patient_portal_v_build.txt example:
 ```
-Per Patient changes:
-Patient PT_017WC8PS attribute ETHNICITY would change from NA to Not Available
-Patient PT_01HNFSBZ attribute ETHNICITY would change from NA to Not Available
-Patient PT_01HNFSBZ attribute GERMLINE_SEX_ESTIMATE would change from Unknown to NA
-Patient PT_01HNFSBZ attribute CANCER_PREDISPOSITIONS would change from None documented to NA
-Patient PT_01SH4F1X attribute AGE_IN_DAYS would change from 3838 to NA
-Patient PT_01SH4F1X attribute OS_MONTHS would change from 45 to 54
-Patient PT_0324HWD5 attribute AGE_IN_DAYS would change from 3121 to NA
-Patient PT_047YGDRW attribute ETHNICITY would change from NA to Not Available
-Patient PT_04V47WFC attribute AGE_IN_DAYS would change from 5717 to NA
-Patient PT_08M919BH attribute EFS_MONTHS would change from 44 to 62
-Patient PT_08M919BH attribute OS_MONTHS would change from 44 to 62
-Patient PT_0BSG3R3N attribute AGE_IN_DAYS would change from 3431 to NA
-Patient PT_0BVR16FK attribute ETHNICITY would change from NA to Not Available
-Patient PT_0CE0HFYB attribute GERMLINE_SEX_ESTIMATE would change from Male to NA
-
-...
-
-CHANGE SUMMARY:
-ETHNICITY has 358 change(s)
-GERMLINE_SEX_ESTIMATE has 220 change(s)
-CANCER_PREDISPOSITIONS has 29 change(s)
-AGE_IN_DAYS has 147 change(s)
-OS_MONTHS has 99 change(s)
-EFS_MONTHS has 60 change(s)
-EFS_STATUS has 18 change(s)
-SEX has 9 change(s)
-AGE has 6 change(s)
-OS_STATUS has 4 change(s)
+Patient attribute       before  after
+PT_017WC8PS     ETHNICITY       NA      Not Available
+PT_01HNFSBZ     CANCER_PREDISPOSITIONS  None documented NA
+PT_01HNFSBZ     ETHNICITY       NA      Not Available
+PT_01HNFSBZ     GERMLINE_SEX_ESTIMATE   Unknown NA
+PT_01SH4F1X     AGE_IN_DAYS     3838    NA
+PT_01SH4F1X     OS_MONTHS       45      54
+PT_0324HWD5     AGE_IN_DAYS     3121    NA
+PT_047YGDRW     ETHNICITY       NA      Not Available
+PT_04V47WFC     AGE_IN_DAYS     5717    NA
+PT_08M919BH     OS_MONTHS       44      62
+PT_08M919BH     EFS_MONTHS      44      62
+PT_0BSG3R3N     AGE_IN_DAYS     3431    NA
+PT_0BVR16FK     ETHNICITY       NA      Not Available
+PT_0CE0HFYB     GERMLINE_SEX_ESTIMATE   Male    NA
+PT_0CVRX4SJ     OS_MONTHS       NA      149
 ```
 
 ### sample_portal_v_build.txt example:
 ```
-Per Sample changes:
-Sample 16510-1 attribute TUMOR_FRACTION would change from 0.349951221921 to 0.34995122192100003
-Sample 16510-15 attribute TUMOR_FRACTION would change from 0.892871847605 to 0.8928718476049999
-Sample 16510-2 attribute TUMOR_FRACTION would change from 0.242536563786 to 0.24253656378600005
-Sample 16510-8 attribute TUMOR_FRACTION would change from 0.557284218924 to 0.5572842189239999
-Sample 7316-100 attribute TUMOR_FRACTION would change from 0.270649989118 to 0.27064998911800003
-Sample 7316-1017 attribute TUMOR_FRACTION would change from 0.570184695999 to 0.559801637737
-Sample 7316-104 attribute TUMOR_FRACTION would change from 0.664343255194 to 0.6643432551940001
-Sample 7316-1045 attribute TUMOR_FRACTION would change from 0.477859261757 to 0.496989582389
-Sample 7316-105 attribute MOLECULAR_SUBTYPE would change from NA to LGG, BRAF V600E
-Sample 7316-105 attribute TUMOR_PLOIDY would change from NA to 2
-Sample 7316-105 attribute CANCER_TYPE_DETAILED would change from NA to Low-grade glioma, BRAF V600E
-Sample 7316-105 attribute CANCER_GROUP would change from NA to Low-grade glioma
-Sample 7316-105 attribute TUMOR_FRACTION would change from NA to 0.823344460708
-Sample 7316-105 attribute PATHOLOGY_FREE_TEXT_DIAGNOSIS would change from NA to pilocytic astrocytoma ii
-
-...
+Sample  attribute       before  after
+16510-1 TUMOR_FRACTION  0.349951221921  0.34995122192100003
+16510-15        TUMOR_FRACTION  0.892871847605  0.8928718476049999
+16510-2 TUMOR_FRACTION  0.242536563786  0.24253656378600005
+16510-8 TUMOR_FRACTION  0.557284218924  0.5572842189239999
+7316-100        TUMOR_FRACTION  0.270649989118  0.27064998911800003
+7316-1017       TUMOR_FRACTION  0.570184695999  0.559801637737
+7316-104        TUMOR_FRACTION  0.664343255194  0.6643432551940001
+7316-1045       TUMOR_FRACTION  0.477859261757  0.496989582389
+7316-105        CNS_REGION      NA      Mixed
+7316-105        CANCER_TYPE_DETAILED    NA      Low-grade glioma, BRAF V600E
+7316-105        MOLECULAR_SUBTYPE       NA      LGG, BRAF V600E
+7316-105        BROAD_HISTOLOGY NA      Low-grade astrocytic tumor
+7316-105        CANCER_GROUP    NA      Low-grade glioma
+7316-105        TUMOR_PLOIDY    NA      2
+7316-105        PATHOLOGY_FREE_TEXT_DIAGNOSIS   NA      pilocytic astrocytoma ii
+7316-105        TUMOR_FRACTION  NA      0.823344460708
+7316-1052       CANCER_TYPE_DETAILED    Diffuse midline glioma, H3 K28-mutant   Diffuse midline glioma, H3 K28-altered
+7316-1052       MOLECULAR_SUBTYPE       DMG, H3 K28     DMG, H3 K28, TP53
+7316-1062       CANCER_TYPE_DETAILED    Diffuse midline glioma, H3 K28-mutant   Diffuse midline glioma, H3 K28-altered
+7316-1068       CANCER_TYPE_DETAILED    Diffuse midline glioma, H3 K28-mutant   Diffuse midline glioma, H3 K28-altered
+7316-1072       CANCER_TYPE_DETAILED    Glial-neuronal tumor NOS        Glial-neuronal tumor,  To be classified
+7316-1072       BROAD_HISTOLOGY Low-grade astrocytic tumor      Neuronal and mixed neuronal-glial tumor
+7316-1072       CANCER_GROUP    Glial-neuronal tumor    Glial-neuronal tumor NOS
+```
 
-CHANGE SUMMARY:
+### STDOUT:
+```
+Sample CHANGE SUMMARY:
 27 Samples in build would be added to the portal: 1235928,1235929,1235930,1235931,1235932,1235933,1235934,1235935,1235936,1235937,1235938,1235939,1235940,1235941,1235981,1240110,1240112,1240114,1240116,1242273,1242274,1242276,1250775,1250776,1250777,1250778,1273223
 TUMOR_FRACTION has 1005 change(s)
-MOLECULAR_SUBTYPE has 488 change(s)
-TUMOR_PLOIDY has 403 change(s)
+CNS_REGION has 410 change(s)
 CANCER_TYPE_DETAILED has 847 change(s)
+MOLECULAR_SUBTYPE has 488 change(s)
+BROAD_HISTOLOGY has 390 change(s)
 CANCER_GROUP has 517 change(s)
+TUMOR_PLOIDY has 403 change(s)
 PATHOLOGY_FREE_TEXT_DIAGNOSIS has 507 change(s)
-BROAD_HISTOLOGY has 390 change(s)
-CNS_REGION has 410 change(s)
-CANCER_TYPE has 8 change(s)
 ONCOTREE_CODE has 7 change(s)
+CANCER_TYPE has 8 change(s)
 TUMOR_TYPE has 8 change(s)
 TUMOR_TISSUE_TYPE has 17 change(s)
 EXPERIMENT_STRATEGY has 1 change(s)
 SPECIMEN_ID has 1 change(s)
 CBTN_TUMOR_TYPE has 6 change(s)
 SAMPLE_TYPE has 2 change(s)
+
+Patient CHANGE SUMMARY:
+ETHNICITY has 358 change(s)
+CANCER_PREDISPOSITIONS has 29 change(s)
+GERMLINE_SEX_ESTIMATE has 220 change(s)
+AGE_IN_DAYS has 147 change(s)
+OS_MONTHS has 99 change(s)
+EFS_MONTHS has 60 change(s)
+EFS_STATUS has 18 change(s)
+SEX has 9 change(s)
+AGE has 6 change(s)
+OS_STATUS has 4 change(s)
 ```
diff --git a/scripts/diff_studies.py b/scripts/diff_studies.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 """
 Script to check a study on pedcbioportal for differences against a local build
 """
@@ -17,45 +18,38 @@ def clinical_diffs(portal, build, portal_attr, build_attr, clin_type, out):
     build_clinical_ids = set(build.keys())
     portal_only = sorted(portal_clinical_ids - build_clinical_ids)
     build_only = sorted(build_clinical_ids - portal_clinical_ids)
-    common_samp_ids = sorted(portal_clinical_ids & build_clinical_ids)
+    common_clinical_ids = sorted(portal_clinical_ids & build_clinical_ids)
     # gross attribute diffs
     portal_attr_only = list(portal_attr - build_attr)
     build_attr_only = list(build_attr - portal_attr)
     common_attr = list(portal_attr & build_attr)
     # focus on common samp and common attr, as "everything is different for x" is not that useful
-    print("Per " + clin_type + " changes:", file=out)
+    print(clin_type + "\tattribute\tbefore\tafter", file=out)
     attr_cts = {}
-    for samp_id in common_samp_ids:
+    for clinical_id in common_clinical_ids:
         for attr in common_attr:
             # portal will not have a value for that attr in the struct if none
-            portal_value = portal[samp_id].get(attr, "NA")
-            if portal_value != build[samp_id][attr]:
-                print("{} {} attribute {} would change from {} to {}".format(clin_type, samp_id, attr, portal_value, build[samp_id][attr]), file=out)
+            portal_value = portal[clinical_id].get(attr, "NA")
+            if portal_value != build[clinical_id][attr]:
+                print("{}\t{}\t{}\t{}".format(clinical_id, attr, portal_value, build[clinical_id][attr]), file=out)
                 if attr not in attr_cts:
                     attr_cts[attr] = 0
                 attr_cts[attr] += 1
 
-    print("CHANGE SUMMARY:", file=out)
+    # print change summary to STDOUT
+    print(clin_type +" CHANGE SUMMARY:")
     if len(portal_only) > 0:
-        print("{} {}s in portal would be removed: {}".format(len(portal_only), clin_type, ",".join(portal_only)), file=out)
+        print("{} {}s in portal would be removed: {}".format(len(portal_only), clin_type, ",".join(portal_only)))
     if len(build_only) > 0:
-        print("{} {}s in build would be added to the portal: {}".format(len(build_only), clin_type,  ",".join(build_only)), file=out)
+        print("{} {}s in build would be added to the portal: {}".format(len(build_only), clin_type,  ",".join(build_only)))
     if len(portal_attr_only) > 0:
-        print("{} attributes in portal would be removed: {}".format(len(portal_attr_only), ",".join(portal_attr_only)), file=out)
+        print("{} attributes in portal would be removed: {}".format(len(portal_attr_only), ",".join(portal_attr_only)))
     if len(build_attr_only) > 0:
-        print("{} attributes in build would be added to the portal: {}".format(len(build_attr_only), ",".join(build_attr_only)), file=out)
+        print("{} attributes in build would be added to the portal: {}".format(len(build_attr_only), ",".join(build_attr_only)))
     for attr in attr_cts:
-        print("{} has {} change(s)".format(attr, attr_cts[attr]), file=out)
-
-
-def split_sort_field(value, sep):
-    """
-    For ease of comparison, aggregate attributes concatenated by a separator may not be in the same order, but order does not matter.
-    Therefore, sort them so that when compared, no errors are triggered
-    """
-    value_list = value.split(sep)
-    value_list.sort()
-    return sep.join(value_list)
+        print("{} has {} change(s)".format(attr, attr_cts[attr]))
+    # Print extra newline for readability
+    print ("")
 
 
 def table_to_dict(in_file, key, aggr_list):
@@ -74,17 +68,17 @@ def table_to_dict(in_file, key, aggr_list):
                 for aggr in aggr_list:
                     if aggr in header:
                         aggr_head.append(header.index(aggr))
-
                 break
         data_dict = {}
         for entry in f:
             data = entry.rstrip('\n').split('\t')
             # Replace empty string with NA as that is how the portal will return it
             data = ["NA" if d == "" else d for d in data]
             data_dict[data[primary]] = {}
-            # sort aggr fields
+            # Aggregate attributes are ';'-separated lists whose members may appear in a different order, but order does not matter.
+            # Sort the members so that an ordering difference alone is not reported as a change
             for i in aggr_head:
-                data[i] = split_sort_field(data[i], ";")
+                data[i] = ';'.join(sorted(data[i].split(';')))
             # two loops, for up until primary key, then after.
             for i in range(len(data)):
                 if i == primary: continue
@@ -114,6 +108,8 @@ def data_clinical_from_study(cbio_conn, study_id, data_type, aggr_list):
             data_dict[clinical_id] = {"PATIENT_ID": entry.patientId}
         value = entry.value
         attr_id = entry.clinicalAttributeId
+        # Aggregate attributes are ';'-separated lists whose members may appear in a different order, but order does not matter.
+        # Sort the members so that an ordering difference alone is not reported as a change
         if attr_id in aggr_list:
             value = ';'.join(sorted(value.split(';')))
         # "standardize" status field so that 0:LIVING = LIVING and 1:DECEASED = DECEASED
@@ -159,32 +155,30 @@ def main():
                                                 "validate_swagger_spec": False}
     )
 
-    # hardcode for now names of aggregate fields
+    # hardcode for now the names of the aggregate, implicit, and skip fields
     aggr_list = ["SPECIMEN_ID", "EXPERIMENT_STRATEGY"]
+    portal_sample_attr_implicit = ['PATIENT_ID']
+    portal_patient_attr_skip = ['SAMPLE_COUNT']
+    portal_sample_attr_skip = ['FRACTION_GENOME_ALTERED', 'MUTATION_COUNT']
     # get attribute keys
     attr_key_obj = cbioportal.Clinical_Attributes.fetchClinicalAttributesUsingPOST(studyIds=[args.study], projection='ID').result()
     # gather sample-level metadata
     portal_sample_data = data_clinical_from_study(cbioportal, args.study, "SAMPLE", aggr_list)
     build_sample_data, build_sample_attr_keys = table_to_dict(args.data_dir + "/data_clinical_sample.txt", "SAMPLE_ID", aggr_list)
-    sample_diff_out = open('sample_portal_v_build.txt', 'w')
     portal_sample_attr_keys = set([x.clinicalAttributeId for x in attr_key_obj if not x.patientAttribute])
     # implicit attributes not returned by function that are required for sample view
-    portal_sample_attr_implicit = ['PATIENT_ID']
     portal_sample_attr_keys.update(portal_sample_attr_implicit)
     # drop attributes that are post-load portal-specific
-    portal_sample_attr_skip = ['FRACTION_GENOME_ALTERED', 'MUTATION_COUNT']
     portal_sample_attr_keys -= set(portal_sample_attr_skip)
     # sample-level diffs
     with open('sample_portal_v_build.txt', 'w') as sample_diff_out:
         clinical_diffs(portal_sample_data, build_sample_data, portal_sample_attr_keys, build_sample_attr_keys, "Sample", sample_diff_out)
     # patient-level diffs
     portal_patient_data =  data_clinical_from_study(cbioportal, args.study, "PATIENT", aggr_list)
     build_patient_data, build_patient_attr_keys = table_to_dict(args.data_dir + "/data_clinical_patient.txt", "PATIENT_ID", aggr_list)
-    patient_diff_out = open('patient_portal_v_build.txt', 'w')
     portal_patient_attr_keys = set([x.clinicalAttributeId for x in attr_key_obj if x.patientAttribute])
-    portal_patient_attr_skip = ['SAMPLE_COUNT']
+    # drop attributes that are post-load portal-specific
     portal_patient_attr_keys -= set(portal_patient_attr_skip)
-
     with open('patient_portal_v_build.txt', 'w') as patient_diff_out:
         clinical_diffs(portal_patient_data, build_patient_data, portal_patient_attr_keys, build_patient_attr_keys, "Patient", patient_diff_out)