Merge pull request #61 from kids-first/feature/pbta-updates-2024-06-24

🔧 Fix Missing Field
kids-first · Jun 20, 2024 · 61738ac · 61738ac
2 parents 49157bb + 8007e68
commit 61738ac
Show file tree

Hide file tree

Showing 7 changed files with 117 additions and 7 deletions.
diff --git a/COLLABORATIONS/openTARGETS/header_desc.tsv b/COLLABORATIONS/openTARGETS/header_desc.tsv
@@ -14,13 +14,14 @@ cohort_participant_id	EXTERNAL_PATIENT_ID	External Patient Identifier	0	1	STRING
 formatted_sample_id	SAMPLE_ID		1	0	STRING	98	7316-1069-T-353281.WGS	
 Kids_First_Biospecimen_ID	SPECIMEN_ID	KFDRC tumor biospecimen ID	1	0	STRING	13	BS_A9S5HT6P	
 broad_histology	CANCER_TYPE		1	0	STRING	12	Benign tumor	
+molecular_subtype	MOLECULAR_SUBTYPE	Molecular subtype defined by WHO 2021 guidelines	1	0	STRING	12	EPN, PF A	
 cancer_group	HISTOLOGY		1	0	STRING	11	Adenoma	
 harmonized_diagnosis	CANCER_TYPE_DETAILED		1	0	STRING	10	Adenoma	
 primary_site	TUMOR_TISSUE_SITE		1	0	STRING	9	Suprasellar/Hypothalamic/Pituitary	
 tumor_descriptor	TUMOR_TYPE		1	0	STRING	8	Initial CNS Tumor	
 composition	SAMPLE_TYPE		1	0	STRING	7	Solid Tissue	
-cohort	COHORT	Source study cohort name	1	0	STRING	6	
-sub_cohort	SUB_COHORT	Source study sub-cohort name	1	0	STRING	6	
+cohort	COHORT	Source study cohort name	1	0	STRING	6	PBTA	
+sub_cohort	SUB_COHORT	Source study sub-cohort name	1	0	STRING	6	DGD	
 CNS_region			1	0	STRING	5	Suprasellar	
 tumor_ploidy			1	0	NUMBER	4	3	
 tumor_fraction			1	0	NUMBER	3	0.476369391	

diff --git a/STUDY_CONFIGS/pbta_all_treatment_meta_config.json b/STUDY_CONFIGS/pbta_all_treatment_meta_config.json
@@ -167,9 +167,37 @@
                 "cbio_name": "data_clinical_timeline_imaging.txt",
                 "meta_file_attr": {
                     "genetic_alteration_type": "CLINICAL",
-                    "datatype": "TIMELINE",
-                    "data_filename": "data_clinical_timeline_imaging.txt"
+                    "datatype": "TIMELINE"
                 }
+            },
+            "clinical_event": {
+                "_comment": "see https://docs.cbioportal.org/file-formats/#event-types for detailed specifics",
+                "cbio_name": "data_clinical_timeline_clinical_event.txt",
+                "meta_file_attr": {
+                    "genetic_alteration_type": "CLINICAL",
+                    "datatype": "TIMELINE"
+                }
+            },
+            "specimen": {
+                "_comment": "see https://docs.cbioportal.org/file-formats/#event-types for detailed specifics",
+                "cbio_name": "data_clinical_timeline_specimen.txt",
+                "meta_file_attr": {
+                    "genetic_alteration_type": "CLINICAL",
+                    "datatype": "TIMELINE"                }
+            },
+            "surgery": {
+                "_comment": "see https://docs.cbioportal.org/file-formats/#event-types for detailed specifics",
+                "cbio_name": "data_clinical_timeline_surgery.txt",
+                "meta_file_attr": {
+                    "genetic_alteration_type": "CLINICAL",
+                    "datatype": "TIMELINE"                }
+            },
+            "treatment": {
+                "_comment": "see https://docs.cbioportal.org/file-formats/#event-types for detailed specifics",
+                "cbio_name": "data_clinical_timeline_treatment.txt",
+                "meta_file_attr": {
+                    "genetic_alteration_type": "CLINICAL",
+                    "datatype": "TIMELINE"                }
             }
         }
     },

diff --git a/scripts/get_study_metadata.py b/scripts/get_study_metadata.py
@@ -52,7 +52,7 @@ def generic_print(out_file, rows, colnames):
     out_file.write("\t".join(colnames) + "\n")
     for row in rows:
         # convert None to empty str
-        new_row = [str(i or '') for i in row]
+        new_row = ["" if i is None else str(i) for i in row]
         out_file.write("\t".join(new_row) + "\n")
     out_file.close()
     return 0

diff --git a/scripts/get_study_participant_ids.py b/scripts/get_study_participant_ids.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+"""
+Pull the patient IDs of a study from a pedcbioportal server and print them, one per line.
+"""
+
+import argparse
+from bravado.client import SwaggerClient
+from bravado.requests_client import RequestsClient
+from urllib.parse import urlparse
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Pull patient IDs from a study on pedcbioportal"
+    )
+    parser.add_argument(
+        "-u", "--url", action="store", dest="url", help="url to search against", default="https://pedcbioportal.kidsfirstdrc.org/api/v2/api-docs"
+    )
+    parser.add_argument(
+        "-s", "--study", action="store", dest="study", help="Cancer study ID to compare on server"
+    )
+    parser.add_argument(
+        "-t", "--token", action="store", dest="token", help="Token file obtained from Web API"
+    )
+
+    args = parser.parse_args()
+    with open(args.token, 'r') as token_file:
+        token = token_file.read().rstrip().split(': ')[1]
+
+    url_object = urlparse(args.url)
+
+    http_client = RequestsClient()
+    http_client.set_api_key(
+        '{}'.format(url_object.hostname), 'Bearer {}'.format(token),
+        param_name='Authorization', param_in='header'
+    )
+
+    cbioportal = SwaggerClient.from_url(args.url,
+                                        http_client=http_client,
+                                        config={"validate_requests":False,
+                                                "validate_responses":False,
+                                                "validate_swagger_spec": False}
+    )
+
+    pt_list = cbioportal.Patients.getAllPatientsInStudyUsingGET(studyId=args.study).result()
+    print("\n".join([x.patientId for x in pt_list]))
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/organize_upload_packages.py b/scripts/organize_upload_packages.py
@@ -56,8 +56,6 @@ def process_meta_data(meta_data, output_dir, canc_study_id):
             subprocess.call(cmd, shell=True)
         except Exception as e:
             sys.stderr.write(str(e) + " failed processing meta data file\n")
-            pdb.set_trace()
-            hold = 1
 
 
 def process_clinical_data(meta_data, output_dir, canc_study_id):

diff --git a/utilities/merge_ploya_stranded_files.R → utilities/merge_polya_stranded_files.R b/utilities/merge_ploya_stranded_files.R → utilities/merge_polya_stranded_files.R
diff --git a/utilities/subtract_by_id.py b/utilities/subtract_by_id.py
@@ -0,0 +1,35 @@
+#!/usr/bin/python
+"""
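+Remove rows from a tab-delimited table whose <colname> value appears in <id_list>.
+The header and all kept rows are printed to stdout. If <out_flag> names a column,
+the unique values of that column taken from the removed rows are printed to stderr.
+NOTE: this script can be cleaned up.
+Usage:
+  python subtract_by_id.py <id_list> <colname> <out_flag> <in_file>
+  To have no out_flag, provide "SKIP_THIS" as the value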
+"""
+
+import sys
+import pdb
+
+id_list = {}
+with open(sys.argv[1]) as rm_list:
+    for line in rm_list:
+        id_list[line.rstrip('\n')] = 0
+colname = sys.argv[2]
+out_flag = sys.argv[3]
+with open(sys.argv[4]) as in_file:
+    head = next(in_file)
+    header = head.rstrip('\n').split('\t')
+    c_idx = header.index(colname)
+    o_idx = None
+    if out_flag != "SKIP_THIS":
+        out_list = []
+        o_idx = header.index(out_flag)
+    print(head, end='')
+    for line in in_file:
+        info = line.rstrip('\n').split('\t')
+        if info[c_idx] not in id_list:
+            print(line, end='')
+        elif o_idx is not None:
+            out_list.append(info[o_idx])
+if o_idx is not None:
+    print("\n".join(list(set(out_list))), file=sys.stderr)