From 108e1972c78ba5c7d7066978604e31a84a3b1c86 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Fri, 14 Jun 2024 11:45:03 -0400 Subject: [PATCH 1/2] :pencil: update config to get treatment :hammer: update get study to put all clinical in datasheets dir :pencil: update configs to support treatment data :hammer: add up front bucket access checks for download :pencil: update bucket-key pairs --- .../openpedcan_v15_case_meta_config.json | 1 - REFS/aws_bucket_key_pairs.txt | 12 ++-- STUDY_CONFIGS/pbta_all_case_meta_config.json | 64 +++++++++++++++++++ .../pbta_all_treatment_meta_config.json | 13 ++-- scripts/get_files_from_manifest.py | 12 +++- scripts/get_study_metadata.py | 3 +- 6 files changed, 91 insertions(+), 14 deletions(-) diff --git a/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json b/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json index f448b10..1a02880 100644 --- a/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json +++ b/COLLABORATIONS/openTARGETS/openpedcan_v15_case_meta_config.json @@ -110,7 +110,6 @@ "datatype": "SAMPLE_ATTRIBUTES" } } - } }, "study": { diff --git a/REFS/aws_bucket_key_pairs.txt b/REFS/aws_bucket_key_pairs.txt index ebd78a8..bd3fc2a 100644 --- a/REFS/aws_bucket_key_pairs.txt +++ b/REFS/aws_bucket_key_pairs.txt @@ -1,6 +1,10 @@ -s3://cds-246-phs002517-p30-fy20 NCI-AR s3://cds-246-phs002517-sequencefiles-p30-fy20 NCI-AR s3://cds-306-phs002517-x01 NCI-X01 -s3://d3b-cds-working-bucket d3b -s3://kf-strides-study-us-east-1-prd-sd-bhjxbdqk kf -s3://kf-study-us-east-1-prd-sd-8y99qzjj saml \ No newline at end of file +s3://d3b-cds-working-bucket Mgmt-Console-Dev-chopd3bprod@684194535433 +s3://d3b-study-us-east-1-prd-pbta-staging Mgmt-Console-Dev-chopd3bprod@684194535433 +s3://d3b-study-us-east-1-prd-sd-8y99qzjj Mgmt-Console-Dev-chopd3bprod@684194535433 +s3://d3b-study-us-east-1-prd-sd-bhjxbdqk Mgmt-Console-Dev-chopd3bprod@684194535433 +s3://d3b-study-us-east-1-prd-sd-hkrzzhfw 
Mgmt-Console-Dev-chopd3bprod@684194535433 +s3://d3b-study-us-east-1-prd-sd-m3dbxd12 Mgmt-Console-Dev-chopd3bprod@684194535433 +s3://kf-strides-study-us-east-1-prd-sd-bhjxbdqk Mgmt-Console-Dev-D3bCenter@232196027141 +s3://kf-study-us-east-1-prd-sd-8y99qzjj Mgmt-Console-Dev-D3b@538745987955 \ No newline at end of file diff --git a/STUDY_CONFIGS/pbta_all_case_meta_config.json b/STUDY_CONFIGS/pbta_all_case_meta_config.json index 822db1f..44255f2 100644 --- a/STUDY_CONFIGS/pbta_all_case_meta_config.json +++ b/STUDY_CONFIGS/pbta_all_case_meta_config.json @@ -161,6 +161,46 @@ "genetic_alteration_type": "CLINICAL", "datatype": "SAMPLE_ATTRIBUTES" } + }, + "imaging": { + "_comment": "see https://docs.cbioportal.org/file-formats/#event-types for detailed specifics", + "cbio_name": "data_clinical_timeline_imaging.txt", + "meta_file_attr": { + "genetic_alteration_type": "CLINICAL", + "datatype": "TIMELINE" + } + }, + "clinical_event": { + "_comment": "see https://docs.cbioportal.org/file-formats/#event-types for detailed specifics", + "cbio_name": "data_clinical_timeline_clinical_event.txt", + "meta_file_attr": { + "genetic_alteration_type": "CLINICAL", + "datatype": "TIMELINE" + } + }, + "specimen": { + "_comment": "see https://docs.cbioportal.org/file-formats/#event-types for detailed specifics", + "cbio_name": "data_clinical_timeline_specimen.txt", + "meta_file_attr": { + "genetic_alteration_type": "CLINICAL", + "datatype": "TIMELINE" + } + }, + "surgery": { + "_comment": "see https://docs.cbioportal.org/file-formats/#event-types for detailed specifics", + "cbio_name": "data_clinical_timeline_surgery.txt", + "meta_file_attr": { + "genetic_alteration_type": "CLINICAL", + "datatype": "TIMELINE" + } + }, + "treatment": { + "_comment": "see https://docs.cbioportal.org/file-formats/#event-types for detailed specifics", + "cbio_name": "data_clinical_timeline_treatment.txt", + "meta_file_attr": { + "genetic_alteration_type": "CLINICAL", + "datatype": "TIMELINE" + } } } }, @@ 
-274,6 +314,30 @@ "table": "prod_cbio.pbta_all_data_clinical_patient", "out_file": "data_clinical_patient.txt" }, + "imaging_timeline": { + "table": "prod_cbio.pbta_all_treatment_data_clinical_timeline_imaging", + "out_file": "data_clinical_timeline_imaging.txt" + }, + "specimen_timeline": { + "table": "prod_cbio.pbta_all_timeline_specimen", + "out_file": "data_clinical_timeline_specimen.txt" + + }, + "clinical_event_timeline": { + "table": "prod_cbio.pbta_all_timeline_clinical_event", + "out_file": "data_clinical_timeline_clinical_event.txt" + + }, + "surgery_timeline": { + "table": "prod_cbio.pbta_all_timeline_surgery", + "out_file": "data_clinical_timeline_surgery.txt" + + }, + "treatment_timeline": { + "table": "prod_cbio.pbta_all_timeline_treatment", + "out_file": "data_clinical_timeline_treatment.txt" + + }, "genomics_etl": { "table": "prod_cbio.pbta_all_genomics_etl_file", "out_file": "cbio_file_name_id.txt" diff --git a/STUDY_CONFIGS/pbta_all_treatment_meta_config.json b/STUDY_CONFIGS/pbta_all_treatment_meta_config.json index beb58f8..de6077f 100644 --- a/STUDY_CONFIGS/pbta_all_treatment_meta_config.json +++ b/STUDY_CONFIGS/pbta_all_treatment_meta_config.json @@ -183,21 +183,24 @@ "cbio_name": "data_clinical_timeline_specimen.txt", "meta_file_attr": { "genetic_alteration_type": "CLINICAL", - "datatype": "TIMELINE" } + "datatype": "TIMELINE" + } }, "surgery": { "_comment": "see https://docs.cbioportal.org/file-formats/#event-types for detailed specifics", "cbio_name": "data_clinical_timeline_surgery.txt", "meta_file_attr": { "genetic_alteration_type": "CLINICAL", - "datatype": "TIMELINE" } + "datatype": "TIMELINE" + } }, "treatment": { "_comment": "see https://docs.cbioportal.org/file-formats/#event-types for detailed specifics", "cbio_name": "data_clinical_timeline_treatment.txt", "meta_file_attr": { "genetic_alteration_type": "CLINICAL", - "datatype": "TIMELINE" } + "datatype": "TIMELINE" + } } } }, @@ -309,10 +312,6 @@ "table": 
"brownm28_dev_schema_cbio.pbta_all_timeline_treatment", "out_file": "data_clinical_timeline_treatment.txt" - }, - "genomics_etl": { - "table": "brownm28_dev_schema_cbio.pbta_all_treatment_genomics_etl_file", - "out_file": "cbio_file_name_id.txt" } } } \ No newline at end of file diff --git a/scripts/get_files_from_manifest.py b/scripts/get_files_from_manifest.py index 98c8030..a3130d2 100755 --- a/scripts/get_files_from_manifest.py +++ b/scripts/get_files_from_manifest.py @@ -9,7 +9,6 @@ import os import pandas as pd import sevenbridges as sbg -from sevenbridges.errors import SbgError from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper import pdb @@ -173,6 +172,7 @@ def mt_type_download(file_type): ) # setting up a key dict that, for each aws key, has an associated sesion and manifest to download with key_dict = {} + bucket_errs = 0 with open(args.aws_tbl) as kl: for line in kl: (bucket, key) = line.rstrip('\n').split('\t') @@ -183,6 +183,16 @@ def mt_type_download(file_type): key_dict[key]['dl_client'] = key_dict[key]['session'].client("s3", config=client_config) else: key_dict[key]['manifest'] = pd.concat([key_dict[key]['manifest'], selected[selected['s3_path'].str.startswith(bucket)]], ignore_index=True) + # Test bucket access with that key, if it fails, print error then kill to not waste time + parse_url = urllib3.util.parse_url(bucket) + try: + key_dict[key]['dl_client'].list_objects(Bucket=parse_url.host) + except Exception as e: + bucket_errs = 1 + print(e, file=sys.stderr) + print("Bucket access ERROR: {}\t{}".format(bucket, key), file=sys.stderr) + if bucket_errs: + exit(1) if args.sbg_profile is not None: check = 1 config = sbg.Config(profile=args.sbg_profile) diff --git a/scripts/get_study_metadata.py b/scripts/get_study_metadata.py index d98b1f9..3644588 100644 --- a/scripts/get_study_metadata.py +++ b/scripts/get_study_metadata.py @@ -165,7 +165,8 @@ def get_manifests(db_cur, config_dict): tbl_name = 
config_data['database_pulls'][key]['table'] (rows, colnames) = generic_pull(cur, tbl_name) out_fn = config_data['database_pulls'][key]['out_file'] - if key == 'gene_file': + # all data_clinical sheets go in this dir + if out_fn.startswith('data_clinical_'): out_fn = datasheet_dir + "/" + out_fn out_file = open(out_fn, 'w') generic_print(out_file, rows, colnames) From 71885f6513f994a21c1a97f0b27ee2d3c0a19f59 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Fri, 21 Jun 2024 15:16:40 -0400 Subject: [PATCH 2/2] :pencil: update readme --- README.md | 63 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 8997838..8faffd0 100644 --- a/README.md +++ b/README.md @@ -28,33 +28,44 @@ In the end, if you named your output dir `processed`, you'll end up with this ex processed └── pbta_all ├── case_lists - │   ├── cases_3way_complete.txt - │   ├── cases_RNA_Seq_v2_mRNA.txt - │   ├── cases_all.txt - │   ├── cases_cna.txt - │   ├── cases_cnaseq.txt - │   ├── cases_sequenced.txt - │   └── cases_sv.txt - ├── data_CNA.txt -> /home/ubuntu/mount/pbta_all/merged_cnvs/pbta_all.discrete_cnvs.txt - ├── data_clinical_patient.txt -> /home/ubuntu/mount/pbta_all/datasheets/data_clinical_patient.txt - ├── data_clinical_sample.txt -> /home/ubuntu/mount/pbta_all/datasheets/data_clinical_sample.txt - ├── data_cna.seg.txt -> /home/ubuntu/mount/pbta_all/merged_cnvs/pbta_all.merged_seg.txt - ├── data_linear_CNA.txt -> /home/ubuntu/mount/pbta_all/merged_cnvs/pbta_all.predicted_cnv.txt - ├── data_mutations_extended.txt -> /home/ubuntu/mount/pbta_all/merged_mafs/pbta_all.maf - ├── data_rna_seq_v2_mrna.txt -> /home/ubuntu/mount/pbta_all/merged_rsem/pbta_all.rsem_merged.txt - ├── data_rna_seq_v2_mrna_median_Zscores.txt -> /home/ubuntu/mount/pbta_all/merged_rsem/pbta_all.rsem_merged_zscore.txt - ├── data_sv.txt -> /home/ubuntu/mount/pbta_all/pbta_all.fusions.txt - ├── meta_CNA.txt - ├── meta_SV.txt - ├── 
meta_clinical_patient.txt - ├── meta_clinical_sample.txt - ├── meta_cna.seg.txt - ├── meta_linear_CNA.txt - ├── meta_mutations_extended.txt - ├── meta_rna_seq_v2_mrna.txt - ├── meta_rna_seq_v2_mrna_median_Zscores.txt - └── meta_study.txt +│   ├── cases_3way_complete.txt +│   ├── cases_RNA_Seq_v2_mRNA.txt +│   ├── cases_all.txt +│   ├── cases_cna.txt +│   ├── cases_cnaseq.txt +│   ├── cases_sequenced.txt +│   └── cases_sv.txt +├── data_CNA.txt -> /home/ubuntu/volume/PORTAL_LOADS/pbta_all/merged_cnvs/pbta_all.discrete_cnvs.txt +├── data_clinical_patient.txt -> /home/ubuntu/volume/PORTAL_LOADS/pbta_all/datasheets/data_clinical_patient.txt +├── data_clinical_sample.txt -> /home/ubuntu/volume/PORTAL_LOADS/pbta_all/datasheets/data_clinical_sample.txt +├── data_clinical_timeline_clinical_event.txt -> /home/ubuntu/volume/PORTAL_LOADS/pbta_all/datasheets/data_clinical_timeline_clinical_event.txt +├── data_clinical_timeline_imaging.txt -> /home/ubuntu/volume/PORTAL_LOADS/pbta_all/datasheets/data_clinical_timeline_imaging.txt +├── data_clinical_timeline_specimen.txt -> /home/ubuntu/volume/PORTAL_LOADS/pbta_all/datasheets/data_clinical_timeline_specimen.txt +├── data_clinical_timeline_surgery.txt -> /home/ubuntu/volume/PORTAL_LOADS/pbta_all/datasheets/data_clinical_timeline_surgery.txt +├── data_clinical_timeline_treatment.txt -> /home/ubuntu/volume/PORTAL_LOADS/pbta_all/datasheets/data_clinical_timeline_treatment.txt +├── data_cna.seg.txt -> /home/ubuntu/volume/PORTAL_LOADS/pbta_all/merged_cnvs/pbta_all.merged_seg.txt +├── data_linear_CNA.txt -> /home/ubuntu/volume/PORTAL_LOADS/pbta_all/merged_cnvs/pbta_all.predicted_cnv.txt +├── data_mutations_extended.txt -> /home/ubuntu/volume/PORTAL_LOADS/pbta_all/merged_mafs/pbta_all.maf +├── data_rna_seq_v2_mrna.txt -> /home/ubuntu/volume/PORTAL_LOADS/pbta_all/merged_rsem/pbta_all.rsem_merged.txt +├── data_rna_seq_v2_mrna_median_Zscores.txt -> /home/ubuntu/volume/PORTAL_LOADS/pbta_all/merged_rsem/pbta_all.rsem_merged_zscore.txt +├── 
data_sv.txt -> /home/ubuntu/volume/PORTAL_LOADS/pbta_all/merged_fusion/pbta_all.fusions.txt +├── meta_CNA.txt +├── meta_clinical_patient.txt +├── meta_clinical_sample.txt +├── meta_clinical_timeline_clinical_event.txt +├── meta_clinical_timeline_imaging.txt +├── meta_clinical_timeline_specimen.txt +├── meta_clinical_timeline_surgery.txt +├── meta_clinical_timeline_treatment.txt +├── meta_cna.seg.txt +├── meta_linear_CNA.txt +├── meta_mutations_extended.txt +├── meta_rna_seq_v2_mrna.txt +├── meta_rna_seq_v2_mrna_median_Zscores.txt +├── meta_study.txt +└── meta_sv.txt ``` +Note! Most other studies won't have a set of timeline files. # Details Use this section as a reference in case your overconfidence got the best of you