Merge pull request #60 from kids-first/feature/mb-update-pbta-20240322

migbro · web-flow · commit 49157bbb5784 · 2024-03-28T11:28:55.000-04:00
🔨 Update PBTA 20240322
diff --git a/STUDY_CONFIGS/aml_sd_pet7q6f2_2018_data_processing_config.json b/STUDY_CONFIGS/aml_sd_pet7q6f2_2018_data_processing_config.json
@@ -11,7 +11,7 @@
   "file_loc_defs": {
     "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
     "mafs": {
-      "kf": "annotated_public_outputs",
+      "kf": ["annotated_public_outputs"],
       "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS.txt"
     },
     "rsem": "RSEM_gene",
diff --git a/STUDY_CONFIGS/bllnos_sd_z6mwd3h0_2018_data_processing_config.json b/STUDY_CONFIGS/bllnos_sd_z6mwd3h0_2018_data_processing_config.json
@@ -16,7 +16,7 @@
   "file_loc_defs": {
     "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
     "mafs": {
-      "kf": "annotated_public_outputs",
+      "kf": ["annotated_public_outputs"],
       "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
     },
     "cnvs": {
diff --git a/STUDY_CONFIGS/chdm_sd_7spqtt8m_data_processing_config.json b/STUDY_CONFIGS/chdm_sd_7spqtt8m_data_processing_config.json
@@ -16,7 +16,7 @@
     "file_loc_defs": {
       "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
       "mafs": {
-        "kf": "annotated_public_outputs",
+        "kf": ["annotated_public_outputs"],
         "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
       },
       "cnvs": {
diff --git a/STUDY_CONFIGS/data_processing_config.json b/STUDY_CONFIGS/data_processing_config.json
@@ -16,7 +16,7 @@
   "file_loc_defs": {
     "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
     "mafs": {
-      "kf": "annotated_public_outputs",
+      "kf": ["annotated_public_outputs"],
       "dgd": "annotated_public",
       "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
     },
diff --git a/STUDY_CONFIGS/os_sd_zxjffmef_2015_data_processing_config.json b/STUDY_CONFIGS/os_sd_zxjffmef_2015_data_processing_config.json
@@ -12,7 +12,7 @@
   "file_loc_defs": {
     "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
     "mafs": {
-      "kf": "annotated_public_outputs",
+      "kf": ["annotated_public_outputs"],
       "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
     },
     "cnvs": {
diff --git a/STUDY_CONFIGS/pbta_all_case_meta_config.json b/STUDY_CONFIGS/pbta_all_case_meta_config.json
@@ -248,8 +248,17 @@
                 "table": "bix_genomics_file.sd_bhjxbdqk_mioncoseq-genomics_file_manifest",
                 "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
                 "out_file": "mioncoseq_genomics_file_manifest.txt"
+            },
+            "dgd_cbtn_panel": {
+                "table": "bix_genomics_file.sd_bhjxbdqk_dgd_panel-genomics_file_manifest",
+                "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
+                "out_file": "dgd_cbtn_panel_genomics_file_manifest.txt"
+            },
+            "dgd__panel": {
+                "table": "bix_genomics_file.sd_6g58hhsx_dgd_panel-genomics_file_manifest",
+                "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
+                "out_file": "dgd_panel_genomics_file_manifest.txt"
             }
-
         },
         "sample_head": {
             "table": "template_sample_header.txt"
diff --git a/STUDY_CONFIGS/pbta_all_data_processing_config.json b/STUDY_CONFIGS/pbta_all_data_processing_config.json
@@ -0,0 +1,43 @@
+{
+  "bedtools": "bedtools",
+  "cp_only_script": "/home/ubuntu/tools/kf-cbioportal-etl/scripts/get_cbio_copy_only_num.pl",
+  "bed_genes": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/Homo_sapiens.GRCh38.105.chr.gtf_genes.bed",
+  "hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_2021-06-01_EntrezID.tsv",
+  "entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol_2021-06-01.txt",
+  "rna_ext_list": {
+    "expression": "rsem.genes.results.gz",
+    "fusion": "annoFuse_filter.tsv"
+  },
+  "dna_ext_list": {
+    "mutation": "consensus_somatic.norm.annot.public.maf", 
+    "copy_number": "controlfreec.CNVs.p.value.txt",
+    "seg": "controlfreec.seg"
+  },
+  "file_loc_defs": {
+    "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
+    "mafs": {
+      "kf": ["annotated_public_outputs", "consensus_public_outputs"],
+      "dgd": "annotated_public",
+      "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
+    },
+    "cnvs": {
+      "pval": "ctrlfreec_pval",
+      "info": "ctrlfreec_info",
+      "seg": "ctrlfreec_bam_seg"
+    },
+    "rsem": "RSEM_gene",
+    "fusion": "annofuse_filtered_fusions_tsv",
+    "dgd_fusion": "fusion-dgd.tsv.gz"
+  },
+  "dl_file_type_list": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs",
+    "ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg", "DGD_MAF"],
+  "ens_gene_list":"/home/ubuntu/tools/kf-cbioportal-etl/REFS/gencode27_gene_list.txt",
+  "script_dir": "/home/ubuntu/tools/kf-cbioportal-etl/scripts/",
+  "cbioportal_validator": "/home/ubuntu/tools/cbioportal/core/src/main/scripts/importer/validateData.py",
+  "cna_flag": 1,
+  "cnv_high_gain": 4,
+  "cnv_min_len": 50000,
+  "rna_flag": 1,
+  "cpus": 8,
+  "threads": 40
+}
diff --git a/STUDY_CONFIGS/pbta_all_treatment_data_processing_config.json b/STUDY_CONFIGS/pbta_all_treatment_data_processing_config.json
@@ -16,7 +16,7 @@
     "file_loc_defs": {
       "_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
       "mafs": {
-        "kf": "annotated_public_outputs",
+        "kf": ["annotated_public_outputs"],
         "header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS.txt"
       },
       "cnvs": {
diff --git a/scripts/cnv_2_merge.py b/scripts/cnv_2_merge.py
@@ -3,12 +3,10 @@
 import os
 import argparse
 import json
-import subprocess
 import concurrent.futures
 import pandas as pd
 import re
 from get_file_metadata_helper import get_file_metadata
-import pdb
 
 
 def process_cnv(cnv_fn, cur_cnv_dict, samp_id):
@@ -43,7 +41,6 @@ def get_ploidy(obj):
 
 def process_table(cbio_dx, file_meta_dict):
     try:
-
         # project/disease name should be name of directory hosting datasheet
         sys.stderr.write("Processing " + cbio_dx + " project" + "\n")
         new_cnv = open(out_dir + cbio_dx + ".predicted_cnv.txt", "w")
@@ -86,9 +83,10 @@ def process_table(cbio_dx, file_meta_dict):
                     new_cnv.write("\t" + ploidy_dict[samp])
             new_cnv.write("\n")
         new_cnv.close()
+        return 0, cbio_tum_id
     except Exception as e:
-        print(e)
-        exit(1)
+        print(e, file=sys.stderr)
+        return 1, cbio_tum_id
 
 
 
@@ -149,10 +147,12 @@ def process_table(cbio_dx, file_meta_dict):
     sys.stderr.write("output dir already exists\n")
 file_meta_dict = get_file_metadata(args.table, "cnv")
 with concurrent.futures.ProcessPoolExecutor(config_data["cpus"]) as executor:
-    results = {
-        executor.submit(process_table, cbio_dx, file_meta_dict): cbio_dx
-        for cbio_dx in file_meta_dict
-    }
+    results = { executor.submit(process_table, cbio_dx, file_meta_dict): cbio_dx for cbio_dx in file_meta_dict }
+    for result in concurrent.futures.as_completed(results):
+        if result.result()[0]:
+            print("Failed processing " + result.result()[1], file=sys.stderr)
+            exit(1)
+
 # for cbio_dx in file_meta_dict:
 #     process_table(cbio_dx, file_meta_dict)
 # sys.stderr.write("Done, check logs\n")
diff --git a/scripts/cnv_3_gistic_style.py b/scripts/cnv_3_gistic_style.py
@@ -8,6 +8,7 @@
 from get_file_metadata_helper import get_file_metadata
 import pandas as pd
 import numpy as np
+import pdb
 
 parser = argparse.ArgumentParser(
     description="Convert merged cnv values to discrete coded values."
@@ -107,7 +108,10 @@ def mt_adjust_cn(obj):
 for fname in fname_list:
     parts = re.search("^" + args.merged_cnv_dir + "/(.*).predicted_cnv.txt", fname)
     cbio_dx = parts.group(1)
-    data = pd.read_csv(fname, sep="\t")
+    try:
+        data = pd.read_csv(fname, sep="\t")
+    except Exception as e:
+        print(e, file=sys.stderr)
     data.set_index("Hugo_Symbol")
     # sample list would be cbio ids
     samp_list = list(data.columns)[1:]
diff --git a/scripts/genomics_file_cbio_package_build.py b/scripts/genomics_file_cbio_package_build.py
@@ -45,6 +45,9 @@ def process_maf(maf_loc_dict, cbio_id_table, data_config_file, dgd_status):
     maf_dir = maf_loc_dict["kf"]
     if args.dgd_status == "dgd":
         maf_dir = maf_loc_dict["dgd"]
+    else:
+        # KF can be in multiple places
+        maf_dir = ",".join(maf_dir)
     maf_header = maf_loc_dict["header"]
     maf_cmd = "{}maf_merge.py -t {} -i {} -m {} -j {} -f {} 2> collate_mafs.log".format(
         script_dir, cbio_id_table, maf_header, maf_dir, data_config_file, dgd_status
diff --git a/scripts/maf_merge.py b/scripts/maf_merge.py
@@ -24,9 +24,7 @@ def filter_entry(entry, tum_id, norm_id, tid_idx, nid_idx, v_idx, h_idx, maf_exc
         return None
 
 
-def process_maf(
-    maf_fn, new_maf, maf_exc, tum_id, norm_id
-):
+def process_maf(maf_fn, new_maf, maf_exc, tum_id, norm_id):
     """
     Iterate over maf file, skipping header lines since the files are being merged.
     With possiblility of mixed source, search headers
@@ -69,9 +67,7 @@ def process_maf(
     cur_maf.close()
 
 
-def process_tbl(
-    cbio_dx, file_meta_dict, print_head
-):
+def process_tbl(cbio_dx, file_meta_dict, print_head):
     """
     Probaby a less likely scenario, but can split out into multiple projects based on dict
     """
@@ -84,28 +80,10 @@ def process_tbl(
         for cbio_tum_id in file_meta_dict[cbio_dx]:
             cbio_norm_id = file_meta_dict[cbio_dx][cbio_tum_id]["cbio_norm_id"]
             fname = file_meta_dict[cbio_dx][cbio_tum_id]["fname"]
-            sys.stderr.write(
-                "Found relevant maf to process for "
-                + " "
-                + cbio_tum_id
-                + " "
-                + cbio_norm_id
-                + " "
-                + file_meta_dict[cbio_dx][cbio_tum_id]["kf_tum_id"]
-                + " "
-                + file_meta_dict[cbio_dx][cbio_tum_id]["kf_norm_id"]
-                + " "
-                + fname
-                + "\n"
-            )
-            sys.stderr.flush()
-            process_maf(
-                maf_dir + fname,
-                new_maf,
-                maf_exc,
-                cbio_tum_id,
-                cbio_norm_id,
-            )
+            print("Found relevant maf to process for {} {} {} {} {}".format(
+                cbio_tum_id, cbio_norm_id, file_meta_dict[cbio_dx][cbio_tum_id]["kf_tum_id"], file_meta_dict[cbio_dx][cbio_tum_id]["kf_norm_id"], fname),
+                file=sys.stderr)
+            process_maf(maf_dir + fname, new_maf, maf_exc, cbio_tum_id, cbio_norm_id)
             x += 1
         sys.stderr.write(
             "Completed processing " + str(x) + " entries in " + cbio_dx + "\n"
@@ -130,7 +108,7 @@ def process_tbl(
         "-i", "--header", action="store", dest="header", help="File with maf header only"
     )
     parser.add_argument(
-        "-m", "--maf-dir", action="store", dest="maf_dir", help="maf file directory"
+        "-m", "--maf-dirs", action="store", dest="maf_dirs", help="comma-separated list of maf file directories"
     )
     parser.add_argument(
         "-j",
@@ -155,10 +133,21 @@ def process_tbl(
     args = parser.parse_args()
     with open(args.config_file) as f:
         config_data = json.load(f)
-    # get maf file ext
-    maf_dir = args.maf_dir
-    if maf_dir[-1] != "/":
-        maf_dir += "/"
+    # Create symlinks to mafs in one place for ease of processing
+    maf_dir = "MAFS/"
+    maf_dirs_in = args.maf_dirs
+    print("Symlinking maf files from {} to {}".format(maf_dirs_in, maf_dir), file=sys.stderr)
+    os.makedirs("MAFS", exist_ok=True)
+    for dirname in maf_dirs_in.split(","):
+        abs_path = os.path.abspath(dirname)
+        for fname in os.listdir(dirname):
+            try:
+                src = os.path.join(abs_path, fname)
+                dest = os.path.join(maf_dir, fname)
+                os.symlink(src, dest)
+            except Exception as e:
+                print(e, file=sys.stderr)
+                print("Could not sym link {} in {}".format(fname, dirname))
     # If DGD maf only, else if both, dgd maf wil be handled separately, or not at all if no dgd and kf only
 
     file_meta_dict = get_file_metadata(args.table, "DGD_MAF")