Skip to content

Commit 49157bb

Browse files
authored
Merge pull request #60 from kids-first/feature/mb-update-pbta-20240322
🔨 Update PBTA 20240322
2 parents c03c5d2 + 5a8fcdd commit 49157bb

12 files changed

+98
-50
lines changed

STUDY_CONFIGS/aml_sd_pet7q6f2_2018_data_processing_config.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
"file_loc_defs": {
1212
"_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
1313
"mafs": {
14-
"kf": "annotated_public_outputs",
14+
"kf": ["annotated_public_outputs"],
1515
"header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS.txt"
1616
},
1717
"rsem": "RSEM_gene",

STUDY_CONFIGS/bllnos_sd_z6mwd3h0_2018_data_processing_config.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
"file_loc_defs": {
1717
"_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
1818
"mafs": {
19-
"kf": "annotated_public_outputs",
19+
"kf": ["annotated_public_outputs"],
2020
"header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
2121
},
2222
"cnvs": {

STUDY_CONFIGS/chdm_sd_7spqtt8m_data_processing_config.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
"file_loc_defs": {
1717
"_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
1818
"mafs": {
19-
"kf": "annotated_public_outputs",
19+
"kf": ["annotated_public_outputs"],
2020
"header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
2121
},
2222
"cnvs": {

STUDY_CONFIGS/data_processing_config.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
"file_loc_defs": {
1717
"_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
1818
"mafs": {
19-
"kf": "annotated_public_outputs",
19+
"kf": ["annotated_public_outputs"],
2020
"dgd": "annotated_public",
2121
"header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
2222
},

STUDY_CONFIGS/os_sd_zxjffmef_2015_data_processing_config.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
"file_loc_defs": {
1313
"_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
1414
"mafs": {
15-
"kf": "annotated_public_outputs",
15+
"kf": ["annotated_public_outputs"],
1616
"header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
1717
},
1818
"cnvs": {

STUDY_CONFIGS/pbta_all_case_meta_config.json

+10-1
Original file line numberDiff line numberDiff line change
@@ -248,8 +248,17 @@
248248
"table": "bix_genomics_file.sd_bhjxbdqk_mioncoseq-genomics_file_manifest",
249249
"file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
250250
"out_file": "mioncoseq_genomics_file_manifest.txt"
251+
},
252+
"dgd_cbtn_panel": {
253+
"table": "bix_genomics_file.sd_bhjxbdqk_dgd_panel-genomics_file_manifest",
254+
"file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
255+
"out_file": "dgd_cbtn_panel_genomics_file_manifest.txt"
256+
},
257+
"dgd__panel": {
258+
"table": "bix_genomics_file.sd_6g58hhsx_dgd_panel-genomics_file_manifest",
259+
"file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
260+
"out_file": "dgd_panel_genomics_file_manifest.txt"
251261
}
252-
253262
},
254263
"sample_head": {
255264
"table": "template_sample_header.txt"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
{
2+
"bedtools": "bedtools",
3+
"cp_only_script": "/home/ubuntu/tools/kf-cbioportal-etl/scripts/get_cbio_copy_only_num.pl",
4+
"bed_genes": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/Homo_sapiens.GRCh38.105.chr.gtf_genes.bed",
5+
"hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_2021-06-01_EntrezID.tsv",
6+
"entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol_2021-06-01.txt",
7+
"rna_ext_list": {
8+
"expression": "rsem.genes.results.gz",
9+
"fusion": "annoFuse_filter.tsv"
10+
},
11+
"dna_ext_list": {
12+
"mutation": "consensus_somatic.norm.annot.public.maf",
13+
"copy_number": "controlfreec.CNVs.p.value.txt",
14+
"seg": "controlfreec.seg"
15+
},
16+
"file_loc_defs": {
17+
"_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
18+
"mafs": {
19+
"kf": ["annotated_public_outputs", "consensus_public_outputs"],
20+
"dgd": "annotated_public",
21+
"header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS_r105.txt"
22+
},
23+
"cnvs": {
24+
"pval": "ctrlfreec_pval",
25+
"info": "ctrlfreec_info",
26+
"seg": "ctrlfreec_bam_seg"
27+
},
28+
"rsem": "RSEM_gene",
29+
"fusion": "annofuse_filtered_fusions_tsv",
30+
"dgd_fusion": "fusion-dgd.tsv.gz"
31+
},
32+
"dl_file_type_list": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs",
33+
"ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg", "DGD_MAF"],
34+
"ens_gene_list":"/home/ubuntu/tools/kf-cbioportal-etl/REFS/gencode27_gene_list.txt",
35+
"script_dir": "/home/ubuntu/tools/kf-cbioportal-etl/scripts/",
36+
"cbioportal_validator": "/home/ubuntu/tools/cbioportal/core/src/main/scripts/importer/validateData.py",
37+
"cna_flag": 1,
38+
"cnv_high_gain": 4,
39+
"cnv_min_len": 50000,
40+
"rna_flag": 1,
41+
"cpus": 8,
42+
"threads": 40
43+
}

STUDY_CONFIGS/pbta_all_treatment_data_processing_config.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
"file_loc_defs": {
1717
"_comment": "edit the values based on existing/anticipated source file locations, relative to working directory of the script being run",
1818
"mafs": {
19-
"kf": "annotated_public_outputs",
19+
"kf": ["annotated_public_outputs"],
2020
"header": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/maf_KF_CONSENSUS.txt"
2121
},
2222
"cnvs": {

scripts/cnv_2_merge.py

+9-9
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,10 @@
33
import os
44
import argparse
55
import json
6-
import subprocess
76
import concurrent.futures
87
import pandas as pd
98
import re
109
from get_file_metadata_helper import get_file_metadata
11-
import pdb
1210

1311

1412
def process_cnv(cnv_fn, cur_cnv_dict, samp_id):
@@ -43,7 +41,6 @@ def get_ploidy(obj):
4341

4442
def process_table(cbio_dx, file_meta_dict):
4543
try:
46-
4744
# project/disease name should be name of directory hosting datasheet
4845
sys.stderr.write("Processing " + cbio_dx + " project" + "\n")
4946
new_cnv = open(out_dir + cbio_dx + ".predicted_cnv.txt", "w")
@@ -86,9 +83,10 @@ def process_table(cbio_dx, file_meta_dict):
8683
new_cnv.write("\t" + ploidy_dict[samp])
8784
new_cnv.write("\n")
8885
new_cnv.close()
86+
return 0, cbio_tum_id
8987
except Exception as e:
90-
print(e)
91-
exit(1)
88+
print(e, file=sys.stderr)
89+
return 1, cbio_tum_id
9290

9391

9492

@@ -149,10 +147,12 @@ def process_table(cbio_dx, file_meta_dict):
149147
sys.stderr.write("output dir already exists\n")
150148
file_meta_dict = get_file_metadata(args.table, "cnv")
151149
with concurrent.futures.ProcessPoolExecutor(config_data["cpus"]) as executor:
152-
results = {
153-
executor.submit(process_table, cbio_dx, file_meta_dict): cbio_dx
154-
for cbio_dx in file_meta_dict
155-
}
150+
results = { executor.submit(process_table, cbio_dx, file_meta_dict): cbio_dx for cbio_dx in file_meta_dict }
151+
for result in concurrent.futures.as_completed(results):
152+
if result.result()[0]:
153+
print("Failed processing " + result.result()[1], file=sys.stderr)
154+
exit(1)
155+
156156
# for cbio_dx in file_meta_dict:
157157
# process_table(cbio_dx, file_meta_dict)
158158
# sys.stderr.write("Done, check logs\n")

scripts/cnv_3_gistic_style.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from get_file_metadata_helper import get_file_metadata
99
import pandas as pd
1010
import numpy as np
11+
import pdb
1112

1213
parser = argparse.ArgumentParser(
1314
description="Convert merged cnv values to discrete coded values."
@@ -107,7 +108,10 @@ def mt_adjust_cn(obj):
107108
for fname in fname_list:
108109
parts = re.search("^" + args.merged_cnv_dir + "/(.*).predicted_cnv.txt", fname)
109110
cbio_dx = parts.group(1)
110-
data = pd.read_csv(fname, sep="\t")
111+
try:
112+
data = pd.read_csv(fname, sep="\t")
113+
except Exception as e:
114+
print(e, file=sys.stderr)
111115
data.set_index("Hugo_Symbol")
112116
# sample list would be cbio ids
113117
samp_list = list(data.columns)[1:]

scripts/genomics_file_cbio_package_build.py

+3
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ def process_maf(maf_loc_dict, cbio_id_table, data_config_file, dgd_status):
4545
maf_dir = maf_loc_dict["kf"]
4646
if args.dgd_status == "dgd":
4747
maf_dir = maf_loc_dict["dgd"]
48+
else:
49+
# KF can be in multiple places
50+
maf_dir = ",".join(maf_dir)
4851
maf_header = maf_loc_dict["header"]
4952
maf_cmd = "{}maf_merge.py -t {} -i {} -m {} -j {} -f {} 2> collate_mafs.log".format(
5053
script_dir, cbio_id_table, maf_header, maf_dir, data_config_file, dgd_status

scripts/maf_merge.py

+22-33
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,7 @@ def filter_entry(entry, tum_id, norm_id, tid_idx, nid_idx, v_idx, h_idx, maf_exc
2424
return None
2525

2626

27-
def process_maf(
28-
maf_fn, new_maf, maf_exc, tum_id, norm_id
29-
):
27+
def process_maf(maf_fn, new_maf, maf_exc, tum_id, norm_id):
3028
"""
3129
Iterate over maf file, skipping header lines since the files are being merged.
3230
With possibility of mixed source, search headers
@@ -69,9 +67,7 @@ def process_maf(
6967
cur_maf.close()
7068

7169

72-
def process_tbl(
73-
cbio_dx, file_meta_dict, print_head
74-
):
70+
def process_tbl(cbio_dx, file_meta_dict, print_head):
7571
"""
7672
Probably a less likely scenario, but can split out into multiple projects based on dict
7773
"""
@@ -84,28 +80,10 @@ def process_tbl(
8480
for cbio_tum_id in file_meta_dict[cbio_dx]:
8581
cbio_norm_id = file_meta_dict[cbio_dx][cbio_tum_id]["cbio_norm_id"]
8682
fname = file_meta_dict[cbio_dx][cbio_tum_id]["fname"]
87-
sys.stderr.write(
88-
"Found relevant maf to process for "
89-
+ " "
90-
+ cbio_tum_id
91-
+ " "
92-
+ cbio_norm_id
93-
+ " "
94-
+ file_meta_dict[cbio_dx][cbio_tum_id]["kf_tum_id"]
95-
+ " "
96-
+ file_meta_dict[cbio_dx][cbio_tum_id]["kf_norm_id"]
97-
+ " "
98-
+ fname
99-
+ "\n"
100-
)
101-
sys.stderr.flush()
102-
process_maf(
103-
maf_dir + fname,
104-
new_maf,
105-
maf_exc,
106-
cbio_tum_id,
107-
cbio_norm_id,
108-
)
83+
print("Found relevant maf to process for {} {} {} {} {}".format(
84+
cbio_tum_id, cbio_norm_id, file_meta_dict[cbio_dx][cbio_tum_id]["kf_tum_id"], file_meta_dict[cbio_dx][cbio_tum_id]["kf_norm_id"], fname),
85+
file=sys.stderr)
86+
process_maf(maf_dir + fname, new_maf, maf_exc, cbio_tum_id, cbio_norm_id)
10987
x += 1
11088
sys.stderr.write(
11189
"Completed processing " + str(x) + " entries in " + cbio_dx + "\n"
@@ -130,7 +108,7 @@ def process_tbl(
130108
"-i", "--header", action="store", dest="header", help="File with maf header only"
131109
)
132110
parser.add_argument(
133-
"-m", "--maf-dir", action="store", dest="maf_dir", help="maf file directory"
111+
"-m", "--maf-dirs", action="store", dest="maf_dirs", help="comma-separated list of maf file directories"
134112
)
135113
parser.add_argument(
136114
"-j",
@@ -155,10 +133,21 @@ def process_tbl(
155133
args = parser.parse_args()
156134
with open(args.config_file) as f:
157135
config_data = json.load(f)
158-
# get maf file ext
159-
maf_dir = args.maf_dir
160-
if maf_dir[-1] != "/":
161-
maf_dir += "/"
136+
# Create symlinks to mafs in one place for ease of processing
137+
maf_dir = "MAFS/"
138+
maf_dirs_in = args.maf_dirs
139+
print("Symlinking maf files from {} to {}".format(maf_dirs_in, maf_dir), file=sys.stderr)
140+
os.makedirs("MAFS", exist_ok=True)
141+
for dirname in maf_dirs_in.split(","):
142+
abs_path = os.path.abspath(dirname)
143+
for fname in os.listdir(dirname):
144+
try:
145+
src = os.path.join(abs_path, fname)
146+
dest = os.path.join(maf_dir, fname)
147+
os.symlink(src, dest)
148+
except Exception as e:
149+
print(e, file=sys.stderr)
150+
print("Could not sym link {} in {}".format(fname, dirname))
162151
# If DGD maf only, else if both, dgd maf will be handled separately, or not at all if no dgd and kf only
163152

164153
file_meta_dict = get_file_metadata(args.table, "DGD_MAF")

0 commit comments

Comments
 (0)