Skip to content

Commit 094e37b

Browse files
authored
Merge pull request #28 from kids-first/bug/mb-fix-config
🔧 fixed pbta config
2 parents c21e05d + 1e2f25f commit 094e37b

File tree

2 files changed

+251
-9
lines changed

2 files changed

+251
-9
lines changed
Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
# Author: Run Jin
2+
# Add PedCBio Sample Name Column Addition
3+
4+
suppressPackageStartupMessages({
5+
library("optparse")
6+
library("tidyverse")
7+
library("readr")
8+
library("tidyr")
9+
})
10+
11+
#### Parse command line options ------------------------------------------------
12+
option_list <- list(
13+
make_option(c("-i","--histology_file"),type="character",
14+
help="histology file to update"),
15+
make_option(c("-c","--cbio_names_dir"),type="character",
16+
help="directory with .csv files ONLY with existing cBio sample names from D3b warehouse"),
17+
make_option(c("-o","--output_path"),type="character",
18+
help="output path + file name")
19+
20+
21+
)
22+
opt <- parse_args(OptionParser(option_list=option_list))
23+
# Get file paths
24+
hist_file <- opt$histology_file
25+
input_dir <- opt$cbio_names_dir
26+
out_path <- opt$output_path
27+
28+
### Read in files
29+
# histology file
30+
histology_df <- readr::read_tsv(hist_file, guess_max = 100000)
31+
32+
# tmp update for broad histology bug, to be fixed in v11
33+
histology_df <- histology_df %>%
34+
mutate(broad_histology = ifelse(broad_histology == "Hematologic malignancies",
35+
"Hematologic malignancy", broad_histology),
36+
# move GTEX to cancer group for pedcbio viz
37+
cancer_group = case_when(
38+
cohort == "GTEx" ~ paste0(gtex_group, " (GTEx)"),
39+
TRUE ~ as.character(cancer_group)),
40+
harmonized_diagnosis = case_when(
41+
cohort == "GTEx" ~ paste0(gtex_subgroup, " (GTEx)"),
42+
# add NBL subtyping to harm dx
43+
(cohort == "TARGET" | cohort == "GMKF") & !is.na(molecular_subtype) ~ paste0(cancer_group, ", ", molecular_subtype),
44+
# for remaining without a harm dx, use cancer group
45+
is.na(harmonized_diagnosis) ~ cancer_group,
46+
TRUE ~ as.character(harmonized_diagnosis)))
47+
48+
filled_harm_dx <- histology_df %>%
49+
filter(!is.na(harmonized_diagnosis)) # 32652
50+
51+
table(histology_df$cohort) # 17382 gtex
52+
table(histology_df$sample_type) # 15270 tumor
53+
54+
# read in all the pedcbio files
55+
cbio_names_list <- list.files(input_dir)
56+
57+
files_list <- lapply(cbio_names_list, function(x){
58+
x <- readr::read_csv(file.path(input_dir, x))
59+
})
60+
61+
62+
### Update the ID for those samples if available
63+
formatted_specimens_list <- lapply(files_list, function(x){
64+
65+
# format them and select relevant columns
66+
formatted <- x %>%
67+
dplyr::mutate(specimen_id = gsub("\\{", "", specimen_id)) %>%
68+
dplyr::mutate(specimen_id = gsub("\\}", "", specimen_id)) %>%
69+
dplyr::mutate(Kids_First_Biospecimen_ID = strsplit(specimen_id, ",")) %>%
70+
tidyr::unnest(Kids_First_Biospecimen_ID) %>%
71+
dplyr::select(Kids_First_Biospecimen_ID, formatted_sample_id)
72+
73+
# add those to histology file
74+
histology_formatted_added <- histology_df %>%
75+
dplyr::filter(Kids_First_Biospecimen_ID %in% formatted$Kids_First_Biospecimen_ID) %>%
76+
left_join(formatted)
77+
78+
return(histology_formatted_added)
79+
})
80+
81+
# get the combined data
82+
combined_histology_formatted_added <- do.call("rbind", formatted_specimens_list)
83+
84+
85+
### Add IDs to those without sample id already
86+
# find the samples that do not have an ID yet
87+
histology_df_no_format_id <- histology_df %>%
88+
dplyr::filter(!Kids_First_Biospecimen_ID %in% combined_histology_formatted_added$Kids_First_Biospecimen_ID)
89+
90+
91+
#### Handle each cohort at a time - start with PBTA
92+
# get all sample IDs in the PBTA cohort
93+
sample_ids_pbta <- histology_df_no_format_id %>%
94+
dplyr::filter(cohort == "PBTA") %>%
95+
pull(sample_id) %>%
96+
unique()
97+
98+
# find the samples that need additional `tie-breaker`
99+
specimens_id_need_tiebreak <- c()
100+
101+
for (i in 1:length(sample_ids_pbta)){
102+
# deal with one sample at a time
103+
sample_id_of_interest <- sample_ids_pbta[i]
104+
105+
# find the number of compositions
106+
each_specimen_need_tiebreak <- histology_df_no_format_id %>%
107+
dplyr::filter(sample_type == "Tumor") %>%
108+
dplyr::filter(sample_id == sample_id_of_interest) %>%
109+
group_by(experimental_strategy) %>%
110+
dplyr::mutate(n_sample_type = n()) %>%
111+
dplyr::filter(n_sample_type >1) %>%
112+
pull(Kids_First_Biospecimen_ID)
113+
114+
# store the samples with multiple compositions
115+
specimens_id_need_tiebreak <- c(specimens_id_need_tiebreak, each_specimen_need_tiebreak)
116+
}
117+
118+
# see the samples that need tiebreak
119+
pbta_add_tiebreak <- histology_df_no_format_id %>%
120+
dplyr::filter(Kids_First_Biospecimen_ID %in% specimens_id_need_tiebreak,
121+
sample_type != "Normal")
122+
123+
# For DNA, we can use aliquot ID to separate them
124+
# For RNA, different specimens are generated from the same aliqout but used different seuqencing method, hence RNA_library type can be used to separate them
125+
pbta_add_tiebreak <- pbta_add_tiebreak %>%
126+
dplyr::mutate(formatted_sample_id = case_when(
127+
experimental_strategy == "RNA-Seq" ~ paste0(sample_id, "-", RNA_library),
128+
experimental_strategy != "RNA-Seq" ~ paste0(sample_id, "-", aliquot_id)
129+
))
130+
131+
stopifnot(nrow(pbta_add_tiebreak) != unique(pbta_add_tiebreak$formatted_sample_id))
132+
133+
### Generate pedcbio ID for GTEx dataset
134+
# The format of the sample ID is GTEX-[donor ID]-[tissue site ID]-SM-[aliquot ID].
135+
# The donor ID (e.g. GTEX-14753) should be used to link between the various RNA-seq and genotype samples that come from the same donor.
136+
gtex_add_tiebreak <- histology_df_no_format_id %>%
137+
dplyr::filter(cohort == "GTEx") %>%
138+
dplyr::mutate(formatted_sample_id = paste0(Kids_First_Participant_ID, "-", aliquot_id))
139+
140+
stopifnot(nrow(gtex_add_tiebreak) != unique(gtex_add_tiebreak$formatted_sample_id))
141+
142+
143+
### Generate pedcbio ID for TARGET dataset
144+
# Similar to PBTA cohort, only need to add a tiebreak when multiple samples with the same experimental strategy
145+
# The different is - we need to use `Kids_First_Participant_ID`
146+
147+
# get all participant IDs
148+
participant_ids_target <- histology_df_no_format_id %>%
149+
dplyr::filter(cohort == "TARGET") %>%
150+
pull(Kids_First_Participant_ID) %>%
151+
unique()
152+
153+
# find the samples that need additional `tie-breaker`
154+
specimens_id_need_tiebreak <- c()
155+
156+
for (i in 1:length(participant_ids_target)){
157+
# deal with one sample at a time
158+
participant_of_interest <- participant_ids_target[i]
159+
160+
# find the number of compositions
161+
each_specimen_need_tiebreak <- histology_df_no_format_id %>%
162+
dplyr::filter(Kids_First_Participant_ID == participant_of_interest) %>%
163+
group_by(experimental_strategy) %>%
164+
dplyr::mutate(n_sample_type = n()) %>%
165+
dplyr::filter(n_sample_type >1) %>%
166+
pull(Kids_First_Biospecimen_ID)
167+
168+
# store the samples with multiple compositions
169+
specimens_id_need_tiebreak <- c(specimens_id_need_tiebreak, each_specimen_need_tiebreak)
170+
}
171+
172+
# see the samples that need tiebreak
173+
target_add_tiebreak <- histology_df_no_format_id %>%
174+
dplyr::filter(Kids_First_Biospecimen_ID %in% specimens_id_need_tiebreak)
175+
176+
# For TARGET cohort, looks like using the last 7 digits from the specimen ID + Participant ID should be enough
177+
target_add_tiebreak <- target_add_tiebreak %>%
178+
dplyr::mutate(formatted_sample_id = gsub("^.{0,10}", "", Kids_First_Biospecimen_ID)) %>%
179+
dplyr::mutate(formatted_sample_id = case_when(
180+
sample_type == "Normal" ~ paste0(formatted_sample_id, "-N"),
181+
TRUE ~ formatted_sample_id
182+
))
183+
stopifnot(nrow(target_add_tiebreak) != unique(target_add_tiebreak$formatted_sample_id))
184+
185+
186+
### Generate pedcbio ID for TCGA
187+
# TCGA-{TSS}-{Participant}-{Sample Vial}-{Portion Analyte}-{Plate}-{Center}
188+
# Again using the same method as PBTA and TARGET
189+
190+
# get all participant IDs
191+
participant_ids_tcga <- histology_df_no_format_id %>%
192+
dplyr::filter(cohort == "TCGA") %>%
193+
pull(Kids_First_Participant_ID) %>%
194+
unique()
195+
196+
# find the samples that need additional `tie-breaker`
197+
specimens_id_need_tiebreak <- c()
198+
199+
for (i in 1:length(participant_ids_tcga)){
200+
# deal with one sample at a time
201+
participant_of_interest <- participant_ids_tcga[i]
202+
203+
# find the number of compositions
204+
each_specimen_need_tiebreak <- histology_df_no_format_id %>%
205+
dplyr::filter(Kids_First_Participant_ID == participant_of_interest) %>%
206+
group_by(experimental_strategy) %>%
207+
dplyr::mutate(n_sample_type = n()) %>%
208+
dplyr::filter(n_sample_type >1) %>%
209+
pull(Kids_First_Biospecimen_ID)
210+
211+
# store the samples with multiple compositions
212+
specimens_id_need_tiebreak <- c(specimens_id_need_tiebreak, each_specimen_need_tiebreak)
213+
}
214+
215+
# see the samples that need tiebreak
216+
tcga_add_tiebreak <- histology_df_no_format_id %>%
217+
dplyr::filter(Kids_First_Biospecimen_ID %in% specimens_id_need_tiebreak)
218+
219+
# From what we can see, using Kids_First_Biospecimen_ID minus the center code is sufficient
220+
tcga_add_tiebreak <- tcga_add_tiebreak %>%
221+
dplyr::mutate(formatted_sample_id = stringr::str_sub(Kids_First_Biospecimen_ID, start = 9, end = -4)) %>%
222+
dplyr::mutate(formatted_sample_id = case_when(
223+
sample_type == "Normal" ~ paste0(formatted_sample_id, "-N"),
224+
TRUE ~ formatted_sample_id
225+
))
226+
227+
stopifnot(nrow(tcga_add_tiebreak) != unique(tcga_add_tiebreak$formatted_sample_id))
228+
229+
### Combine all the samples with tiebreak
230+
# combine all tie break needed samples
231+
all_tiebreaks <- bind_rows(combined_histology_formatted_added,
232+
pbta_add_tiebreak,
233+
gtex_add_tiebreak,
234+
target_add_tiebreak,
235+
tcga_add_tiebreak)
236+
237+
# for samples no need for tie break - use `sample_id` for PBTA and participant id for the rest
238+
no_need_for_tiebreaks <- histology_df %>%
239+
dplyr::filter(!Kids_First_Biospecimen_ID %in% all_tiebreaks$Kids_First_Biospecimen_ID) %>%
240+
dplyr::mutate(formatted_sample_id = case_when(
241+
cohort == "PBTA" ~ sample_id,
242+
cohort == "DGD" ~ sample_id,
243+
TRUE ~ Kids_First_Participant_ID
244+
))
245+
246+
# combine the files
247+
histology_all_fixed <- bind_rows(all_tiebreaks, no_need_for_tiebreaks)
248+
249+
# write out the results
250+
histology_all_fixed %>%
251+
readr::write_tsv(out_path)

STUDY_CONFIGS/pbta_all_case_meta_config.json

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -161,15 +161,6 @@
161161
"genetic_alteration_type": "CLINICAL",
162162
"datatype": "SAMPLE_ATTRIBUTES"
163163
}
164-
},
165-
"genes": {
166-
"_comment": "see https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#gene-panel-matrix-file for detailed specifics",
167-
"cbio_name": "data_gene_matrix_CHOP.txt",
168-
"meta_file_attr": {
169-
"genetic_alteration_type": "GENE_PANEL_MATRIX",
170-
"datatype": "GENE_PANEL_MATRIX",
171-
"data_filename": "data_gene_matrix_CHOP.txt"
172-
}
173164
}
174165
}
175166
},

0 commit comments

Comments
 (0)