-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy path4_Figure3_Wider_contexts.R
221 lines (193 loc) · 9.79 KB
/
4_Figure3_Wider_contexts.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# Code accompanying the publication # "Mutational signature in colorectal cancer induced by genotoxic pks+ E.Coli"
# C. Pleguezuelos-Manzano, J. Puschhof and A. Rosendahl Huber et al.
# Description: Analysis of the wider context of SBS and T-deletion mutations (Figure 3A-B)
# Please direct any questions or comments to [email protected]
# ===== LOAD LOCAL FUNCTIONS AND SET VARIABLES =====
# Source script which load all preset functions and variables
library(ggseqlogo)
setwd(dir)
options(stringsAsFactors = F)
# ------ Context SNVs
# ---- Functions -----
get_context = function(vcf, size_context){
gr = rowRanges(vcf)
strand = ifelse(ref(vcf) == "G" | ref(vcf) == "A", '-', "+")
start = start(vcf)
ref = as.character(gr$REF)
alt = unlist(CharacterList(gr$ALT))
type = paste0(ref, "/", alt)
chromosome = paste0("chr", as.character(seqnames(vcf)))
context = getSeq(Hsapiens, chromosome, start = start - size_context,
end = start + size_context,
strand = strand)
context_table = data.frame(chr = chromosome, position = start, type = type, strand = strand, context = as.character(context))
rownames(context_table) = names(vcf)
return(context_table)
}
# Load data
vcfs_path <- get_vcfs("Data/STE0072/I5/SNVs/", pattern = '.vcf')
vcf_list = list()
for (i in 1:nrow(vcfs_path)){
vcf = readVcf(vcfs_path$vcf_files[i], ref_genome)
vcf_list[[vcfs_path$sample_names[i]]] = vcf
}
grl_list = lapply(vcf_list, granges)
vcfs_grl = GRangesList(grl_list)
DYE_context = lapply(vcf_list[1:3], get_context, 10)
DYE_context = do.call("rbind", EKO_context)
EKO_context = lapply(vcf_list[4:6], get_context, 10)
EKO_context = do.call("rbind", EKO_context)
EWT_context = lapply(vcf_list[7:9], get_context, 10)
EWT_context = do.call("rbind", EWT_context)
EWT_context_table = EWT_context$context
EWT_TN_context = EWT_context[grep(pattern = "A/|T/", EWT_context$type),]
EKO_TN_context = EKO_context[grep(pattern = "A/|T/", EKO_context$type),]
EKO_TN = ggseqlogo(EKO_TN_context$context) + scale_y_continuous(expand = c(0,0), limits = c(0, 2)) + theme_classic()
EWT_TN = ggseqlogo(EWT_TN_context$context) + scale_y_continuous(expand = c(0,0), limits = c(0, 2)) + theme_classic()
ggsave("Figures/Main/Figure3/Figure_3A_SeqLogo_bits_EWT.pdf", EWT_TN, width = 7, height = 3)
EWT_TN
# ----- Extended Context Indels ----
vcf_fnames = list.files("Data/STE0072/I5/Indels/", recursive = T, full.names = F)
vcf_fnames_full = paste0("Data/STE0072/I5/Indels/", vcf_fnames)
sample_names = gsub("_.*$", "", vcf_fnames)
head(sample_names)
#Read in indels. (You can't use read_vcfs_as_granges as this function removes indels)
chroms = paste0("chr", c(1:22, "X"))
vcfs_list = lapply(vcf_fnames_full, function(x){
vcf = readVcf(x, genome = "hg19")
gr = granges(vcf)
gr = remove_snvs(gr)
seqlevelsStyle(gr) = "UCSC"
seqlevels(gr, pruning.mode = "coarse") = chroms
return(gr)
})
vcfs_grl = GRangesList(vcfs_list)
names(vcfs_grl) = sample_names
grl_exp = get_indel_context(vcfs_grl, ref_genome)
indel_counts = count_indel_contexts(grl = grl_exp)
# update the get_1bp_dels function to make sure no T-homopolymer lengths are missed.
get_1bp_dels = function(gr, mut_size, ref_genome){
gr = gr[mut_size == -1]
if (length(gr) == 0){
return(gr)
}
del_bases = gr$REF %>% as.character() %>% substring(2)
withCallingHandlers({#Flank the granges object, to get a sequence, that can be searched for repeats. This can result in a warning message, when the flanked range extends beyond the chrom lenght. This message is suppressed.
gr_extended = flank(gr, 300, start = F)
}, warning = function(w) {
if (grepl("out-of-bound range located on sequence", conditionMessage(w)))
invokeRestart("muffleWarning")
})
gr_extended %<>% trim() #Trim the ranges that are extended beyond the actual length of the chromosome.
seq = getSeq(eval(as.symbol(ref_genome)), gr_extended)
seq_z = str_replace_all(as.character(seq), del_bases, rep("Z", length(seq))) #For each mut replace the deleted basetype in the flanking sequence with Zs.
homopolymer_length = gsub("[^Z].*", "", as.character(seq_z)) %>% nchar() + 1 #Remove all bases after the Zs and count how many bases are left. Add one for the deleted base itself.
del_bases[del_bases == "A"] = "T"
del_bases[del_bases == "G"] = "C"
gr$muttype = paste0(del_bases, "_deletion")
gr$muttype_sub = homopolymer_length
return(gr)
}
approach2_id_context_table = function(gr_exp) {
size_context = 10
# Remove T-homopolymer
gr_exp = gr_exp[gr_exp$muttype == "T_deletion",]
homopolymer_length = gr_exp$muttype_sub
muts = length(gr_exp)
start = start(gr_exp)
end = end(gr_exp)
ref = as.character(gr_exp$REF)
chr = as.character(seqnames(gr_exp))
strand = ifelse(substr(ref, 2, 2) == "A","-","+")
total_context = c()
for (i in 1:length(strand)){
print(i)
if (strand[i] == "+") {
upstr_context = as.character(getSeq(Hsapiens, chr[i],
start = start[i] - size_context + 1, # add +1 value since start base is the base before the homopolymer
end = start[i],
strand = strand[i]))
downstr_context = as.character(getSeq(Hsapiens, chr[i],
start = end[i],
end = end[i] + homopolymer_length[i] + size_context,
strand = strand[i]))
downstr_context = substr(downstr_context, 1, 11)
context = paste0(upstr_context, downstr_context)
}
if (strand[i] == "-") {
downstr_context = as.character(getSeq(Hsapiens, chr[i],
start = start[i] + homopolymer_length[i] - size_context,
end = start[i] + homopolymer_length[i],
strand = strand[i]))
downstr_context = substr(downstr_context, 1, 11)
upstr_context = as.character(getSeq(Hsapiens, chr[i],
start = start[i] + homopolymer_length[i] + 1 ,# add +1 value since start base is the base before the homopolymer
end = start[i] + homopolymer_length[i] + size_context,
strand = strand[i]))
context = paste0(upstr_context, downstr_context)
}
total_context = c(total_context, context)
}
return(total_context)
}
total_context_EWT = lapply(grl_exp[7:9], approach2_id_context_table)
total_context_EWT = unlist(total_context_EWT)
Logo_plot_EWT = ggseqlogo(total_context_EWT) + theme_BM() +scale_y_continuous(expand = c(0,0), limits = c(0, 2))
Logo_plot_EWT
ggsave("Figures/Main/Figure3/Figure_3B_Seqlogo_indels_non_condensed_EWT.pdf", Logo_plot_EWT, width = 7, height = 3)
# ---- Plot Extended data figure 4 -------
# - approach 2 with different indel sizes:
grl_exp_EWT = GRangesList(unlist(grl_exp[7:9]))
names(grl_exp_EWT) = "Total_Tdels_EWT"
approach_2_id_context_length = function(grl_exp, length_hp) {
for (name in names(grl_exp)){
gr_exp = grl_exp[[name]]
size_context = 10
gr_exp = gr_exp[gr_exp$muttype == "T_deletion",]
gr_exp = gr_exp[gr_exp$muttype_sub == length_hp,]
homopolymer_length = gr_exp$muttype_sub
start = as.numeric(start(gr_exp))
end = as.numeric(end(gr_exp))
chr = as.character(seqnames(gr_exp))
ref = gr_exp$REF
strand = ifelse(substr(ref, 2, 2) == "A","-","+")
total_context = c()
for (i in 1:length(strand)){
if (strand[i] == "+") {
upstr_context = as.character(getSeq(Hsapiens, chr[i],
start = start[i] - size_context + 1, # add +1 value since start base is the base before the homopolymer
end = start[i],
strand = strand[i]))
downstr_context = as.character(getSeq(Hsapiens, chr[i],
start = end[i],
end = end[i] + homopolymer_length[i] + size_context,
strand = strand[i]))
downstr_context = substr(downstr_context, 1, 11)
context = paste0(upstr_context, downstr_context)
}
if (strand[i] == "-") {
downstr_context = as.character(getSeq(Hsapiens, chr[i],
start = start[i] + homopolymer_length[i] - size_context, # add +1 value since start base is the base before the homopolymer
end = start[i] + homopolymer_length[i],
strand = strand[i]))
downstr_context = substr(downstr_context, 1, 11)
upstr_context = as.character(getSeq(Hsapiens, chr[i],
start = start[i] + homopolymer_length[i] + 1 ,
end = start[i] + homopolymer_length[i] + size_context,
strand = strand[i]))
context = paste0(upstr_context, downstr_context)
}
total_context = c(total_context, context)
}
plot_mutation_profile(total_context, paste0("Figures/ExtendedData/Extended_data_figure_4/",name, "T_homopolymer", as.character(length_hp),"approach2"))
}
}
# Plot the 10bp mutational context for deletions in T-homopolymers of length 1 till 8 bp
approach_2_id_context_length(grl_exp_EWT, 1)
approach_2_id_context_length(grl_exp_EWT, 2)
approach_2_id_context_length(grl_exp_EWT, 3)
approach_2_id_context_length(grl_exp_EWT, 4)
approach_2_id_context_length(grl_exp_EWT, 5)
approach_2_id_context_length(grl_exp_EWT, 6)
approach_2_id_context_length(grl_exp_EWT, 7)
approach_2_id_context_length(grl_exp_EWT, 8)