Skip to content

Commit 76fe08e

Browse files
Add files via upload
1 parent 9a23bca commit 76fe08e

File tree

2 files changed

+481
-0
lines changed

2 files changed

+481
-0
lines changed
Lines changed: 280 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,280 @@
1+
---
2+
title: "deseq2_2probands_vs_unaffected"
3+
author: "Gurpreet Kaur"
4+
date: "17 March 2025"
5+
output: html_document
6+
---
7+
8+
***
9+
###### (I) Prep
10+
# Create folder for results
11+
```{r}
12+
# Create analysis folder
13+
if (!dir.exists("deseq2/results/batch-corr_limma/2probands_vs_unaffected")) {
14+
dir.create("deseq2/results/batch-corr_limma/2probands_vs_unaffected")
15+
}
16+
```
17+
18+
# Define the list of sample IDs to exclude and run DESeq
19+
```{r}
20+
# 7 probands from other 6 families (family 6 has 2 probands), mothers of family 2 (3 rep) and mother of family 8 = 12 samples -> 35-12 = 23 samples
21+
exclude_ids <- c("LW000002_r", "LW000006_r", "LW000009_r", "LW000011_r", "LW000088_r", "LW002056", "LW002066" , "LW002072",
22+
"LW000005_r", "LW000005_r_top", "LW000005_r_deep_LW002086",
23+
"LW002076")
24+
25+
# Subset colData to exclude unwanted samples and set 'Samples_id' column as the row names in the samples_subset data frame
26+
samples_subset <- samples_all4deseq_pid[!(rownames(samples_all4deseq_pid) %in% exclude_ids), ]
27+
rownames(samples_subset) <- samples_subset$Samples_id
28+
29+
library(writexl)
30+
write_xlsx(samples_subset, "./data/samples_subset.xlsx")
31+
32+
# Subset countData to match the samples
33+
counts_subset <- counts[, rownames(samples_subset)]
34+
```
35+
36+
###### (II) DESeq2 Analysis
37+
# Assign factors for DESeq2 design
38+
```{r}
39+
library(DESeq2) # DESeq2_1.36.0; R version 4.2.3
40+
samples_subset$Affected = factor(samples_subset$Affected)
41+
samples_subset$Family = factor(samples_subset$Family)
42+
samples_subset$Replicate = factor(samples_subset$Replicate)
43+
samples_subset$Batch = factor(samples_subset$Batch)
44+
samples_subset$Gender = factor(samples_subset$Gender)
45+
samples_subset$Relation = factor(samples_subset$Relation)
46+
samples_subset$Patient_id = factor(samples_subset$Patient_id)
47+
48+
# Create the new DESeqDataSet object
49+
dds_subset <- DESeqDataSetFromMatrix(countData = round(counts_subset), colData = samples_subset, design = ~ Batch + Replicate + Family + Affected)
50+
51+
dds_subset
52+
```
53+
# Pre-filtering: Keep only rows that have atleast 10 reads total
54+
```{r}
55+
keep_subset = rowSums(counts(dds_subset)) >= 10
56+
dds_subset = dds_subset[keep_subset,]
57+
dim(dds_subset)
58+
```
59+
60+
# Differential expression analysis via DESeq() [do normalization as well]
61+
```{r}
62+
dds_subset = DESeq(dds_subset)
63+
head(counts(dds_subset, normalized=TRUE))
64+
resultsNames(dds_subset)
65+
```
66+
```{r}
67+
saveRDS(dds_subset, file = "./dds_subset.rds")
68+
```
69+
70+
###### (III) Transformation and check PCA for batch effect
71+
# Run vsd
72+
```{r}
73+
vsd_subset = vst(dds_subset, blind=FALSE)
74+
head(assay(vsd_subset), 3)
75+
saveRDS(vsd_subset, file = "./vsd_subset.rds")
76+
```
77+
78+
###### (IV) Batch correction and check with PCA
79+
# limma::removeBatchEffect()
80+
```{r}
81+
counts_vst_subset = assay(vsd_subset)
82+
mm = model.matrix(~ Replicate + Family + Affected, colData(vsd_subset))
83+
counts_vst_subset_limma = limma::removeBatchEffect(counts_vst_subset, batch=vsd_subset$Batch, design=mm)
84+
write.csv(counts_vst_subset_limma, file="./deseq2/results/batch-corr_limma/2probands_vs_unaffected/counts_vst_subset_limma.csv")
85+
```
86+
87+
```{r}
88+
vsd_subset_limma = vsd_subset
89+
assay(vsd_subset_limma) = counts_vst_subset_limma
90+
saveRDS(vsd_subset_limma, file = "./vsd_subset_limma.rds")
91+
```
92+
93+
###### (V) Contrast: Affected_Yes_vs_No
94+
```{r}
95+
resultsNames(dds_subset)
96+
```
97+
98+
# (1) Retreive DEGs: res_aff_vs_unaff, genename and res_aff_vs_unaff_05
99+
```{r}
100+
res_aff_vs_unaff_subset = results(dds_subset, contrast=c("Affected", "Yes", "No"))
101+
res_aff_vs_unaff_subset = res_aff_vs_unaff_subset[order(res_aff_vs_unaff_subset$padj),]
102+
head(res_aff_vs_unaff_subset)
103+
#summary(res_aff_vs_unaff_subset)
104+
res_aff_vs_unaff_subset_df = as.data.frame(res_aff_vs_unaff_subset)
105+
106+
# Adding gene name
107+
res_aff_vs_unaff_subset_df_genename = res_aff_vs_unaff_subset_df
108+
res_aff_vs_unaff_subset_df_genename$Ensembl_ID = row.names(res_aff_vs_unaff_subset_df)
109+
res_aff_vs_unaff_subset_df_genename = merge(x=res_aff_vs_unaff_subset_df_genename, y=gene_names, by.x ="Ensembl_ID", by.y="Ensembl_ID", all.x=T)
110+
res_aff_vs_unaff_subset_df_genename = res_aff_vs_unaff_subset_df_genename[,c(dim(res_aff_vs_unaff_subset_df_genename)[2],1:dim(res_aff_vs_unaff_subset_df_genename)[2]-1)]
111+
res_aff_vs_unaff_subset_df_genename = res_aff_vs_unaff_subset_df_genename[order(res_aff_vs_unaff_subset_df_genename[,"padj"]),]
112+
rownames(res_aff_vs_unaff_subset_df_genename) = res_aff_vs_unaff_subset_df_genename$Ensembl_ID
113+
114+
# Subset: padj<0.05
115+
res_aff_vs_unaff_subset_df_genename_05 = subset(res_aff_vs_unaff_subset_df_genename, padj < 0.05) # 150 degs
116+
res_aff_vs_unaff_subset_df_genename_05 = res_aff_vs_unaff_subset_df_genename_05[order(res_aff_vs_unaff_subset_df_genename_05$padj),]
117+
head(res_aff_vs_unaff_subset_df_genename_05)
118+
#summary(res_aff_vs_unaff_subset_df_genename_05)
119+
```
120+
```{r}
121+
write.csv(res_aff_vs_unaff_subset_df_genename, file = "./deseq2/results/batch-corr_limma/2probands_vs_unaffected/res_aff_vs_unaff_subset_genename.csv" )
122+
write.csv(res_aff_vs_unaff_subset_df_genename_05, file = "./deseq2/results/batch-corr_limma/2probands_vs_unaffected/res_aff_vs_unaff_subset_genename_05.csv")
123+
```
124+
125+
# (2) DEGs visualization:
126+
## (i) Heatmaps
127+
```{r}
128+
# All 150 sig. DEGs with Family ids
129+
library(pheatmap)
130+
topgenes_aff_vs_unaff_05 = rownames(res_aff_vs_unaff_subset_df_genename_05)
131+
topgenes_aff_vs_unaff_05
132+
133+
# Subsetting assay data
134+
topgenes_aff_vs_unaff_05 = assay(vsd_subset_limma)[topgenes_aff_vs_unaff_05,]
135+
136+
# Centering the data
137+
topgenes_aff_vs_unaff_05 = topgenes_aff_vs_unaff_05 - rowMeans(topgenes_aff_vs_unaff_05)
138+
139+
df = as.data.frame(colData(vsd_subset_limma)[,c("Batch", "Replicate", "Family", "Affected")])
140+
141+
ann_col = list(
142+
Affected = c("No"= "grey90", "Yes"= "grey40"),
143+
Replicate = c("1"="saddlebrown", "2"="rosybrown", "3"="peachpuff"),
144+
Batch = c("1" ="orange", "2"="blue"),
145+
Family = c( "1" = "darkred", "2" = "salmon", "3" = "navy", "4" = "purple", "5" = "magenta", "6" = "seagreen", "7" = "chocolate", "8" = "royalblue" )
146+
)
147+
148+
# Column label
149+
matched_indices <- match(row.names(df), samples_subset$Samples_id)
150+
labels_col_df <- samples_subset[matched_indices, c("Family", "Relation", "Replicate")]
151+
# Concatenate columns to form labels
152+
labels_col <- apply(labels_col_df, 1, function(x) paste(x, collapse = " - "))
153+
154+
# Generate heatmap
155+
png("./deseq2/results/batch-corr_limma/2probands_vs_unaffected/heatmap_res_aff_vs_unaff_subset_150degs_famid.png", width = 5600, height = 3200, res = 300)
156+
ComplexHeatmap::pheatmap(topgenes_aff_vs_unaff_05, annotation_col=df, labels_row = res_aff_vs_unaff_subset_df_genename_05$gene_name,
157+
annotation_colors = ann_col,
158+
labels_col = labels_col,
159+
scale = "row", clustering_method = "complete", clustering_distance_rows = "euclidean",
160+
fontsize=10, fontsize_col=10, legend=TRUE, legend_breaks = c(-2,-1,0,1,2), fontsize_row = 5, angle_col = "45"
161+
)
162+
dev.off()
163+
```
164+
165+
# (2) DEGs visualization:
166+
## (ii) EnhancedVolcano - Sig. DEGs
167+
```{r}
168+
#devtools::install_github('kevinblighe/EnhancedVolcano') # v1.13.2
169+
library(EnhancedVolcano)
170+
171+
# Range of log2FC in res_aff_vs_unaff_subset_df_genename_05: -4.5 to 24.9
172+
# Rnge of padj: 8.426784e-06 to 4.948156e-02
173+
174+
png("./deseq2/results/batch-corr_limma/2probands_vs_unaffected/EnhancedVolcano_res_aff_vs_unaff_subset_05_degs_final.png", width = 5600, height = 3200, res=300)
175+
keyvals = ifelse(
176+
res_aff_vs_unaff_subset_df_genename$log2FoldChange < 0, 'mediumblue',
177+
ifelse(res_aff_vs_unaff_subset_df_genename$log2FoldChange > 0, 'darkgoldenrod2', 'black'))
178+
keyvals[is.na(keyvals)] <- 'black'
179+
names(keyvals)[keyvals == 'darkgoldenrod2'] <- 'Up'
180+
names(keyvals)[keyvals == 'mediumblue'] <- 'Down'
181+
182+
e = EnhancedVolcano(res_aff_vs_unaff_subset_df_genename,
183+
lab = res_aff_vs_unaff_subset_df_genename$gene_name,
184+
x = 'log2FoldChange', y = 'padj', title = '2 Probands vs. Unaffected', subtitle = '150 DEGs (padjCutoff=0.05, FCcutoff=2)',
185+
selectLab = (res_aff_vs_unaff_subset_df_genename$gene_name[res_aff_vs_unaff_subset_df_genename$gene_name %in%
186+
res_aff_vs_unaff_subset_df_genename_05$gene_name])[which(names(keyvals) %in% c('Up', 'Down'))],
187+
pCutoff = 0.05, FCcutoff = 2,
188+
cutoffLineWidth = 1,
189+
pointSize = 3.0,
190+
legendLabSize = 12, legendIconSize = 8.0, legendLabels=c('Down','Up'),
191+
drawConnectors = TRUE,
192+
colCustom = keyvals
193+
)
194+
e + ggplot2::coord_cartesian(xlim = c(-10, 25), ylim = c(0.0, 6.0)) + ggplot2::scale_x_continuous(breaks=seq(-10,25, 1))
195+
dev.off()
196+
```
197+
198+
# (3) DEGs individual plots:
199+
## (i) plotCounts DEGs
200+
```{r}
201+
# Create the deseq2/results/batch-corr_limma/plotCounts folder
202+
if (!dir.exists("deseq2/results/batch-corr_limma/2probands_vs_unaffected/plotCounts_150degs_pid")) {
203+
dir.create("deseq2/results/batch-corr_limma/2probands_vs_unaffected/plotCounts_150degs_pid")
204+
}
205+
```
206+
207+
```{r}
208+
row.names(samples_subset) = samples_subset$Samples_id
209+
```
210+
211+
```{r}
212+
# Define colors for 8 families
213+
library(RColorBrewer)
214+
additional_colors <- brewer.pal(8, "Set1")
215+
216+
# Remove two colors to replace with salmon and royalblue for specific families # for family 2 and 8
217+
additional_colors <- setdiff(additional_colors, c("salmon", "royalblue"))
218+
# Function to map specific family values to colors
219+
assign_colors <- function(family_levels)
220+
{
221+
# Start by making a named list of colors, initially empty
222+
color_map <- setNames(vector("character", length(family_levels)), family_levels)
223+
# Set specific colors for known values
224+
color_map["2"] <- "salmon"
225+
color_map["8"] <- "royalblue"
226+
# Assign remaining colors from the palette to other family values
227+
remaining_values <- setdiff(family_levels, c("2", "8"))
228+
color_map[remaining_values] <- head(additional_colors, length(remaining_values))
229+
return(color_map)
230+
}
231+
```
232+
233+
```{r}
234+
# Extract genes table from res
235+
goi_table = res_aff_vs_unaff_subset_df_genename_05
236+
237+
# Make plotCounts boxplot
238+
genes_oi_plot_Ensembl = goi_table$Ensembl_ID
239+
genes_oi_plot_gene_name = goi_table$gene_name
240+
genes_oi_plot_padj = goi_table$padj
241+
genes_oi_plot_log2FC = goi_table$log2FoldChange
242+
Family = factor(samples_subset$Family)
243+
244+
library(DESeq2)
245+
library(ggplot2)
246+
library(ggrepel)
247+
248+
for (i in seq_along(genes_oi_plot_Ensembl)) {
249+
boxplot_counts = plotCounts(dds_subset, gene=genes_oi_plot_Ensembl[i], intgroup=c("Affected"), returnData=TRUE, normalized = T)
250+
boxplot_counts$variable = row.names(boxplot_counts)
251+
252+
# Add 'Patient_id' and 'Replicate' as new columns and update row names
253+
#boxplot_counts$Patient_id = samples_subset[rownames(boxplot_counts), "Patient_id", drop = TRUE]
254+
boxplot_counts$Family = samples_subset[rownames(boxplot_counts), "Family", drop = TRUE]
255+
boxplot_counts$Relation = samples_subset[rownames(boxplot_counts), "Relation", drop = TRUE]
256+
boxplot_counts$Replicate = samples_subset[rownames(boxplot_counts), "Replicate", drop = TRUE]
257+
rownames(boxplot_counts) = with(samples_subset[rownames(boxplot_counts), ], paste(Family, Relation, Replicate, sep = "_"))
258+
259+
boxplot_counts$Family <- as.factor(boxplot_counts$Family)
260+
261+
# Retrieve unique family values and assign colors
262+
family_levels <- levels(boxplot_counts$Family)
263+
family_colors <- assign_colors(family_levels)
264+
265+
plot = ggplot(data=boxplot_counts, aes(x=Affected, y=count, fill=Affected)) +
266+
geom_boxplot(position=position_dodge()) +
267+
geom_jitter(position=position_dodge(.8)) +
268+
labs(title = paste("Gene",genes_oi_plot_gene_name[i],sep = "_",genes_oi_plot_Ensembl[i]),
269+
subtitle = paste("padj=",genes_oi_plot_padj[i],"; log2FC=",genes_oi_plot_log2FC[i])) +
270+
xlab("") + ylab("Normalized gene counts") +
271+
theme_bw() + theme(text = element_text(size=6), axis.text.x = element_text(angle=45, vjust=1,hjust=1)) +
272+
scale_fill_manual(values = c("No"= "grey90", "Yes"= "grey40")) +
273+
scale_color_manual(values = family_colors) +
274+
# geom_point(color = "grey70", size=0.2) +
275+
geom_point(data=boxplot_counts, aes(x=Affected, y=count, color=Family), size = 2) +
276+
geom_text_repel(aes(label = rownames(boxplot_counts), color=Family), min.segment.length = 0, max.overlaps = Inf, box.padding = 0.5)
277+
ggsave(filename=paste("DEG_",genes_oi_plot_gene_name[i],"_",genes_oi_plot_Ensembl[i],".png"),
278+
width=10, height=5, plot=plot, path = "./deseq2/results/batch-corr_limma/2probands_vs_unaffected/plotCounts_150degs_pid")
279+
}
280+
```

0 commit comments

Comments
 (0)