22title : " RNA-Seq differential gene expression analysis for muscular dystrophy samples"
33author : " Gurpreet Kaur"
44date : " `r Sys.Date()`"
5- output : html_document
5+ output : github_document
66---
77
88``` {r setup, include=FALSE}
@@ -14,7 +14,7 @@ getwd()
1414***
1515# BiocManager Installation
1616``` {r, BiocManager Installation}
17- if (!requireNamespace("BiocManager", quietly = TRUE))
17+ if (!requireNamespace("BiocManager", quietly = TRUE))
1818install.packages("BiocManager")
1919```
2020
@@ -49,15 +49,15 @@ if (!dir.exists("deseq2/results/batch-corr_limma/plotCounts")) {
4949``` {r, read counts file}
5050library(readr)
5151counts = read_tsv(file.path(getwd(),"star_salmon","salmon.merged.gene_counts_length_scaled.tsv"))
52- counts = data.frame(counts, row.names = 1)
52+ counts = data.frame(counts, row.names = 1)
5353counts$Ensembl_ID = row.names(counts)
5454drop = c("Ensembl_ID","gene_name")
55- gene_names = counts[,drop]
56- counts = counts[ , !(names(counts) %in% drop)]
55+ gene_names = counts[,drop]
56+ counts = counts[ , !(names(counts) %in% drop)]
5757head(counts,5)
5858```
5959
60- ``` {r}
60+ ``` {r}
6161# Analyze sample-related data (metadata), edit for desired columns, make in same sample order as per counts and read in
6262library(readr)
6363samples_all4deseq = read_csv(file.path(getwd(),"data","samples_all_edit.csv"))
@@ -66,23 +66,23 @@ View(samples_all4deseq)
6666```
6767
6868``` {r}
69- # Check consistency in samples name and order in counts and samples_all4deseq
69+ # Check consistency in samples name and order in counts and samples_all4deseq
7070## Edit and read file (sample_all_edit.csv) again if not TRUE
71- all(rownames(samples_all4deseq) %in% colnames(counts))
72- all(rownames(samples_all4deseq) == colnames(counts))
71+ all(rownames(samples_all4deseq) %in% colnames(counts))
72+ all(rownames(samples_all4deseq) == colnames(counts))
7373```
7474
7575# 2) DESeq2 Analysis
7676``` {r, assign factors for DESeq2 design}
7777library(DESeq2) # DESeq2_1.36.0; R version 4.2.3
7878samples_all4deseq
79- samples_all4deseq$Family = factor(samples_all4deseq$Family)
80- samples_all4deseq$Affected = factor(samples_all4deseq$Affected)
81- samples_all4deseq$Replicate = factor(samples_all4deseq$Replicate)
82- samples_all4deseq$Batch = factor(samples_all4deseq$Batch)
79+ samples_all4deseq$Family = factor(samples_all4deseq$Family)
80+ samples_all4deseq$Affected = factor(samples_all4deseq$Affected)
81+ samples_all4deseq$Replicate = factor(samples_all4deseq$Replicate)
82+ samples_all4deseq$Batch = factor(samples_all4deseq$Batch)
8383# Need later for labelling
84- samples_all4deseq$Gender = factor(samples_all4deseq$Gender)
85- samples_all4deseq$Relation = factor(samples_all4deseq$Relation)
84+ samples_all4deseq$Gender = factor(samples_all4deseq$Gender)
85+ samples_all4deseq$Relation = factor(samples_all4deseq$Relation)
8686```
8787
8888``` {r, Affected_Yes_vs_No}
@@ -92,7 +92,7 @@ dim(dds)
9292
9393``` {r}
9494# Pre-filtering: Keep only rows that have atleast 10 reads total
95- keep = rowSums(counts(dds)) >= 10
95+ keep = rowSums(counts(dds)) >= 10
9696dds = dds[keep,]
9797dim(dds)
9898```
@@ -114,9 +114,9 @@ counts_norm = counts(dds, normalized=TRUE)
114114```
115115
116116***
117- # 3) Count data transformation
117+ # 3) Count data transformation
118118``` {r}
119- vsd = vst(dds, blind=FALSE)
119+ vsd = vst(dds, blind=FALSE)
120120```
121121
122122***
@@ -144,9 +144,9 @@ res_aff_vs_unaff = results(dds, contrast=c("Affected", "Yes", "No"))
144144res_aff_vs_unaff= res_aff_vs_unaff[order(res_aff_vs_unaff$padj),]
145145head(res_aff_vs_unaff)
146146summary(res_aff_vs_unaff)
147- write.csv(res_aff_vs_unaff, file="./deseq2/results/batch-corr_limma/res_aff_vs_unaff.csv")
147+ write.csv(res_aff_vs_unaff, file="./deseq2/results/batch-corr_limma/res_aff_vs_unaff.csv")
148148res_aff_vs_unaff_df = as.data.frame(res_aff_vs_unaff)
149- res_aff_vs_unaff_05 = subset(res_aff_vs_unaff_df, padj < 0.05) # 1957 obs.
149+ res_aff_vs_unaff_05 = subset(res_aff_vs_unaff_df, padj < 0.05) # 1957 obs.
150150```
151151
152152## (ii) plotCounts: Genes
@@ -165,7 +165,7 @@ row.names(samples_all4deseq_pid) = samples_all4deseq_pid$Samples_id
165165
166166``` {r}
167167# Genes of interest (goi)
168- genes = c("VMA21","ENO3")
168+ genes = c("VMA21","ENO3")
169169vsd_limma_genename = merge(x=gene_names, y=assay(vsd_limma), by.x = "Ensembl_ID", by.y="row.names")
170170goi = intersect(genes, vsd_limma_genename$gene_name)
171171```
@@ -180,11 +180,11 @@ write.csv(goi_table, file="./deseq2/results/batch-corr_limma/res_aff_vs_unaff_df
180180# Define colors for 8 families
181181library(RColorBrewer)
182182additional_colors <- brewer.pal(8, "Set1")
183-
183+
184184# Remove two colors to replace with salmon and royalblue for specific families # for family 2 and 8
185- additional_colors <- setdiff(additional_colors, c("salmon", "royalblue"))
185+ additional_colors <- setdiff(additional_colors, c("salmon", "royalblue"))
186186# Function to map specific family values to colors
187- assign_colors <- function(family_levels)
187+ assign_colors <- function(family_levels)
188188 {
189189 # Start by making a named list of colors, initially empty
190190 color_map <- setNames(vector("character", length(family_levels)), family_levels)
@@ -202,7 +202,7 @@ genes_oi_plot_Ensembl = goi_table$Ensembl_ID
202202genes_oi_plot_gene_name = goi_table$gene_name
203203genes_oi_plot_padj = goi_table$padj
204204genes_oi_plot_log2FC = goi_table$log2FoldChange
205- Family = factor(samples_all4deseq_pid$Family)
205+ Family = factor(samples_all4deseq_pid$Family)
206206
207207library(DESeq2)
208208library(ggplot2)
@@ -211,31 +211,31 @@ library(ggrepel)
211211for (i in seq_along(genes_oi_plot_Ensembl)) {
212212boxplot_counts = plotCounts(dds, gene=genes_oi_plot_Ensembl[i], intgroup=c("Affected"), returnData=TRUE, normalized = T)
213213boxplot_counts$variable = row.names(boxplot_counts)
214-
214+
215215 # Add 'Patient_id' and 'Replicate' as new columns and update row names
216216 boxplot_counts$Replicate = samples_all4deseq_pid[rownames(boxplot_counts), "Replicate", drop = TRUE]
217217 boxplot_counts$Patient_id = samples_all4deseq_pid[rownames(boxplot_counts), "Patient_id", drop = TRUE]
218218 boxplot_counts$Family = samples_all4deseq_pid[rownames(boxplot_counts), "Family", drop = TRUE]
219219 rownames(boxplot_counts) = with(samples_all4deseq_pid[rownames(boxplot_counts), ], paste(Family, Relation, Replicate, sep = "_"))
220-
220+
221221 boxplot_counts$Family <- as.factor(boxplot_counts$Family)
222-
222+
223223 # Retrieve unique family values and assign colors
224224 family_levels <- levels(boxplot_counts$Family)
225225 family_colors <- assign_colors(family_levels)
226-
226+
227227 plot = ggplot(data=boxplot_counts, aes(x=Affected, y=count, fill=Affected)) +
228- geom_boxplot(position=position_dodge()) +
229- geom_jitter(position=position_dodge(.8)) +
230- labs(title = paste("Gene",genes_oi_plot_gene_name[i],sep = "_",genes_oi_plot_Ensembl[i]),
231- subtitle = paste("padj=",genes_oi_plot_padj[i],"; log2FC=",genes_oi_plot_log2FC[i])) +
232- xlab("") + ylab("Normalized gene counts") +
233- theme_bw() + theme(text = element_text(size=6), axis.text.x = element_text(angle=45, vjust=1,hjust=1)) +
234- scale_fill_manual(values = c("No"= "grey90", "Yes"= "grey40")) +
228+ geom_boxplot(position=position_dodge()) +
229+ geom_jitter(position=position_dodge(.8)) +
230+ labs(title = paste("Gene",genes_oi_plot_gene_name[i],sep = "_",genes_oi_plot_Ensembl[i]),
231+ subtitle = paste("padj=",genes_oi_plot_padj[i],"; log2FC=",genes_oi_plot_log2FC[i])) +
232+ xlab("") + ylab("Normalized gene counts") +
233+ theme_bw() + theme(text = element_text(size=6), axis.text.x = element_text(angle=45, vjust=1,hjust=1)) +
234+ scale_fill_manual(values = c("No"= "grey90", "Yes"= "grey40")) +
235235 scale_color_manual(values = family_colors) +
236236 geom_point(data=boxplot_counts, aes(x=Affected, y=count, color=Family), size = 2) +
237- geom_text_repel(aes(label = rownames(boxplot_counts), color=Family), min.segment.length = 0, max.overlaps = Inf, box.padding = 0.5)
238- ggsave(filename=paste("Gene-of-interest",genes_oi_plot_gene_name[i],"_",genes_oi_plot_Ensembl[i],".png"),
237+ geom_text_repel(aes(label = rownames(boxplot_counts), color=Family), min.segment.length = 0, max.overlaps = Inf, box.padding = 0.5)
238+ ggsave(filename=paste("Gene-of-interest",genes_oi_plot_gene_name[i],"_",genes_oi_plot_Ensembl[i],".png"),
239239 width=10, height=5, plot=plot, path = "./deseq2/results/batch-corr_limma/plotCounts_noid")
240240}
241241```
@@ -259,19 +259,19 @@ res_2_vs_8_df_genename = res_2_vs_8_df_genename[order(res_2_vs_8_df_genename[,"p
259259write.csv(res_2_vs_8_df_genename,file="./deseq2/results/batch-corr_limma/family2vs8/res_2_vs_8_genename.csv" )
260260
261261# Subset: padj<0.05 }
262- res_2_vs_8_df_genename_05= subset(res_2_vs_8_df_genename, padj < 0.05)
262+ res_2_vs_8_df_genename_05= subset(res_2_vs_8_df_genename, padj < 0.05)
263263res_2_vs_8_df_genename_05 = res_2_vs_8_df_genename_05[order(res_2_vs_8_df_genename_05$padj),]
264264head(res_2_vs_8_df_genename_05)
265265summary(res_2_vs_8_df_genename_05)
266266write.csv(res_2_vs_8_df_genename_05, file="./deseq2/results/batch-corr_limma/family2vs8/res_2_vs_8_df_genename_05.csv") # 137
267267```
268268
269269## (ii) Volcano plot
270- ``` {r}
270+ ``` {r}
271271#devtools::install_github('kevinblighe/EnhancedVolcano') # v1.13.2; FCcutoff = 1 (default)
272272library(EnhancedVolcano)
273273png("./deseq2/results/batch-corr_limma/family2vs8/EnhancedVolcano_res_2_vs_8_all-degs.png", width = 2600, height = 3000, res=300)
274- EnhancedVolcano(res_2_vs_8_df_genename, lab = res_2_vs_8_df_genename$gene_name,
274+ EnhancedVolcano(res_2_vs_8_df_genename, lab = res_2_vs_8_df_genename$gene_name,
275275 x = 'log2FoldChange', y = 'padj', title = 'Family 2 vs 8', subtitle = 'All DEGs (padjcutoff=0.05, FCcutoff = 1)' ,
276276 pCutoff = 0.05, FCcutoff = 1, pointSize = 2.0, labSize = 3.0, legendLabels=c('Not sig.','log2FC','padj', 'padj & log2FC'))
277277dev.off()
@@ -303,46 +303,46 @@ genes_oi_plot_Ensembl = goi_table_2_vs_8$Ensembl_ID
303303genes_oi_plot_gene_name = goi_table_2_vs_8$gene_name
304304genes_oi_plot_padj = goi_table_2_vs_8$padj
305305genes_oi_plot_log2FC = goi_table_2_vs_8$log2FoldChange
306- Family = factor(samples_family2and8$Family)
306+ Family = factor(samples_family2and8$Family)
307307
308308for (i in seq_along(genes_oi_plot_Ensembl)) {
309309boxplot_counts = plotCounts(dds, gene=genes_oi_plot_Ensembl[i], intgroup=c("Affected"), returnData=TRUE, normalized = T)
310310boxplot_counts$variable = row.names(boxplot_counts)
311-
311+
312312 # Find intersection of rownames
313313 common_rownames = intersect(rownames(boxplot_counts), rownames(samples_family2and8)) # Family 2 and 8
314-
314+
315315 # Subset boxplot_counts based on common rownames
316316 boxplot_counts_2_and_8 = boxplot_counts[common_rownames, , drop=FALSE]
317-
317+
318318 # Add 'Patient_id' and 'Replicate' as new columns and update row names
319319 boxplot_counts_2_and_8$Replicate = samples_family2and8[rownames(boxplot_counts_2_and_8), "Replicate", drop = TRUE]
320320 boxplot_counts_2_and_8$Patient_id = samples_family2and8[rownames(boxplot_counts_2_and_8), "Patient_id", drop = TRUE]
321321 boxplot_counts_2_and_8$Family = samples_family2and8[rownames(boxplot_counts_2_and_8), "Family", drop = TRUE]
322322 rownames(boxplot_counts_2_and_8) = with(samples_family2and8[rownames(boxplot_counts_2_and_8), ], paste(Patient_id, Replicate, sep = "_"))
323323
324324 # Plot
325- boxplot_counts_2_and_8$Family = as.factor(boxplot_counts_2_and_8$Family)
325+ boxplot_counts_2_and_8$Family = as.factor(boxplot_counts_2_and_8$Family)
326326 plot = ggplot2::ggplot(data=boxplot_counts_2_and_8, aes(x=Affected, y=count, fill=Affected)) +
327- geom_boxplot(position=position_dodge()) +
328- geom_jitter(position=position_dodge(.8)) +
329- labs(title = paste("Gene",genes_oi_plot_gene_name[i],sep = "_",genes_oi_plot_Ensembl[i]),
330- subtitle = paste("padj=",genes_oi_plot_padj[i],"; log2FC=",genes_oi_plot_log2FC[i])) +
331- xlab("") + ylab("Normalized gene counts") +
332- theme_bw() + theme(text = element_text(size=6), axis.text.x = element_text(angle=45, vjust=1,hjust=1)) +
333- scale_fill_manual(values = c("No"= "grey90", "Yes"= "grey40")) +
327+ geom_boxplot(position=position_dodge()) +
328+ geom_jitter(position=position_dodge(.8)) +
329+ labs(title = paste("Gene",genes_oi_plot_gene_name[i],sep = "_",genes_oi_plot_Ensembl[i]),
330+ subtitle = paste("padj=",genes_oi_plot_padj[i],"; log2FC=",genes_oi_plot_log2FC[i])) +
331+ xlab("") + ylab("Normalized gene counts") +
332+ theme_bw() + theme(text = element_text(size=6), axis.text.x = element_text(angle=45, vjust=1,hjust=1)) +
333+ scale_fill_manual(values = c("No"= "grey90", "Yes"= "grey40")) +
334334 scale_color_manual(values = c("2" = "salmon", "8" = "royalblue")) +
335335 #geom_point(data=boxplot_counts_2_and_8, aes(x=Affected, y=count, color=Family), size = 2) +
336336 geom_point(data=boxplot_counts_2_and_8, aes(color=Family), size = 2) +
337- ggrepel::geom_text_repel(aes(label = rownames(boxplot_counts_2_and_8), color=Family), min.segment.length = 0, max.overlaps = Inf, box.padding = 0.5)
338-
339- ggsave(filename=paste("Gene-of-interest_",genes_oi_plot_gene_name[i],"_",genes_oi_plot_Ensembl[i],".png"),
337+ ggrepel::geom_text_repel(aes(label = rownames(boxplot_counts_2_and_8), color=Family), min.segment.length = 0, max.overlaps = Inf, box.padding = 0.5)
338+
339+ ggsave(filename=paste("Gene-of-interest_",genes_oi_plot_gene_name[i],"_",genes_oi_plot_Ensembl[i],".png"),
340340 width=10, height=5, plot=plot, path = "./deseq2/results/batch-corr_limma/family2vs8/plotCounts_goi")
341341}
342342```
343343
344344***
345- # Save packages info
345+ # Save packages info
346346``` {r}
347347sink("./sessionInfo.txt")
348348sessionInfo()
0 commit comments