|
| 1 | +--- |
| 2 | +title: "deseq2_2probands_vs_unaffected" |
| 3 | +author: "Gurpreet Kaur" |
| 4 | +date: "17 March 2025" |
| 5 | +output: html_document |
| 6 | +--- |
| 7 | + |
| 8 | +*** |
| 9 | +###### (I) Prep |
| 10 | +# Create folder for results |
| 11 | +```{r} |
| 12 | +# Create analysis folder |
| 13 | +if (!dir.exists("deseq2/results/batch-corr_limma/2probands_vs_unaffected")) { |
| 14 | + dir.create("deseq2/results/batch-corr_limma/2probands_vs_unaffected") |
| 15 | +} |
| 16 | +``` |
| 17 | + |
| 18 | +# Define the list of sample IDs to exclude and run DESeq |
| 19 | +```{r} |
| 20 | +# 7 probands from other 6 families (family 6 has 2 probands), mothers of family 2 (3 rep) and mother of family 8 = 12 samples -> 35-12 = 23 samples |
| 21 | +exclude_ids <- c("LW000002_r", "LW000006_r", "LW000009_r", "LW000011_r", "LW000088_r", "LW002056", "LW002066" , "LW002072", |
| 22 | + "LW000005_r", "LW000005_r_top", "LW000005_r_deep_LW002086", |
| 23 | + "LW002076") |
| 24 | +
|
| 25 | +# Subset colData to exclude unwanted samples and set 'Samples_id' column as the row names in the samples_subset data frame |
| 26 | +samples_subset <- samples_all4deseq_pid[!(rownames(samples_all4deseq_pid) %in% exclude_ids), ] |
| 27 | +rownames(samples_subset) <- samples_subset$Samples_id |
| 28 | +
|
| 29 | +library(writexl) |
| 30 | +write_xlsx(samples_subset, "./data/samples_subset.xlsx") |
| 31 | +
|
| 32 | +# Subset countData to match the samples |
| 33 | +counts_subset <- counts[, rownames(samples_subset)] |
| 34 | +``` |
| 35 | + |
| 36 | +###### (II) DESeq2 Analysis |
| 37 | +# Assign factors for DESeq2 design |
| 38 | +```{r} |
| 39 | +library(DESeq2) # DESeq2_1.36.0; R version 4.2.3 |
| 40 | +samples_subset$Affected = factor(samples_subset$Affected) |
| 41 | +samples_subset$Family = factor(samples_subset$Family) |
| 42 | +samples_subset$Replicate = factor(samples_subset$Replicate) |
| 43 | +samples_subset$Batch = factor(samples_subset$Batch) |
| 44 | +samples_subset$Gender = factor(samples_subset$Gender) |
| 45 | +samples_subset$Relation = factor(samples_subset$Relation) |
| 46 | +samples_subset$Patient_id = factor(samples_subset$Patient_id) |
| 47 | +
|
| 48 | +# Create the new DESeqDataSet object |
| 49 | +dds_subset <- DESeqDataSetFromMatrix(countData = round(counts_subset), colData = samples_subset, design = ~ Batch + Replicate + Family + Affected) |
| 50 | +
|
| 51 | +dds_subset |
| 52 | +``` |
| 53 | +# Pre-filtering: Keep only rows that have atleast 10 reads total |
| 54 | +```{r} |
| 55 | +keep_subset = rowSums(counts(dds_subset)) >= 10 |
| 56 | +dds_subset = dds_subset[keep_subset,] |
| 57 | +dim(dds_subset) |
| 58 | +``` |
| 59 | + |
| 60 | +# Differential expression analysis via DESeq() [do normalization as well] |
| 61 | +```{r} |
| 62 | +dds_subset = DESeq(dds_subset) |
| 63 | +head(counts(dds_subset, normalized=TRUE)) |
| 64 | +resultsNames(dds_subset) |
| 65 | +``` |
| 66 | +```{r} |
| 67 | +saveRDS(dds_subset, file = "./dds_subset.rds") |
| 68 | +``` |
| 69 | + |
| 70 | +###### (III) Transformation and check PCA for batch effect |
| 71 | +# Run vsd |
| 72 | +```{r} |
| 73 | +vsd_subset = vst(dds_subset, blind=FALSE) |
| 74 | +head(assay(vsd_subset), 3) |
| 75 | +saveRDS(vsd_subset, file = "./vsd_subset.rds") |
| 76 | +``` |
| 77 | + |
| 78 | +###### (IV) Batch correction and check with PCA |
| 79 | +# limma::removeBatchEffect() |
| 80 | +```{r} |
| 81 | +counts_vst_subset = assay(vsd_subset) |
| 82 | +mm = model.matrix(~ Replicate + Family + Affected, colData(vsd_subset)) |
| 83 | +counts_vst_subset_limma = limma::removeBatchEffect(counts_vst_subset, batch=vsd_subset$Batch, design=mm) |
| 84 | +write.csv(counts_vst_subset_limma, file="./deseq2/results/batch-corr_limma/2probands_vs_unaffected/counts_vst_subset_limma.csv") |
| 85 | +``` |
| 86 | + |
| 87 | +```{r} |
| 88 | +vsd_subset_limma = vsd_subset |
| 89 | +assay(vsd_subset_limma) = counts_vst_subset_limma |
| 90 | +saveRDS(vsd_subset_limma, file = "./vsd_subset_limma.rds") |
| 91 | +``` |
| 92 | + |
| 93 | +###### (V) Contrast: Affected_Yes_vs_No |
| 94 | +```{r} |
| 95 | +resultsNames(dds_subset) |
| 96 | +``` |
| 97 | + |
| 98 | +# (1) Retreive DEGs: res_aff_vs_unaff, genename and res_aff_vs_unaff_05 |
| 99 | +```{r} |
| 100 | +res_aff_vs_unaff_subset = results(dds_subset, contrast=c("Affected", "Yes", "No")) |
| 101 | +res_aff_vs_unaff_subset = res_aff_vs_unaff_subset[order(res_aff_vs_unaff_subset$padj),] |
| 102 | +head(res_aff_vs_unaff_subset) |
| 103 | +#summary(res_aff_vs_unaff_subset) |
| 104 | +res_aff_vs_unaff_subset_df = as.data.frame(res_aff_vs_unaff_subset) |
| 105 | +
|
| 106 | +# Adding gene name |
| 107 | +res_aff_vs_unaff_subset_df_genename = res_aff_vs_unaff_subset_df |
| 108 | +res_aff_vs_unaff_subset_df_genename$Ensembl_ID = row.names(res_aff_vs_unaff_subset_df) |
| 109 | +res_aff_vs_unaff_subset_df_genename = merge(x=res_aff_vs_unaff_subset_df_genename, y=gene_names, by.x ="Ensembl_ID", by.y="Ensembl_ID", all.x=T) |
| 110 | +res_aff_vs_unaff_subset_df_genename = res_aff_vs_unaff_subset_df_genename[,c(dim(res_aff_vs_unaff_subset_df_genename)[2],1:dim(res_aff_vs_unaff_subset_df_genename)[2]-1)] |
| 111 | +res_aff_vs_unaff_subset_df_genename = res_aff_vs_unaff_subset_df_genename[order(res_aff_vs_unaff_subset_df_genename[,"padj"]),] |
| 112 | +rownames(res_aff_vs_unaff_subset_df_genename) = res_aff_vs_unaff_subset_df_genename$Ensembl_ID |
| 113 | +
|
| 114 | +# Subset: padj<0.05 |
| 115 | +res_aff_vs_unaff_subset_df_genename_05 = subset(res_aff_vs_unaff_subset_df_genename, padj < 0.05) # 150 degs |
| 116 | +res_aff_vs_unaff_subset_df_genename_05 = res_aff_vs_unaff_subset_df_genename_05[order(res_aff_vs_unaff_subset_df_genename_05$padj),] |
| 117 | +head(res_aff_vs_unaff_subset_df_genename_05) |
| 118 | +#summary(res_aff_vs_unaff_subset_df_genename_05) |
| 119 | +``` |
| 120 | +```{r} |
| 121 | +write.csv(res_aff_vs_unaff_subset_df_genename, file = "./deseq2/results/batch-corr_limma/2probands_vs_unaffected/res_aff_vs_unaff_subset_genename.csv" ) |
| 122 | +write.csv(res_aff_vs_unaff_subset_df_genename_05, file = "./deseq2/results/batch-corr_limma/2probands_vs_unaffected/res_aff_vs_unaff_subset_genename_05.csv") |
| 123 | +``` |
| 124 | + |
| 125 | +# (2) DEGs visualization: |
| 126 | +## (i) Heatmaps |
| 127 | +```{r} |
| 128 | +# All 150 sig. DEGs with Family ids |
| 129 | +library(pheatmap) |
| 130 | +topgenes_aff_vs_unaff_05 = rownames(res_aff_vs_unaff_subset_df_genename_05) |
| 131 | +topgenes_aff_vs_unaff_05 |
| 132 | +
|
| 133 | +# Subsetting assay data |
| 134 | +topgenes_aff_vs_unaff_05 = assay(vsd_subset_limma)[topgenes_aff_vs_unaff_05,] |
| 135 | +
|
| 136 | +# Centering the data |
| 137 | +topgenes_aff_vs_unaff_05 = topgenes_aff_vs_unaff_05 - rowMeans(topgenes_aff_vs_unaff_05) |
| 138 | +
|
| 139 | +df = as.data.frame(colData(vsd_subset_limma)[,c("Batch", "Replicate", "Family", "Affected")]) |
| 140 | +
|
| 141 | +ann_col = list( |
| 142 | + Affected = c("No"= "grey90", "Yes"= "grey40"), |
| 143 | + Replicate = c("1"="saddlebrown", "2"="rosybrown", "3"="peachpuff"), |
| 144 | + Batch = c("1" ="orange", "2"="blue"), |
| 145 | + Family = c( "1" = "darkred", "2" = "salmon", "3" = "navy", "4" = "purple", "5" = "magenta", "6" = "seagreen", "7" = "chocolate", "8" = "royalblue" ) |
| 146 | + ) |
| 147 | +
|
| 148 | +# Column label |
| 149 | +matched_indices <- match(row.names(df), samples_subset$Samples_id) |
| 150 | +labels_col_df <- samples_subset[matched_indices, c("Family", "Relation", "Replicate")] |
| 151 | + # Concatenate columns to form labels |
| 152 | +labels_col <- apply(labels_col_df, 1, function(x) paste(x, collapse = " - ")) |
| 153 | +
|
| 154 | +# Generate heatmap |
| 155 | +png("./deseq2/results/batch-corr_limma/2probands_vs_unaffected/heatmap_res_aff_vs_unaff_subset_150degs_famid.png", width = 5600, height = 3200, res = 300) |
| 156 | +ComplexHeatmap::pheatmap(topgenes_aff_vs_unaff_05, annotation_col=df, labels_row = res_aff_vs_unaff_subset_df_genename_05$gene_name, |
| 157 | + annotation_colors = ann_col, |
| 158 | + labels_col = labels_col, |
| 159 | + scale = "row", clustering_method = "complete", clustering_distance_rows = "euclidean", |
| 160 | + fontsize=10, fontsize_col=10, legend=TRUE, legend_breaks = c(-2,-1,0,1,2), fontsize_row = 5, angle_col = "45" |
| 161 | + ) |
| 162 | +dev.off() |
| 163 | +``` |
| 164 | + |
| 165 | +# (2) DEGs visualization: |
| 166 | +## (ii) EnhancedVolcano - Sig. DEGs |
| 167 | +```{r} |
| 168 | +#devtools::install_github('kevinblighe/EnhancedVolcano') # v1.13.2 |
| 169 | +library(EnhancedVolcano) |
| 170 | +
|
| 171 | +# Range of log2FC in res_aff_vs_unaff_subset_df_genename_05: -4.5 to 24.9 |
| 172 | +# Rnge of padj: 8.426784e-06 to 4.948156e-02 |
| 173 | +
|
| 174 | +png("./deseq2/results/batch-corr_limma/2probands_vs_unaffected/EnhancedVolcano_res_aff_vs_unaff_subset_05_degs_final.png", width = 5600, height = 3200, res=300) |
| 175 | +keyvals = ifelse( |
| 176 | + res_aff_vs_unaff_subset_df_genename$log2FoldChange < 0, 'mediumblue', |
| 177 | + ifelse(res_aff_vs_unaff_subset_df_genename$log2FoldChange > 0, 'darkgoldenrod2', 'black')) |
| 178 | +keyvals[is.na(keyvals)] <- 'black' |
| 179 | +names(keyvals)[keyvals == 'darkgoldenrod2'] <- 'Up' |
| 180 | +names(keyvals)[keyvals == 'mediumblue'] <- 'Down' |
| 181 | +
|
| 182 | +e = EnhancedVolcano(res_aff_vs_unaff_subset_df_genename, |
| 183 | + lab = res_aff_vs_unaff_subset_df_genename$gene_name, |
| 184 | + x = 'log2FoldChange', y = 'padj', title = '2 Probands vs. Unaffected', subtitle = '150 DEGs (padjCutoff=0.05, FCcutoff=2)', |
| 185 | + selectLab = (res_aff_vs_unaff_subset_df_genename$gene_name[res_aff_vs_unaff_subset_df_genename$gene_name %in% |
| 186 | + res_aff_vs_unaff_subset_df_genename_05$gene_name])[which(names(keyvals) %in% c('Up', 'Down'))], |
| 187 | + pCutoff = 0.05, FCcutoff = 2, |
| 188 | + cutoffLineWidth = 1, |
| 189 | + pointSize = 3.0, |
| 190 | + legendLabSize = 12, legendIconSize = 8.0, legendLabels=c('Down','Up'), |
| 191 | + drawConnectors = TRUE, |
| 192 | + colCustom = keyvals |
| 193 | + ) |
| 194 | +e + ggplot2::coord_cartesian(xlim = c(-10, 25), ylim = c(0.0, 6.0)) + ggplot2::scale_x_continuous(breaks=seq(-10,25, 1)) |
| 195 | +dev.off() |
| 196 | +``` |
| 197 | + |
| 198 | +# (3) DEGs individual plots: |
| 199 | +## (i) plotCounts DEGs |
| 200 | +```{r} |
| 201 | +# Create the deseq2/results/batch-corr_limma/plotCounts folder |
| 202 | +if (!dir.exists("deseq2/results/batch-corr_limma/2probands_vs_unaffected/plotCounts_150degs_pid")) { |
| 203 | + dir.create("deseq2/results/batch-corr_limma/2probands_vs_unaffected/plotCounts_150degs_pid") |
| 204 | +} |
| 205 | +``` |
| 206 | + |
| 207 | +```{r} |
| 208 | +row.names(samples_subset) = samples_subset$Samples_id |
| 209 | +``` |
| 210 | + |
| 211 | +```{r} |
| 212 | +# Define colors for 8 families |
| 213 | +library(RColorBrewer) |
| 214 | +additional_colors <- brewer.pal(8, "Set1") |
| 215 | +
|
| 216 | +# Remove two colors to replace with salmon and royalblue for specific families # for family 2 and 8 |
| 217 | +additional_colors <- setdiff(additional_colors, c("salmon", "royalblue")) |
| 218 | +# Function to map specific family values to colors |
| 219 | +assign_colors <- function(family_levels) |
| 220 | + { |
| 221 | + # Start by making a named list of colors, initially empty |
| 222 | + color_map <- setNames(vector("character", length(family_levels)), family_levels) |
| 223 | + # Set specific colors for known values |
| 224 | + color_map["2"] <- "salmon" |
| 225 | + color_map["8"] <- "royalblue" |
| 226 | + # Assign remaining colors from the palette to other family values |
| 227 | + remaining_values <- setdiff(family_levels, c("2", "8")) |
| 228 | + color_map[remaining_values] <- head(additional_colors, length(remaining_values)) |
| 229 | + return(color_map) |
| 230 | +} |
| 231 | +``` |
| 232 | + |
| 233 | +```{r} |
| 234 | +# Extract genes table from res |
| 235 | +goi_table = res_aff_vs_unaff_subset_df_genename_05 |
| 236 | +
|
| 237 | +# Make plotCounts boxplot |
| 238 | +genes_oi_plot_Ensembl = goi_table$Ensembl_ID |
| 239 | +genes_oi_plot_gene_name = goi_table$gene_name |
| 240 | +genes_oi_plot_padj = goi_table$padj |
| 241 | +genes_oi_plot_log2FC = goi_table$log2FoldChange |
| 242 | +Family = factor(samples_subset$Family) |
| 243 | +
|
| 244 | +library(DESeq2) |
| 245 | +library(ggplot2) |
| 246 | +library(ggrepel) |
| 247 | +
|
| 248 | +for (i in seq_along(genes_oi_plot_Ensembl)) { |
| 249 | + boxplot_counts = plotCounts(dds_subset, gene=genes_oi_plot_Ensembl[i], intgroup=c("Affected"), returnData=TRUE, normalized = T) |
| 250 | + boxplot_counts$variable = row.names(boxplot_counts) |
| 251 | +
|
| 252 | + # Add 'Patient_id' and 'Replicate' as new columns and update row names |
| 253 | + #boxplot_counts$Patient_id = samples_subset[rownames(boxplot_counts), "Patient_id", drop = TRUE] |
| 254 | + boxplot_counts$Family = samples_subset[rownames(boxplot_counts), "Family", drop = TRUE] |
| 255 | + boxplot_counts$Relation = samples_subset[rownames(boxplot_counts), "Relation", drop = TRUE] |
| 256 | + boxplot_counts$Replicate = samples_subset[rownames(boxplot_counts), "Replicate", drop = TRUE] |
| 257 | + rownames(boxplot_counts) = with(samples_subset[rownames(boxplot_counts), ], paste(Family, Relation, Replicate, sep = "_")) |
| 258 | +
|
| 259 | + boxplot_counts$Family <- as.factor(boxplot_counts$Family) |
| 260 | +
|
| 261 | + # Retrieve unique family values and assign colors |
| 262 | + family_levels <- levels(boxplot_counts$Family) |
| 263 | + family_colors <- assign_colors(family_levels) |
| 264 | +
|
| 265 | + plot = ggplot(data=boxplot_counts, aes(x=Affected, y=count, fill=Affected)) + |
| 266 | + geom_boxplot(position=position_dodge()) + |
| 267 | + geom_jitter(position=position_dodge(.8)) + |
| 268 | + labs(title = paste("Gene",genes_oi_plot_gene_name[i],sep = "_",genes_oi_plot_Ensembl[i]), |
| 269 | + subtitle = paste("padj=",genes_oi_plot_padj[i],"; log2FC=",genes_oi_plot_log2FC[i])) + |
| 270 | + xlab("") + ylab("Normalized gene counts") + |
| 271 | + theme_bw() + theme(text = element_text(size=6), axis.text.x = element_text(angle=45, vjust=1,hjust=1)) + |
| 272 | + scale_fill_manual(values = c("No"= "grey90", "Yes"= "grey40")) + |
| 273 | + scale_color_manual(values = family_colors) + |
| 274 | + # geom_point(color = "grey70", size=0.2) + |
| 275 | + geom_point(data=boxplot_counts, aes(x=Affected, y=count, color=Family), size = 2) + |
| 276 | + geom_text_repel(aes(label = rownames(boxplot_counts), color=Family), min.segment.length = 0, max.overlaps = Inf, box.padding = 0.5) |
| 277 | + ggsave(filename=paste("DEG_",genes_oi_plot_gene_name[i],"_",genes_oi_plot_Ensembl[i],".png"), |
| 278 | + width=10, height=5, plot=plot, path = "./deseq2/results/batch-corr_limma/2probands_vs_unaffected/plotCounts_150degs_pid") |
| 279 | +} |
| 280 | +``` |
0 commit comments