diff --git a/candice-key.csv b/candice-key.csv
index 1f18443..7abb373 100644
--- a/candice-key.csv
+++ b/candice-key.csv
@@ -1,7 +1,7 @@
-"sample","type"
-"DMSO1","control"
-"DMSO2","control"
-"DMSO3","control"
-"TCDD1","treatment"
-"TCDD2","treatment"
-"TCDD3","treatment"
+"sample","type","treatment"
+"DMSO1","control","DMSO"
+"DMSO2","control","DMSO"
+"DMSO3","control","DMSO"
+"TCDD1","treatment","TCDD"
+"TCDD2","treatment","TCDD"
+"TCDD3","treatment","TCDD"
diff --git a/run_edger.R b/run_edger.R
index c6fe528..76fe3b6 100644
--- a/run_edger.R
+++ b/run_edger.R
@@ -1,140 +1,142 @@
-library(edgeR)
-
-# From: http://stackoverflow.com/questions/1815606/rscript-determine-path-of-the-executing-script
-# So we can copy this script to where the analysis output is saved
-thisFile <- function() {
-  cmdArgs <- commandArgs(trailingOnly = FALSE)
-  needle <- "--file="
-  match <- grep(needle, cmdArgs)
-  if (length(match) > 0) {
-    # Rscript
-    return(normalizePath(sub(needle, "", cmdArgs[match])))
-  } else {
-    # 'source'd via R console
-    return(normalizePath(sys.frames()[[1]]$ofile))
-  }
-}
-
-#args = c("lists/organoid","keys/organoid.csv")
-args = c("lists/candice_list","keys/candice-key.csv")
-
-# Get counts file from analysis/fname/fname.T.csv
-bname = basename(args[1])
-fname = paste(bname,"-count.T.csv",sep='')
-dat = read.csv(file.path("analysis",bname,fname), header = TRUE, row.names=1)
-
-# Filter low count reads
-keep = rowSums(cpm(dat) > 3) >= 3
-counts = dat[keep,]
-
-## Read in key file
-key = read.csv(args[2], header=TRUE, row.names=1)
-
-#############################################
-## Create model comparison matrix with sample key
-#############################################
-# Lampe Biopsy main
-sel = grepl("MT-.*", rownames(counts)) + grepl("ERCC-.*", rownames(counts)) + grepl("mt-.*", rownames(counts))
-# We have to filter out 6123 because its not full rank
-# > summary(factor(sapply(names(counts), function(x) substr(x,2,5))))
-# 6102 6103 6105 6106 6108 6110 6121 6122 6123 6127
-#    8    8    8    8    8    8    8    8    6    8
-counts = counts[!sel,]
-# counts = counts[!sel,!grepl("6123", names(counts))]
-# key = key[!grepl("6123", rownames(key)),]
-ename = "edger-pair-treatment"
-factors = key[order(rownames(key)), c(sample,type)]
-factors$type = factor(factors$type)
-# factors$treatment = relevel(factors$treatment, "placebo")
-design = model.matrix(~sample+type, data=factors)
-groups = factors$type
-#groups = factor(paste(key$tissue,key$location,sep='.'))
-#############################################
-system(paste("mkdir -p ",file.path("analysis",bname,ename)))
-file.copy(thisFile(), file.path("analysis", bname, ename, "edger_script.R"))
-
-
-counts = counts[,order(names(counts))]
-
-########################
-# run Pairwise analysis ...
-########################
-# y = DGEList(counts=counts, group=factors)
-# y = calcNormFactors(y)
-# y = estimateCommonDisp(y)
-# y = estimateTagwiseDisp(y)
-
-########################
-# or run GLM analysis
-########################
-y = DGEList(counts=counts)
-y = calcNormFactors(y)
-y = estimateGLMCommonDisp(y, design)
-y = estimateGLMTrendedDisp(y, design)
-y = estimateGLMTagwiseDisp(y, design)
-fit = glmFit(y, design)
-
-## get normalized counts for each group for outputting to summary spreadsheet
-scaled.counts = data.frame(mapply(`*`, counts, y$samples$lib.size *
-    y$samples$norm.factors/mean(y$samples$lib.size)))
-rownames(scaled.counts) = rownames(counts)
-dfs = split.data.frame(t(scaled.counts), groups)
-dfss = sapply(dfs, colMeans)
-
-#group_names = levels(groups)
-#group_names_means = sapply(group_names, function(x) paste("mean_",x,sep=""), USE.NAMES=FALSE)
-#colnames(dfss) = group_names_means
-
-#### Write results
-run_analysis = function(outfile, contrast=NULL, coef=NULL) {
-  # Pairwise test
-  # lrt = exactTest(y)
-  # GLM Test
-  lrt = glmLRT(fit, contrast=contrast, coef=coef)
-
-  ot1 = topTags(lrt,n=nrow(counts),sort.by="PValue")$table
-  #if (is.null(contrast)) {
-    #sel = which(as.logical(contrast))
-    #ot1 = merge(ot1, dfss[,sel], by=0)
-  #} else {
-    #ot1 = merge(ot1, dfss, by=0)
-  #}
-  ot1 = merge(ot1, dfss, by=0)
-  ot1 = ot1[order(ot1$FDR),] # Sort by ascending FDR
-  write.csv(ot1,outfile,row.names=FALSE)
-
-  if (!is.null(lrt$table$logFC)){
-    detags = rownames(ot1)[ot1$FDR < 0.05]
-    png(paste(outfile,".png",sep=""))
-    plotSmear(lrt, de.tags=detags)
-    abline(h=c(-2,2),col="blue")
-    dev.off()
-  }
-  sink(file=file.path(dirname(outfile), "summary.txt"), append=TRUE)
-  print(outfile)
-  print(summary(decideTestsDGE(lrt, p=0.05, adjust="BH")))
-  sink(NULL)
-
-  #print(cpm(y)[detags,])
-  #print(summary(decideTestsDGE(lrt, p=0.05, adjust="none")))
-}
-
-meta_run = function(coef) {run_analysis(file.path("analysis",bname,ename,paste(colnames(design)[coef],".csv",sep="")),coef=coef)}
-
-meta_run(dim(design)[2])
-meta_run(dim(design)[2]-1)
-meta_run(dim(design)[2]-2)
-
-system(paste("cat",file.path("analysis",bname,ename,"summary.txt")))
-
-# Pairwise test
-# run_analysis(file.path("analysis",bname,paste(ename,".csv",sep="")))
-
-## output MA, MDS, etc.., plots
-png(file.path("analysis",bname,ename,"edger-mds.png"))
-p = plotMDS(y)
-dev.off()
-
-png(file.path("analysis",bname,ename,"edger-bcv.png"))
-p = plotBCV(y)
-dev.off()
+library(edgeR)
+
+# From: http://stackoverflow.com/questions/1815606/rscript-determine-path-of-the-executing-script
+# So we can copy this script to where the analysis output is saved
+thisFile <- function() {
+  cmdArgs <- commandArgs(trailingOnly = FALSE)
+  needle <- "--file="
+  match <- grep(needle, cmdArgs)
+  if (length(match) > 0) {
+    # Rscript
+    return(normalizePath(sub(needle, "", cmdArgs[match])))
+  } else {
+    # 'source'd via R console
+    return(normalizePath(sys.frames()[[1]]$ofile))
+  }
+}
+
+#args = c("lists/organoid","keys/organoid.csv")
+#args = c("ahr-list","ahr-key")
+args = c("lists/candice_list","keys/candice-key.csv")
+
+# Get counts file from analysis/fname/fname.T.csv
+bname = basename(args[1])
+fname = paste(bname,"-count.T.csv",sep='')
+dat = read.csv(file.path("analysis",bname,fname), header = TRUE, row.names=1)
+
+# Filter low count reads
+keep = rowSums(cpm(dat) > 3) >= 3
+counts = dat[keep,]
+
+## Read in key file
+key = read.csv(args[2], header=TRUE, row.names=1)
+
+#############################################
+## Create model comparison matrix with sample key
+#############################################
+# Lampe Biopsy main
+sel = grepl("MT-.*", rownames(counts)) + grepl("ERCC-.*", rownames(counts)) + grepl("mt-.*", rownames(counts))
+# We have to filter out 6123 because its not full rank
+# > summary(factor(sapply(names(counts), function(x) substr(x,2,5))))
+# 6102 6103 6105 6106 6108 6110 6121 6122 6123 6127
+#    8    8    8    8    8    8    8    8    6    8
+counts = counts[!sel,]
+# counts = counts[!sel,!grepl("6123", names(counts))]
+# key = key[!grepl("6123", rownames(key)),]
+ename = "edger-pair-treatment2"
+factors = key[order(rownames(key)), c("type", "treatment")]
+# factors$idnum = factor(factors$idnum)
+# factors$treatment = relevel(factors$treatment, "placebo")
+design = model.matrix(~type+treatment, data=factors)
+groups = factors$treatment
+#groups = factor(paste(key$type,key$treatment,sep='.'))
+#############################################
+system(paste("mkdir -p ",file.path("analysis",bname,ename)))
+file.copy(thisFile(), file.path("analysis", bname, ename, "edger_script.R"))
+
+
+counts = counts[,order(names(counts))]
+
+########################
+# run Pairwise analysis ...
+########################
+y = DGEList(counts=counts, group=groups)
+y = calcNormFactors(y)
+y = estimateCommonDisp(y)
+y = estimateTagwiseDisp(y)
+
+########################
+# or run GLM analysis
+########################
+#y = DGEList(counts=counts)
+#y = calcNormFactors(y)
+#y = estimateGLMCommonDisp(y, design)
+#y = estimateGLMTrendedDisp(y, design)
+#y = estimateGLMTagwiseDisp(y, design)
+#fit = glmFit(y, design)
+
+## get normalized counts for each group for outputting to summary spreadsheet
+scaled.counts = data.frame(mapply(`*`, counts, y$samples$lib.size *
+    y$samples$norm.factors/mean(y$samples$lib.size)))
+rownames(scaled.counts) = rownames(counts)
+dfs = split.data.frame(t(scaled.counts), groups)
+dfss = sapply(dfs, colMeans)
+
+#group_names = levels(groups)
+#group_names_means = sapply(group_names, function(x) paste("mean_",x,sep=""), USE.NAMES=FALSE)
+#colnames(dfss) = group_names_means
+
+#### Write results
+run_analysis = function(outfile, contrast=NULL, coef=NULL) {
+  # Pairwise test
+  lrt = exactTest(y)
+  # GLM Test
+  # lrt = glmLRT(fit, contrast=contrast, coef=coef)
+
+  ot1 = topTags(lrt,n=nrow(counts),sort.by="PValue")$table
+  #if (is.null(contrast)) {
+    #sel = which(as.logical(contrast))
+    #ot1 = merge(ot1, dfss[,sel], by=0)
+  #} else {
+    #ot1 = merge(ot1, dfss, by=0)
+  #}
+  ot1 = merge(ot1, dfss, by=0)
+  ot1 = ot1[order(ot1$FDR),] # Sort by ascending FDR
+  write.csv(ot1,outfile,row.names=FALSE)
+
+  if (!is.null(lrt$table$logFC)){
+    detags = rownames(ot1)[ot1$FDR < 0.05]
+    png(paste(outfile,".png",sep=""))
+    plotSmear(lrt, de.tags=detags)
+    abline(h=c(-2,2),col="blue")
+    dev.off()
+  }
+  sink(file=file.path(dirname(outfile), "summary.txt"), append=TRUE)
+  print(outfile)
+  print(summary(decideTestsDGE(lrt, p=0.05, adjust="BH")))
+  sink(NULL)
+
+  #print(cpm(y)[detags,])
+  #print(summary(decideTestsDGE(lrt, p=0.05, adjust="none")))
+}
+
+meta_run = function(coef) {run_analysis(file.path("analysis",bname,ename,paste(colnames(design)[coef],".csv",sep="")),coef=coef)}
+
+meta_run(dim(design)[2])
+#meta_run(dim(design)[2]-1)
+#meta_run(dim(design)[2]-2)
+
+system(paste("cat",file.path("analysis",bname,ename,"summary.txt")))
+
+# Pairwise test
+# run_analysis(file.path("analysis",bname,paste(ename,".csv",sep=""))))
+
+## output MA, MDS, etc.., plots
+png(file.path("analysis",bname,ename,"edger-mds.png"))
+p = plotMDS(y)
+dev.off()
+
+png(file.path("analysis",bname,ename,"edger-bcv.png"))
+p = plotBCV(y)
+dev.off()
+
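For context, below is a minimal, self-contained sketch of the classic two-group pairwise edgeR workflow that this commit switches run_edger.R to (a DGEList built with a group factor, common and tagwise dispersion estimation, then exactTest instead of the GLM fit). The toy count matrix and the DMSO/TCDD column names are illustrative assumptions for demonstration only; the actual script reads its counts from analysis/<bname>/<bname>-count.T.csv and its group labels from the updated candice-key.csv.

library(edgeR)

# Illustrative stand-in for the real count matrix: 100 genes, 6 samples,
# columns named after the samples listed in candice-key.csv.
counts <- matrix(rnbinom(600, mu = 50, size = 10), nrow = 100,
                 dimnames = list(paste0("gene", 1:100),
                                 c("DMSO1", "DMSO2", "DMSO3",
                                   "TCDD1", "TCDD2", "TCDD3")))
groups <- factor(c("DMSO", "DMSO", "DMSO", "TCDD", "TCDD", "TCDD"))

# Classic (non-GLM) edgeR pairwise analysis, as enabled in run_edger.R:
y <- DGEList(counts = counts, group = groups)
y <- calcNormFactors(y)
y <- estimateCommonDisp(y)
y <- estimateTagwiseDisp(y)

# exactTest() compares the two group levels directly (TCDD vs DMSO here);
# topTags() returns the table that run_analysis() merges with the group means,
# and decideTestsDGE() gives the up/down summary written to summary.txt.
et <- exactTest(y)
head(topTags(et, n = 20)$table)
summary(decideTestsDGE(et, p.value = 0.05, adjust.method = "BH"))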