-
Notifications
You must be signed in to change notification settings - Fork 26
/
bsdata_glm.R
121 lines (101 loc) · 5.34 KB
/
bsdata_glm.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"
DESCRIPTION:
Read matrix files for chrom and produce GLM output
USAGE:
-- Processing 'chrM'
R CMD BATCH -chrM bsdata_glm.R chrM/bsdata_glm.Rout
SEE ALSO:
For processing command line args from R BATCH
cd /lustre/sblab/berald01/projects/20120522-oxbsseq-mesc/slx-6278/genome/bsdata
"
# ------------------------- Setting up ----------------------------------------
source('/home/berald01/svn_checkout/bioinformatics-misc/glm.R')
source('/home/berald01/svn_checkout/bioinformatics-misc/BSdata.R')
source('/home/berald01/svn_checkout/bioinformatics-misc/makeTransparent.R')
NPROCS<- 4
args <- commandArgs(trailingOnly = F)
CHROM<- args[length(args)]
CHROM<- sub("^-", "", CHROM, perl= TRUE)
setwd(CHROM)
cat(getwd())
# -----------------------------------------------------------------------------
cpg<- read.bsdata('genome', mat= c('loci', 'cnt_met', 'tot_reads'), gzip= FALSE, nrows= -1)
cpg@design$bs<- c('oxBS', 'oxBS', 'oxBS', 'BS', 'BS', 'BS')
cpg@pct_met<- as.ffdf(100 * (cpg@cnt_met[,] / cpg@tot_reads[,]))
# save.BSdata(cpg, 'cpg')
# ffload('cpg')
# Read count
# ----------
pdf('tot_reads.hist.pdf', width= 15/2.54, height= 12/2.54, pointsize= 9)
par(mfrow= c(2, 3), mar= c(2.5, 3, 2, 0.2), oma= c(3,3,2,0))
for(x in colnames(cpg@tot_reads)){
readQ<- quantile(cpg@tot_reads[, x], 0.95)
y<- ifelse(cpg@tot_reads[, x] > readQ, readQ, cpg@tot_reads[, x] )
hist(y, main= '', col= 'grey80', xlab= '', ylab= '', breaks= 10)
mtext(side= 3, line= -1, text= x, cex= 0.8)
print(x)
}
mtext(side= 1, text= 'Read depth', font= 2, outer= TRUE)
mtext(side= 2, text= 'Frequency', outer= TRUE, font= 2)
mtext(side= 3, text= sprintf('%s Depth of coverage at CpG sites', CHROM), outer= TRUE, font= 2, line= 0)
dev.off()
## system(sprintf('scp tot_reads.hist.pdf $mac_office:$cri_public_projects/20120522-oxbsseq-mesc/Documents/slx-6278/genome/%s', CHROM))
# Methylation
# -----------
## Select CpG with coverage n<x<m
m<- 5
Q<- 0.95
pdf('pct_met.hist.pdf', width= 15/2.54, height= 12/2.54, pointsize= 9)
par(mfrow= c(2, 3), mar= c(2.5, 3, 2, 0.2), oma= c(3,3,2,0))
for(x in colnames(cpg@pct_met)){
y<- cpg@pct_met[, x]
tot<- cpg@tot_reads[, x]
readQ<- quantile(cpg@tot_reads[, x], Q)
y<- y[which(tot >= m & tot <= readQ)]
hist(y, main= '', col= 'grey80', xlab= '', ylab= '', breaks= 25)
mtext(side= 3, line= -1, text= paste(x, '\nn= ', length(y), sep= ''), cex= 0.8, col= 'darkblue')
print(x)
}
mtext(side= 1, text= '% methylated', font= 2, outer= TRUE)
mtext(side= 2, text= 'Frequency', outer= TRUE, font= 2)
mtext(side= 3, text= sprintf('%s CpG sites with depth %sx to quantile(%s)', CHROM, m, Q), outer= TRUE, font= 2, line= 0)
dev.off()
## system(sprintf('scp pct_met.hist.pdf $mac_office:$cri_public_projects/20120522-oxbsseq-mesc/Documents/slx-6278/genome/%s', CHROM))
# Filter positions with not-too-high coverage
# -------------------------------------------
filter<- which(rowSums(cpg@tot_reads[,] >= 5 & cpg@tot_reads[,] <= 1000) >= 4) ## Positions with coverage x in at least y libs
cpg.2<- BSdataApply(cpg, FUN= function(x) x[filter,])
pdf('hist_depth_comb_libs.pdf', width= 18/2.54, height= 10/2.54, pointsize= 10)
par(mfrow= c(1,2))
hist(rowSums(cpg.2@tot_reads[,]),
main= sprintf('Depth of coverage summed across %s libraries\nn= %s', ncol(cpg.2@tot_reads), nrow(cpg.2@tot_reads)),
xlab= 'Cumulated depth of coverage', ylab= 'No. CpG', breaks= 20)
hist(rowMeans(cpg.2@pct_met[,]),
main= sprintf('%s Methylation averaged across %s libraries\nn= %s', CHROM, ncol(cpg.2@tot_reads), nrow(cpg.2@tot_reads)),
xlab= '% methylation', ylab= 'No. CpG', breaks= 20)
dev.off()
## system(sprintf('scp hist_depth_comb_libs.pdf $mac_office:$cri_public_projects/20120522-oxbsseq-mesc/Documents/slx-6278/genome/%s', CHROM))
# GLM
# ---
# bsobj<- BSdataApply(cpg.2, FUN= function(x) x[1:10000,])
## UNCOMMENT TO REPEAT GLM
glmbs<- glmBS.2(bsobj= cpg.2, bs= cpg.2@design$bs, contrast= c('BS', 'oxBS'), family= 'binomial', nthreads= NPROCS)
row.names(glmbs)<- row.names(cpg.2@cnt_met)
write.table(glmbs, file= 'glmbs.txt', row.names= FALSE, col.names= TRUE, sep= '\t', quote= FALSE)
## UNCOMMENT TO GET PREVIOUS GLM
glmbs<- read.table('glmbs.txt', header= TRUE, sep= '\t', stringsAsFactors= FALSE)
## MEMO: Estimate is "oxBS - BS" (reverse alphanumeric order)
pneg<- glmbs$pvalue[which(glmbs$estimate > 0)] ## Pvalues of negative estimates.
ppos<- glmbs$pvalue[which(glmbs$estimate < 0)]
pdf('hist_pvals_glmbn.paired.pdf', width= 18/2.54, height= 10/2.54, pointsize= 8)
par(las= 0, mfrow= c(1, 2))
h1<- hist(pneg, border= 'transparent', add= FALSE, col= 'grey70', freq= FALSE, breaks= 20, xlab= 'P-value', main= "P-values from GLM binom")
h2<- hist(ppos, border= 'red', freq= FALSE, add= TRUE, breaks= 20)
points(pch= 17, col= 'blue', x= 0.05, y= 0)
h1<- hist(pneg, border= 'transparent', add= FALSE, col= 'grey70', freq= TRUE, breaks= 20, xlab= 'P-value', main= sprintf("%s P-values from GLM binom\nn= %s", CHROM, nrow(glmbs)))
h2<- hist(ppos, border= 'red', freq= TRUE, add= TRUE, breaks= 20)
points(pch= 17, col= 'blue', x= 0.05, y= 0)
legend('topleft', legend=c('Pval +ve %', 'Pval -ve %'), col= c('red', 'grey70'), pch= 15, bg= 'white')
dev.off()
## system(sprintf('scp hist_pvals_glmbn.paired.pdf $mac_office:$cri_public_projects/20120522-oxbsseq-mesc/Documents/slx-6278/genome/%s', CHROM))
quit(save= 'no')