-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathvk_dada2_tutorial.R
202 lines (141 loc) · 7.18 KB
/
vk_dada2_tutorial.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# This exercise is taken from the excellent Dada2 tutorial
# You can read that tutorial in depth here: https://benjjneb.github.io/dada2/tutorial.html
# This tutorial was written by: Benjamin Callahan
# It is distributed under a CC BY 4.0 license
# For convienience, I've copied all the commands you need to download the packages
# here and have copied the relevant code chunks.
# To get this tutorial to run on the lab computer or your own computer
# you will need a copy of dada2 and phyloseq (as you know this is sometimes the most time
# consuming part of the whole process).
# You will also need the example data to complete the tutorial.
# The links to download that data are included here a few lines down.
# We will run through this example in class today.
# There is no lab report associated with this tutorial.
# I don't expect you to perfectly understand all the commands that are run in this tutorial
# However, you should be able to understand WHY the steps are being completed
# and generally what the steps are doing.
# If you have questions, please consult the original tutorial for in-depth explainations
# as you review this script after class.
install.packages("devtools")
library("devtools")
devtools::install_github("benjjneb/dada2", ref="v1.14") # change the ref argument to get other versions
# Follow the promts to update packages if needed.
# The install will take a few minutes.
# load the pacakge:
library(dada2); packageVersion("dada2")
# download the example files: http://www.mothur.org/w/images/d/d6/MiSeqSOPData.zip
# store a variable named path
path <- "~/Desktop/MiSeq_SOP/"
# set the path
setwd(path)
# take a look at the files in the folder:
list.files(path)
# Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq
fnFs <- sort(list.files(path, pattern="_R1_001.fastq", full.names = TRUE))
fnRs <- sort(list.files(path, pattern="_R2_001.fastq", full.names = TRUE))
# Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
# visualize the quality profiles of forward reads
plotQualityProfile(fnFs[1:2])
# These look good.
# We will truncate the forward reads at position 240 (trimming the last 10 nucleotides).
# Now we visualize the quality profile of the reverse reads:
plotQualityProfile(fnRs[1:2])
# Based on these profiles, we will truncate the reverse reads at position 160
# where the quality distribution crashes.
# Place filtered files in filtered/ subdirectory
filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(240,160),
maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
compress=TRUE, multithread=TRUE) # On Windows set multithread=FALSE
head(out)
# Learn error rates for forward and reverse reads
# These commands will take about 3 minutes each to run
errF <- learnErrors(filtFs, multithread=TRUE)
errR <- learnErrors(filtRs, multithread=TRUE)
plotErrors(errF, nominalQ=TRUE)
# Apply the core sample infrence algorithm for F and R
dadaFs <- dada(filtFs, err=errF, multithread=TRUE)
dadaRs <- dada(filtRs, err=errR, multithread=TRUE)
# merge the paired reads
mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE)
# Inspect the merged files
# Inspect the merger data.frame from the first sample
head(mergers[[1]])
# Make a sequence table
seqtab <- makeSequenceTable(mergers)
dim(seqtab)
# Inspect distribution of sequence lengths
table(nchar(getSequences(seqtab)))
# Remove chimeras
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
dim(seqtab.nochim)
sum(seqtab.nochim)/sum(seqtab)
# Chimeras account for about 4% of the merged reads.
# Track reads through the pipeline
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim))
# If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
head(track)
# download the silva_nr_v132_train_set.fa.gz file from this link:
# https://zenodo.org/record/1172783#.XdwdeOdKjUI
# put it in the same directory as the sequence files
# Assign taxonomy
# this will take a pretty long time
taxa <- assignTaxonomy(seqtab.nochim, "silva_nr_v132_train_set.fa", multithread=TRUE)
# inspect the assignments
taxa.print <- taxa # Removing sequence rownames for display only
rownames(taxa.print) <- NULL
head(taxa.print)
# Evaluate accuaracy of DADA2 on the mock community of 20 known strains
unqs.mock <- seqtab.nochim["Mock",]
unqs.mock <- sort(unqs.mock[unqs.mock>0], decreasing=TRUE) # Drop ASVs absent in the Mock
cat("DADA2 inferred", length(unqs.mock), "sample sequences present in the Mock community.\n")
mock.ref <- getSequences(file.path(path, "HMP_MOCK.v35.fasta"))
match.ref <- sum(sapply(names(unqs.mock), function(x) any(grepl(x, mock.ref))))
cat("Of those,", sum(match.ref), "were exact matches to the expected reference sequences.\n")
# One nice thing about doing assingments with DADA2 is that it plays nicely with phyloseq
# Like dada2 this doesn't exisit in CRAN, so instalation is a bit tricky
# This can take a while, it's a big package.
source('http://bioconductor.org/biocLite.R')
biocLite('phyloseq')
library(phyloseq); packageVersion("phyloseq")
library(Biostrings); packageVersion("Biostrings")
library(ggplot2); packageVersion("ggplot2")
theme_set(theme_bw())
# Make a sample data table from file names
samples.out <- rownames(seqtab.nochim)
subject <- sapply(strsplit(samples.out, "D"), `[`, 1)
gender <- substr(subject,1,1)
subject <- substr(subject,2,999)
day <- as.integer(sapply(strsplit(samples.out, "D"), `[`, 2))
samdf <- data.frame(Subject=subject, Gender=gender, Day=day)
samdf$When <- "Early"
samdf$When[samdf$Day>100] <- "Late"
rownames(samdf) <- samples.out
# construct a phyloseq object
ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE),
sample_data(samdf),
tax_table(taxa))
ps <- prune_samples(sample_names(ps) != "Mock", ps) # Remove mock sample
dna <- Biostrings::DNAStringSet(taxa_names(ps))
names(dna) <- taxa_names(ps)
ps <- merge_phyloseq(ps, dna)
taxa_names(ps) <- paste0("ASV", seq(ntaxa(ps)))
ps
# now we can use some of the built in functions in phyloseq to look at our data
plot_richness(ps, x="Day", measures=c("Shannon", "Simpson"), color="When")
# Transform data to proportions as appropriate for Bray-Curtis distances
ps.prop <- transform_sample_counts(ps, function(otu) otu/sum(otu))
ord.nmds.bray <- ordinate(ps.prop, method="NMDS", distance="bray")
plot_ordination(ps.prop, ord.nmds.bray, color="When", title="Bray NMDS")
# make a bar plot
top20 <- names(sort(taxa_sums(ps), decreasing=TRUE))[1:20]
ps.top20 <- transform_sample_counts(ps, function(OTU) OTU/sum(OTU))
ps.top20 <- prune_taxa(top20, ps.top20)
plot_bar(ps.top20, x="Day", fill="Family") + facet_wrap(~When, scales="free_x")