Skip to content

Commit 51413d3

Browse files
committed
Impose a minimum repeat instance length
1 parent 20a8ab2 commit 51413d3

File tree

2 files changed

+325
-53
lines changed

2 files changed

+325
-53
lines changed

docs/overview.org

Lines changed: 195 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -42,22 +42,23 @@
4242
and the output location for our final figures and tables.
4343

4444
#+BEGIN_SRC groovy :tangle ../main.nf :shebang #!/usr/bin/env nextflow
45-
VERSION = "1.0.0"
46-
47-
params.reference = 'data/example/genome.fasta'
48-
params.trnaprot = 'http://www.hrt.msu.edu/uploads/535/78637/Tpases020812.gz'
49-
params.trnanuc = 'http://gtrnadb2009.ucsc.edu/download/tRNAs/eukaryotic-tRNAs.fa.gz'
50-
params.outdir = 'output'
51-
52-
trnanuc = file(params.trnanuc)
53-
trnaprot = file(params.trnaprot)
54-
reference = file(params.reference)
55-
56-
log.info ""
57-
log.info "R E P E A T M A S K I N G ~ version " + VERSION
58-
log.info "reference input : ${params.reference}"
59-
log.info "output directory : ${params.outdir}"
60-
log.info ""
45+
VERSION = "1.0.0"
46+
47+
params.reference = 'data/example/genome.fasta'
48+
params.trnaprot = 'http://www.hrt.msu.edu/uploads/535/78637/Tpases020812.gz'
49+
params.trnanuc = 'http://gtrnadb2009.ucsc.edu/download/tRNAs/eukaryotic-tRNAs.fa.gz'
50+
params.outdir = 'output'
51+
params.minRepLength = 250
52+
53+
trnanuc = file(params.trnanuc)
54+
trnaprot = file(params.trnaprot)
55+
reference = file(params.reference)
56+
57+
log.info ""
58+
log.info "R E P E A T M A S K I N G ~ version " + VERSION
59+
log.info "reference input : ${params.reference}"
60+
log.info "output directory : ${params.outdir}"
61+
log.info ""
6162
#+END_SRC
6263

6364
** Collection of relatively recent LTR retrotranposons
@@ -539,15 +540,99 @@
539540
#+END_SRC
540541

541542
#+BEGIN_SRC groovy :tangle ../main.nf
542-
identityUnknown = Channel.create()
543-
identityKnown = Channel.create()
544-
545-
rmOutput
546-
.splitFasta(record: [id: true, text: true])
547-
.choice(identityUnknown, identityKnown) { record -> record.id =~ /#Unknown/ ? 0 : 1 }
548-
549-
repeatmaskerUnknowns = identityUnknown.collectFile() { record -> ['unknown.fasta', record.text] }
550-
repeatmaskerKnowns = identityKnown.collectFile() { record -> ['known.fasta', record.text] }
543+
identityUnknown = Channel.create()
544+
identityKnown = Channel.create()
545+
546+
rmOutput
547+
.splitFasta(record: [id: true, text: true])
548+
.choice(identityUnknown, identityKnown) { record -> record.id =~ /#Unknown/ ? 0 : 1 }
549+
550+
identityUnknown
551+
.collectFile() { record -> ['unknown.fasta', record.text] }
552+
.set { repeatmaskerUnknowns }
553+
554+
identityKnown
555+
.collectFile() { record -> ['known.fasta', record.text] }
556+
.tap { repeatmaskerKnowns1 }
557+
.set{ repeatmaskerKnowns }
558+
#+END_SRC
559+
560+
We'd like to get a better idea about what these 'unknown elements'
561+
are. It's possible that they are known transposons that have been
562+
RIPped beyond recognition. To retrieve these, we can pull out an
563+
ailgnment of these repeat instances (with RepeatMasker), run [[https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-11-655][DeRIP]]
564+
over each alignment and then push them back into the identification
565+
pipeline (RepeatModeler classify script).
566+
567+
#+BEGIN_SRC groovy :tangle ../main.nf
568+
process searchForUnidentifiedElements {
569+
container 'robsyme/nf-repeatmasking'
570+
571+
input:
572+
file 'genome.fasta' from reference
573+
file 'unknown_elements.fasta' from repeatmaskerKnowns1
574+
575+
output:
576+
set 'genome.fasta.align', 'unknown_elements.fasta' into unknownAlignments
577+
578+
"""
579+
RepeatMasker \
580+
-lib unknown_elements.fasta \
581+
-alignments \
582+
-nolow \
583+
-no_is \
584+
-dir . \
585+
-inv \
586+
genome.fasta
587+
"""
588+
}
589+
590+
process derip {
591+
input:
592+
set 'genome.fasta.align', 'unknown_elements.fasta' from unknownAlignments
593+
594+
output:
595+
file 'deripped.unknowns.fasta' into derippedUnknowns
596+
597+
"""
598+
rmalignment_to_fasta.rb genome.fasta.align unknown_elements.fasta
599+
for file in alignment*; do
600+
derip.rb \$file
601+
done > deripped.unknowns.fasta
602+
"""
603+
}
604+
605+
process classifyDeripped {
606+
container 'repeats'
607+
608+
input:
609+
file 'transposases.fasta.gz' from trnaprot
610+
file 'repeatmodeler_unknowns.deripped.fasta' from derippedUnknowns
611+
612+
output:
613+
file 'identified_elements.txt' into identifiedDerippedTransposons
614+
file 'unknown_elements.txt' into unidentifiedDerippedTransposons
615+
616+
"""
617+
zcat transposases.fasta.gz > transposases.fasta
618+
makeblastdb \
619+
-in transposases.fasta \
620+
-dbtype prot
621+
blastx \
622+
-query repeatmodeler_unknowns.deripped.fasta \
623+
-db transposases.fasta \
624+
-evalue 1e-10 \
625+
-num_descriptions 10 \
626+
-num_threads ${task.cpus} \
627+
-out modelerunknown_blast_results.txt
628+
transposon_blast_parse.pl \
629+
--blastx modelerunknown_blast_results.txt \
630+
--modelerunknown repeatmodeler_unknowns.deripped.fasta
631+
"""
632+
}
633+
634+
identifiedDerippedTransposons.subscribe{ println("Identified, deripped: ${it}") }
635+
551636
#+END_SRC
552637

553638
#+BEGIN_SRC groovy :tangle ../main.nf
@@ -561,6 +646,7 @@
561646

562647
output:
563648
file 'identified_elements.txt' into identifiedTransposons
649+
file 'unknown_elements.txt' into unidentifiedTransposons
564650

565651
"""
566652
zcat transposases.fasta.gz > transposases.fasta
@@ -579,6 +665,48 @@
579665
--modelerunknown repeatmodeler_unknowns.fasta
580666
"""
581667
}
668+
669+
#+END_SRC
670+
671+
** Noncoding RNA
672+
673+
We'll borrow the excellent ncRNA prediction steps from the
674+
excellent [[https://github.com/sanger-pathogens/companion][Companion annotation pipeline]].
675+
676+
#+BEGIN_SRC groovy :tangle ../main.nf
677+
process predict_ncRNA {
678+
container 'sangerpathogens/companion:latest'
679+
680+
input:
681+
file 'genome.fasta' from reference
682+
683+
output:
684+
file 'cm_out' into cmtblouts
685+
686+
"""
687+
cp /opt/data/cm/rnas.cm models.cm
688+
cmpress -F models.cm
689+
cmsearch --cpu 1 --tblout cm_out --cut_ga models.cm genome.fasta
690+
"""
691+
}
692+
693+
process merge_ncrnas {
694+
container 'sangerpathogens/companion:latest'
695+
cache 'deep'
696+
697+
input:
698+
file 'cmtblout' from cmtblouts
699+
700+
output:
701+
file 'ncrna.gff3' into ncrnafile
702+
703+
"""
704+
infernal_to_gff3.lua < ${cmtblout} > 1
705+
gt gff3 -sort -tidy -retainids 1 > ncrna.gff3
706+
"""
707+
}
708+
709+
ncrnafile.println()
582710
#+END_SRC
583711

584712
** Final Masking
@@ -592,30 +720,47 @@
592720
#+END_SRC
593721

594722
#+BEGIN_SRC groovy :tangle ../main.nf
595-
process repeatMaskerKnowns {
596-
container 'robsyme/nf-repeatmasking'
597-
publishDir "${params.outdir}/repeatMaskerKnowns", mode: 'copy'
598-
599-
input:
600-
file 'reference.fasta' from reference
601-
set 'knownTransposons.lib', 'allLTRs.lib' from knownRepeats
602-
603-
output:
604-
set 'reference.fasta.out', 'reference.fasta.masked' into repeatMaskerKnownsMasked
605-
set 'reference.fasta.out.gff', 'knownRepeats.library.fasta'
606-
607-
"""
608-
cat *.lib > knownRepeats.library.fasta
609-
RepeatMasker \
610-
-lib knownRepeats.library.fasta \
611-
-nolow \
612-
-no_is \
613-
-dir . \
614-
-gff \
615-
-s \
616-
reference.fasta
617-
"""
618-
}
723+
process repeatMaskerKnowns {
724+
container 'robsyme/nf-repeatmasking'
725+
publishDir "${params.outdir}/repeatMaskerKnowns", mode: 'copy'
726+
727+
input:
728+
file 'reference.fasta' from reference
729+
set 'knownTransposons.lib', 'allLTRs.lib' from knownRepeats
730+
731+
output:
732+
set 'reference.fasta.out', 'reference.fasta.masked' into repeatMaskerKnownsMasked
733+
set 'reference.fasta.out.gff', 'knownRepeats.library.fasta'
734+
735+
"""
736+
cat *.lib > knownRepeats.library.fasta
737+
RepeatMasker \
738+
-lib knownRepeats.library.fasta \
739+
-nolow \
740+
-no_is \
741+
-dir . \
742+
-gff \
743+
-s \
744+
reference.fasta
745+
"""
746+
}
747+
748+
process removeShortMatches {
749+
container 'robsyme/nf-repeatmasking'
750+
input:
751+
file 'reference.fa' from reference
752+
set 'rm.out', 'rm.masked' from repeatMaskerKnownsMasked
753+
754+
output:
755+
set 'rm.trimmed.out', 'rm.trimmed.masked' from repeatMaskerKnownsMaskedTrimmed
756+
757+
"""
758+
head -n 3 rm.out > rm.trimmed.out
759+
tail -n +4 rm.out | awk '\$7 - \$6 > ${params.minRepLength}' >> rm.trimmed.out
760+
tail -n +4 rm.out | awk 'BEGIN{OFS="\\t"} \$7 - \$6 > ${params.minRepLength} {print $5, $6, $7}' >> rm.trimmed.bed
761+
maskFastaFromBed -fi reference.fa -bed rm.trimmed.bed -fo rm.trimmed.masked -soft
762+
"""
763+
}
619764
#+END_SRC
620765

621766
#+BEGIN_SRC groovy :tangle ../main.nf
@@ -624,7 +769,7 @@
624769

625770
input:
626771
file 'reference.fa' from reference
627-
set 'rm.out', 'rm.masked' from repeatMaskerKnownsMasked
772+
set 'rm.out', 'rm.masked' from repeatMaskerKnownsMaskedTrimmed
628773

629774
output:
630775
file 'summary.tsv' into repeatmaskerSummaryTable

0 commit comments

Comments
 (0)