|
42 | 42 | and the output location for our final figures and tables. |
43 | 43 |
|
44 | 44 | #+BEGIN_SRC groovy :tangle ../main.nf :shebang #!/usr/bin/env nextflow |
45 | | - VERSION = "1.0.0" |
46 | | - |
47 | | - params.reference = 'data/example/genome.fasta' |
48 | | - params.trnaprot = 'http://www.hrt.msu.edu/uploads/535/78637/Tpases020812.gz' |
49 | | - params.trnanuc = 'http://gtrnadb2009.ucsc.edu/download/tRNAs/eukaryotic-tRNAs.fa.gz' |
50 | | - params.outdir = 'output' |
51 | | - |
52 | | - trnanuc = file(params.trnanuc) |
53 | | - trnaprot = file(params.trnaprot) |
54 | | - reference = file(params.reference) |
55 | | - |
56 | | - log.info "" |
57 | | - log.info "R E P E A T M A S K I N G ~ version " + VERSION |
58 | | - log.info "reference input : ${params.reference}" |
59 | | - log.info "output directory : ${params.outdir}" |
60 | | - log.info "" |
| 45 | + VERSION = "1.0.0" |
| 46 | + |
| 47 | + params.reference = 'data/example/genome.fasta' |
| 48 | + params.trnaprot = 'http://www.hrt.msu.edu/uploads/535/78637/Tpases020812.gz' |
| 49 | + params.trnanuc = 'http://gtrnadb2009.ucsc.edu/download/tRNAs/eukaryotic-tRNAs.fa.gz' |
| 50 | + params.outdir = 'output' |
| 51 | + params.minRepLength = 250 |
| 52 | + |
| 53 | + trnanuc = file(params.trnanuc) |
| 54 | + trnaprot = file(params.trnaprot) |
| 55 | + reference = file(params.reference) |
| 56 | + |
| 57 | + log.info "" |
| 58 | + log.info "R E P E A T M A S K I N G ~ version " + VERSION |
| 59 | + log.info "reference input : ${params.reference}" |
| 60 | + log.info "output directory : ${params.outdir}" |
| 61 | + log.info "" |
61 | 62 | #+END_SRC |
62 | 63 |
|
63 | 64 | ** Collection of relatively recent LTR retrotranposons |
|
539 | 540 | #+END_SRC |
540 | 541 |
|
541 | 542 | #+BEGIN_SRC groovy :tangle ../main.nf |
542 | | - identityUnknown = Channel.create() |
543 | | - identityKnown = Channel.create() |
544 | | - |
545 | | - rmOutput |
546 | | - .splitFasta(record: [id: true, text: true]) |
547 | | - .choice(identityUnknown, identityKnown) { record -> record.id =~ /#Unknown/ ? 0 : 1 } |
548 | | - |
549 | | - repeatmaskerUnknowns = identityUnknown.collectFile() { record -> ['unknown.fasta', record.text] } |
550 | | - repeatmaskerKnowns = identityKnown.collectFile() { record -> ['known.fasta', record.text] } |
| 543 | + identityUnknown = Channel.create() |
| 544 | + identityKnown = Channel.create() |
| 545 | + |
| 546 | + rmOutput |
| 547 | + .splitFasta(record: [id: true, text: true]) |
| 548 | + .choice(identityUnknown, identityKnown) { record -> record.id =~ /#Unknown/ ? 0 : 1 } |
| 549 | + |
| 550 | + identityUnknown |
| 551 | + .collectFile() { record -> ['unknown.fasta', record.text] } |
| 552 | + .set { repeatmaskerUnknowns } |
| 553 | + |
| 554 | + identityKnown |
| 555 | + .collectFile() { record -> ['known.fasta', record.text] } |
| 556 | + .tap { repeatmaskerKnowns1 } |
| 557 | + .set{ repeatmaskerKnowns } |
| 558 | + #+END_SRC |
| 559 | + |
| 560 | + We'd like to get a better idea about what these 'unknown elements' |
| 561 | + are. It's possible that they are known transposons that have been |
| 562 | + RIPped beyond recognition. To retrieve these, we can pull out an |
| 563 | + ailgnment of these repeat instances (with RepeatMasker), run [[https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-11-655][DeRIP]] |
| 564 | + over each alignment and then push them back into the identification |
| 565 | + pipeline (RepeatModeler classify script). |
| 566 | + |
| 567 | + #+BEGIN_SRC groovy :tangle ../main.nf |
| 568 | + process searchForUnidentifiedElements { |
| 569 | + container 'robsyme/nf-repeatmasking' |
| 570 | + |
| 571 | + input: |
| 572 | + file 'genome.fasta' from reference |
| 573 | + file 'unknown_elements.fasta' from repeatmaskerKnowns1 |
| 574 | + |
| 575 | + output: |
| 576 | + set 'genome.fasta.align', 'unknown_elements.fasta' into unknownAlignments |
| 577 | + |
| 578 | + """ |
| 579 | + RepeatMasker \ |
| 580 | + -lib unknown_elements.fasta \ |
| 581 | + -alignments \ |
| 582 | + -nolow \ |
| 583 | + -no_is \ |
| 584 | + -dir . \ |
| 585 | + -inv \ |
| 586 | + genome.fasta |
| 587 | + """ |
| 588 | + } |
| 589 | + |
| 590 | + process derip { |
| 591 | + input: |
| 592 | + set 'genome.fasta.align', 'unknown_elements.fasta' from unknownAlignments |
| 593 | + |
| 594 | + output: |
| 595 | + file 'deripped.unknowns.fasta' into derippedUnknowns |
| 596 | + |
| 597 | + """ |
| 598 | + rmalignment_to_fasta.rb genome.fasta.align unknown_elements.fasta |
| 599 | + for file in alignment*; do |
| 600 | + derip.rb \$file |
| 601 | + done > deripped.unknowns.fasta |
| 602 | + """ |
| 603 | + } |
| 604 | + |
| 605 | + process classifyDeripped { |
| 606 | + container 'repeats' |
| 607 | + |
| 608 | + input: |
| 609 | + file 'transposases.fasta.gz' from trnaprot |
| 610 | + file 'repeatmodeler_unknowns.deripped.fasta' from derippedUnknowns |
| 611 | + |
| 612 | + output: |
| 613 | + file 'identified_elements.txt' into identifiedDerippedTransposons |
| 614 | + file 'unknown_elements.txt' into unidentifiedDerippedTransposons |
| 615 | + |
| 616 | + """ |
| 617 | + zcat transposases.fasta.gz > transposases.fasta |
| 618 | + makeblastdb \ |
| 619 | + -in transposases.fasta \ |
| 620 | + -dbtype prot |
| 621 | + blastx \ |
| 622 | + -query repeatmodeler_unknowns.deripped.fasta \ |
| 623 | + -db transposases.fasta \ |
| 624 | + -evalue 1e-10 \ |
| 625 | + -num_descriptions 10 \ |
| 626 | + -num_threads ${task.cpus} \ |
| 627 | + -out modelerunknown_blast_results.txt |
| 628 | + transposon_blast_parse.pl \ |
| 629 | + --blastx modelerunknown_blast_results.txt \ |
| 630 | + --modelerunknown repeatmodeler_unknowns.deripped.fasta |
| 631 | + """ |
| 632 | + } |
| 633 | + |
| 634 | + identifiedDerippedTransposons.subscribe{ println("Identified, deripped: ${it}") } |
| 635 | + |
551 | 636 | #+END_SRC |
552 | 637 |
|
553 | 638 | #+BEGIN_SRC groovy :tangle ../main.nf |
|
561 | 646 |
|
562 | 647 | output: |
563 | 648 | file 'identified_elements.txt' into identifiedTransposons |
| 649 | + file 'unknown_elements.txt' into unidentifiedTransposons |
564 | 650 |
|
565 | 651 | """ |
566 | 652 | zcat transposases.fasta.gz > transposases.fasta |
|
579 | 665 | --modelerunknown repeatmodeler_unknowns.fasta |
580 | 666 | """ |
581 | 667 | } |
| 668 | + |
| 669 | + #+END_SRC |
| 670 | + |
| 671 | +** Noncoding RNA |
| 672 | + |
| 673 | + We'll borrow the excellent ncRNA prediction steps from the |
| 674 | + excellent [[https://github.com/sanger-pathogens/companion][Companion annotation pipeline]]. |
| 675 | + |
| 676 | + #+BEGIN_SRC groovy :tangle ../main.nf |
| 677 | + process predict_ncRNA { |
| 678 | + container 'sangerpathogens/companion:latest' |
| 679 | + |
| 680 | + input: |
| 681 | + file 'genome.fasta' from reference |
| 682 | + |
| 683 | + output: |
| 684 | + file 'cm_out' into cmtblouts |
| 685 | + |
| 686 | + """ |
| 687 | + cp /opt/data/cm/rnas.cm models.cm |
| 688 | + cmpress -F models.cm |
| 689 | + cmsearch --cpu 1 --tblout cm_out --cut_ga models.cm genome.fasta |
| 690 | + """ |
| 691 | + } |
| 692 | + |
| 693 | + process merge_ncrnas { |
| 694 | + container 'sangerpathogens/companion:latest' |
| 695 | + cache 'deep' |
| 696 | + |
| 697 | + input: |
| 698 | + file 'cmtblout' from cmtblouts |
| 699 | + |
| 700 | + output: |
| 701 | + file 'ncrna.gff3' into ncrnafile |
| 702 | + |
| 703 | + """ |
| 704 | + infernal_to_gff3.lua < ${cmtblout} > 1 |
| 705 | + gt gff3 -sort -tidy -retainids 1 > ncrna.gff3 |
| 706 | + """ |
| 707 | + } |
| 708 | + |
| 709 | + ncrnafile.println() |
582 | 710 | #+END_SRC |
583 | 711 |
|
584 | 712 | ** Final Masking |
|
592 | 720 | #+END_SRC |
593 | 721 |
|
594 | 722 | #+BEGIN_SRC groovy :tangle ../main.nf |
595 | | - process repeatMaskerKnowns { |
596 | | - container 'robsyme/nf-repeatmasking' |
597 | | - publishDir "${params.outdir}/repeatMaskerKnowns", mode: 'copy' |
598 | | - |
599 | | - input: |
600 | | - file 'reference.fasta' from reference |
601 | | - set 'knownTransposons.lib', 'allLTRs.lib' from knownRepeats |
602 | | - |
603 | | - output: |
604 | | - set 'reference.fasta.out', 'reference.fasta.masked' into repeatMaskerKnownsMasked |
605 | | - set 'reference.fasta.out.gff', 'knownRepeats.library.fasta' |
606 | | - |
607 | | - """ |
608 | | - cat *.lib > knownRepeats.library.fasta |
609 | | - RepeatMasker \ |
610 | | - -lib knownRepeats.library.fasta \ |
611 | | - -nolow \ |
612 | | - -no_is \ |
613 | | - -dir . \ |
614 | | - -gff \ |
615 | | - -s \ |
616 | | - reference.fasta |
617 | | - """ |
618 | | - } |
| 723 | + process repeatMaskerKnowns { |
| 724 | + container 'robsyme/nf-repeatmasking' |
| 725 | + publishDir "${params.outdir}/repeatMaskerKnowns", mode: 'copy' |
| 726 | + |
| 727 | + input: |
| 728 | + file 'reference.fasta' from reference |
| 729 | + set 'knownTransposons.lib', 'allLTRs.lib' from knownRepeats |
| 730 | + |
| 731 | + output: |
| 732 | + set 'reference.fasta.out', 'reference.fasta.masked' into repeatMaskerKnownsMasked |
| 733 | + set 'reference.fasta.out.gff', 'knownRepeats.library.fasta' |
| 734 | + |
| 735 | + """ |
| 736 | + cat *.lib > knownRepeats.library.fasta |
| 737 | + RepeatMasker \ |
| 738 | + -lib knownRepeats.library.fasta \ |
| 739 | + -nolow \ |
| 740 | + -no_is \ |
| 741 | + -dir . \ |
| 742 | + -gff \ |
| 743 | + -s \ |
| 744 | + reference.fasta |
| 745 | + """ |
| 746 | + } |
| 747 | + |
| 748 | + process removeShortMatches { |
| 749 | + container 'robsyme/nf-repeatmasking' |
| 750 | + input: |
| 751 | + file 'reference.fa' from reference |
| 752 | + set 'rm.out', 'rm.masked' from repeatMaskerKnownsMasked |
| 753 | + |
| 754 | + output: |
| 755 | + set 'rm.trimmed.out', 'rm.trimmed.masked' from repeatMaskerKnownsMaskedTrimmed |
| 756 | + |
| 757 | + """ |
| 758 | + head -n 3 rm.out > rm.trimmed.out |
| 759 | + tail -n +4 rm.out | awk '\$7 - \$6 > ${params.minRepLength}' >> rm.trimmed.out |
| 760 | + tail -n +4 rm.out | awk 'BEGIN{OFS="\\t"} \$7 - \$6 > ${params.minRepLength} {print $5, $6, $7}' >> rm.trimmed.bed |
| 761 | + maskFastaFromBed -fi reference.fa -bed rm.trimmed.bed -fo rm.trimmed.masked -soft |
| 762 | + """ |
| 763 | + } |
619 | 764 | #+END_SRC |
620 | 765 |
|
621 | 766 | #+BEGIN_SRC groovy :tangle ../main.nf |
|
624 | 769 |
|
625 | 770 | input: |
626 | 771 | file 'reference.fa' from reference |
627 | | - set 'rm.out', 'rm.masked' from repeatMaskerKnownsMasked |
| 772 | + set 'rm.out', 'rm.masked' from repeatMaskerKnownsMaskedTrimmed |
628 | 773 |
|
629 | 774 | output: |
630 | 775 | file 'summary.tsv' into repeatmaskerSummaryTable |
|
0 commit comments