diff --git a/INSTALL.sh b/INSTALL.sh
index 9c73968..da6d8c0 100644
--- a/INSTALL.sh
+++ b/INSTALL.sh
@@ -4,6 +4,11 @@
# params:
# 1 guppy version
+if [ -e "./mop_preprocess/bin/guppy_basecaller" ] ; then
+ echo "unlinking previously installed versions"
+ cd mop_preprocess/bin; find . -maxdepth 1 -type l | xargs rm; cd ../../
+fi
+
if [ x"$1" == x ]; then
GUPPY_VER='3.4.5'
else
@@ -11,24 +16,37 @@ else
fi
-wget https://cdn.oxfordnanoportal.com/software/analysis/ont-guppy_${GUPPY_VER}_linux64.tar.gz
+wget https://cdn.oxfordnanoportal.com/software/analysis/ont-guppy_${GUPPY_VER}_linux64.tar.gz
if [ $? -eq 0 ]; then
echo "INSTALLING GUPPY VERSION ${GUPPY_VER}"
else
- echo "GUPPY VERSION ${GUPPY_VER} is not found"
+ echo "GUPPY VERSION ${GUPPY_VER} is not found"
exit
fi
tar -zvxf ont-guppy_${GUPPY_VER}_linux64.tar.gz
-mv ont-guppy mop_preprocess/bin/
+
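+# download the extra basecalling models published on the Biocore public server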
+wget https://biocore.crg.eu/public/mop3_pub/models.tar
+mv models.tar mop_preprocess/guppy_models/
+cd mop_preprocess/guppy_models; tar -xvf models.tar; rm models.tar; cd ../../
+
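+# install guppy into a versioned folder and decompress the extra models into its data directory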
+mkdir -p mop_preprocess/bin/ont-guppy_${GUPPY_VER}
+mv ont-guppy/* mop_preprocess/bin/ont-guppy_${GUPPY_VER}
+for i in mop_preprocess/guppy_models/*.gz; do gzip -cd $i > mop_preprocess/bin/ont-guppy_${GUPPY_VER}/data/`basename $i .gz`; done
+rmdir ont-guppy
+
cd mop_preprocess/bin
-ln -sf ont-guppy/bin/guppy_* .
-ln -sf ont-guppy/lib/* .
+ln -s ont-guppy_${GUPPY_VER}/bin/guppy_* .
+ln -s ont-guppy_${GUPPY_VER}/lib/* .
cd ../../
-if [ ! -e "mop_preprocess/bin/ont-guppy/lib/libz.so" ] ; then
- unlink mop_preprocess/bin/ont-guppy/lib/libz.so
- cd mop_preprocess/bin/ont-guppy/lib/
- ln -sf libz.so.1 libz.so
- cd ../../../../
+
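+# some guppy releases ship only libz.so.1; make sure a libz.so symlink exists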
+if [ ! -e "./mop_preprocess/bin/ont-guppy_${GUPPY_VER}/lib/libz.so" ] ; then
+ if [ -e "./mop_preprocess/bin/ont-guppy_${GUPPY_VER}/lib/libz.so.1" ] ; then
+ unlink mop_preprocess/bin/ont-guppy_${GUPPY_VER}/lib/libz.so
+ cd mop_preprocess/bin/ont-guppy_${GUPPY_VER}/lib/
+ ln -s libz.so.1 libz.so
+ cd ../../../../
+ fi
fi
-rm ont-guppy_${GUPPY_VER}_linux64.tar.gz
+
+rm ./ont-guppy_${GUPPY_VER}_linux64.tar.gz
diff --git a/LICENSE b/LICENSE
index b2b7cf2..c218e7e 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
MIT License
-Copyright (c) 2021 Biocore@CRG
+Copyright (c) 2022 Biocore@CRG
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index 4972374..670feb2 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,6 @@
-# MoP2- DSL2 version of Master of Pores
+# MoP3 - Master of Pores 3
[](https://cloud.docker.com/u/biocorecrg/repository/docker/biocorecrg/nanopore/builds)
-[](https://github.com/biocorecrg/MoP2/actions/workflows/build.yml)
-[](https://opensource.org/licenses/MIT)
+[](https://github.com/biocorecrg/master_of_pores/actions/workflows/build.yml)[](https://opensource.org/licenses/MIT)
[](https://www.nextflow.io/)
[](https://www.nextflow.io/)
[](https://www.sylabs.io/)
@@ -9,10 +8,22 @@
-
-Inspired by Metallica's [Master Of Puppets](https://www.youtube.com/watch?v=S7blkui3nQc)
+
+
+
+Master of Pores is a pipeline written in Nextflow DSL2 for the analysis of Nanopore data.
+
+
+It can handle reads from direct RNAseq, cDNAseq, DNAseq etc.
+
+
+
+
+
+
+The name is inspired by Metallica's [Master Of Puppets](https://www.youtube.com/watch?v=S7blkui3nQc)
## Install
Please install nextflow and singularity or docker before.
@@ -20,18 +31,13 @@ Please install nextflow and singularity or docker before.
Then download the repo:
```
-git clone --depth 1 --recurse-submodules git@github.com:biocorecrg/MOP2.git
-```
-
-or
-```
-git clone --depth 1 --recurse-submodules https://github.com/biocorecrg/MOP2.git
+git clone --depth 1 --recurse-submodules https://github.com/biocorecrg/master_of_pores.git
```
-You can use INSTALL.sh to download the version 3.4.5 of guppy or you can replace it with the version you prefer. Please consider that the support of VBZ compression of fast5 started with version 3.4.X.
+You can use INSTALL.sh to download guppy version 3.4.5 (the default), or pass the version you prefer as the first argument. Please note that support for VBZ compression of fast5 files started with version 3.4.X.
```
-cd MOP2; sh INSTALL.sh 3.4.5
+cd master_of_pores; sh INSTALL.sh
```
## Testing
@@ -39,9 +45,23 @@ You can replace ```-with-singularity``` with ```-with-docker``` if you want to u
```
cd mop_preprocess
-nextflow run mop_preprocess.nf -with-singularity -bg -profile local > log
+nextflow run mop_preprocess.nf -with-singularity -bg -profile local -params-file params.yaml > log
+```
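+
+Several execution profiles are provided in the `conf` folder (e.g. local, sge, slurm, crg, awsbatch). For example, on a SLURM cluster you could run:
+```
+nextflow run mop_preprocess.nf -with-singularity -bg -profile slurm -params-file params.yaml > log
+```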
+
+## Upgrading
+To upgrade the tool you can type:
```
+git pull --recurse-submodules
+```
+
+## Documentation
+The documentation is available at [https://biocorecrg.github.io/master_of_pores/](https://biocorecrg.github.io/master_of_pores/)
+
+## Contact
+Please open an [issue](https://github.com/biocorecrg/master_of_pores/issues) if you run into any problems.
+Before opening a new issue, please check the existing ones (including closed issues): your question might already have been answered. Thank you!
+
## Reference
If you use this tool, please cite our papers:
@@ -52,7 +72,3 @@ Cozzuto L, Delgado-Tejedor A, Hermoso Pulido T, Novoa EM, Ponomarenko J. *N. Met
["MasterOfPores: A Workflow for the Analysis of Oxford Nanopore Direct RNA Sequencing Datasets"
Luca Cozzuto, Huanle Liu, Leszek P. Pryszcz, Toni Hermoso Pulido, Anna Delgado-Tejedor, Julia Ponomarenko, Eva Maria Novoa.
*Front. Genet., 17 March 2020.* https://doi.org/10.3389/fgene.2020.00211](https://www.frontiersin.org/articles/10.3389/fgene.2020.00211/full)
-
-
-## Documentation
-The documentation is available at [https://biocorecrg.github.io/MOP2/docs/](https://biocorecrg.github.io/MOP2/docs/about.html)
diff --git a/anno/curlcake_constructs.fasta.gz b/anno/curlcake_constructs.fasta.gz
new file mode 100755
index 0000000..0653a9a
Binary files /dev/null and b/anno/curlcake_constructs.fasta.gz differ
diff --git a/anno/curlcake_constructs_EcoRV_BamHI_digestion.fasta.gz b/anno/curlcake_constructs_EcoRV_BamHI_digestion.fasta.gz
new file mode 100644
index 0000000..8049e11
Binary files /dev/null and b/anno/curlcake_constructs_EcoRV_BamHI_digestion.fasta.gz differ
diff --git a/anno/curlcake_constructs_EcoRV_BamHI_digestion.gtf.gz b/anno/curlcake_constructs_EcoRV_BamHI_digestion.gtf.gz
new file mode 100644
index 0000000..7334949
Binary files /dev/null and b/anno/curlcake_constructs_EcoRV_BamHI_digestion.gtf.gz differ
diff --git a/anno/yeast_rRNA_ref.gtf.gz b/anno/yeast_rRNA_ref.gtf.gz
new file mode 100644
index 0000000..378a0a3
Binary files /dev/null and b/anno/yeast_rRNA_ref.gtf.gz differ
diff --git a/conf.py b/conf.py
new file mode 100644
index 0000000..7f21a5f
--- /dev/null
+++ b/conf.py
@@ -0,0 +1,183 @@
+# -*- coding: utf-8 -*-
+#
+# Configuration file for the Sphinx documentation builder.
+#
+# This file does only contain a selection of the most common options. For a
+# full list see the documentation:
+# http://www.sphinx-doc.org/en/master/config
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = u'Master of Pores 3'
+copyright = u'2019, CRG'
+author = u'Luca Cozzuto, Anna Delgado, Eva Novoa'
+
+# The short X.Y version
+version = u''
+# The full version, including alpha/beta/rc tags
+release = u''
+
+
+# -- General configuration ---------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+ 'sphinx.ext.autodoc',
+ 'sphinx.ext.githubpages'
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+#source_suffix = ['.rst', '.html']
+source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'docs/index'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = None
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = "sphinx_rtd_theme"
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+html_copy_source = False
+html_show_sourcelink = False
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# The default sidebars (for documents that don't match any pattern) are
+# defined by theme itself. Builtin themes are using these templates by
+# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
+# 'searchbox.html']``.
+#
+# html_sidebars = {}
+
+
+# -- Options for HTMLHelp output ---------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'MasterOfPores3doc'
+
+
+# -- Options for LaTeX output ------------------------------------------------
+
+latex_elements = {
+ # The paper size ('letterpaper' or 'a4paper').
+ #
+ # 'papersize': 'letterpaper',
+
+ # The font size ('10pt', '11pt' or '12pt').
+ #
+ # 'pointsize': '10pt',
+
+ # Additional stuff for the LaTeX preamble.
+ #
+ # 'preamble': '',
+
+ # Latex figure (float) alignment
+ #
+ # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+# author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'MasterOfPores3.tex', u'Master of Pores 3 Documentation',
+     author, 'manual'),
+]
+
+
+# -- Options for manual page output ------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'masterofpores3', u'Master of Pores 3 Documentation',
+ [author], 1)
+]
+
+
+# -- Options for Texinfo output ----------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'MasterOfPores3', u'Master of Pores 3 Documentation',
+     author, 'MasterOfPores3', 'Nextflow DSL2 pipeline for the analysis of Nanopore data.',
+ 'Miscellaneous'),
+]
+
+
+# -- Options for Epub output -------------------------------------------------
+
+# Bibliographic Dublin Core info.
+epub_title = project
+
+# The unique identifier of the text. This can be a ISBN number
+# or the project homepage.
+#
+# epub_identifier = ''
+
+# A unique identification for the text.
+#
+# epub_uid = ''
+
+# A list of files that should not be packed into the epub file.
+epub_exclude_files = ['search.html']
+
+
+# -- Extension configuration -------------------------------------------------
diff --git a/conf/awsbatch.config b/conf/awsbatch.config
index c57ac27..95c02ec 100644
--- a/conf/awsbatch.config
+++ b/conf/awsbatch.config
@@ -3,6 +3,7 @@ aws.region = 'eu-central-1'
aws.batch.cliPath = '/home/ec2-user/miniconda/bin/aws'
process {
+ shell = ['/bin/bash', '-euo', 'pipefail']
executor = 'awsbatch'
queue = 'mop'
cpus = 1
@@ -13,32 +14,27 @@ process {
containerOptions = { workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g)': null}
withLabel: big_cpus {
- cpus = 2
- memory = '2G'
+ cpus = 8
+ memory = '12G'
}
withLabel: big_cpus_ignore {
errorStrategy = 'ignore'
- cpus = 2
- memory = '2G'
+ cpus = 8
+ memory = '12G'
}
withLabel: big_mem_cpus {
time = '6h'
- cpus = 3
- memory = '2G'
- }
-
- withLabel: big_time_cpus {
- time = '24h'
- cpus = 3
- memory = '2G'
+ cpus = 8
+ memory = '20G'
}
withLabel: demulti_gpus {
accelerator = 1
queue = 'mop-gpu'
memory = '2G'
+ // clusterOptions = {" -l gpu=1 -l virtual_free=${task.memory.toString().replaceAll(/[\sB]/,'')}"}
containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
( workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g) --gpus all': null ) } }
withLabel: basecall_gpus {
@@ -46,7 +42,7 @@ process {
queue = 'mop-gpu'
memory = '2G'
clusterOptions = '-l gpu=1'
- containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
+ containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
( workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g) --gpus all': null ) }
}
}
diff --git a/conf/awsbatch.config2 b/conf/awsbatch.config2
deleted file mode 100644
index 80feb8a..0000000
--- a/conf/awsbatch.config2
+++ /dev/null
@@ -1,47 +0,0 @@
-workDir = 's3://mop2-bucket-1/work'
-aws.region = 'eu-central-1'
-aws.batch.cliPath = '/home/ec2-user/miniconda/bin/aws'
-
-process {
- executor = 'awsbatch'
- queue = 'mop'
- cpus = 1
- memory='2G'
- cache='lenient'
-
- container = 'biocorecrg/mopprepr:0.7'
- containerOptions = { workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g)': null}
-
- withLabel: big_cpus {
- cpus = 2
- memory = '2G'
- }
-
- withLabel: big_cpus_ignore {
- errorStrategy = 'ignore'
- cpus = 2
- memory = '2G'
- }
-
- withLabel: big_mem_cpus {
- time = '6h'
- cpus = 2
- memory = '2G'
- }
-
- withLabel: demulti_gpus {
- accelerator = 1
- queue = 'mop-gpu'
- memory = '2G'
- // clusterOptions = {" -l gpu=1 -l virtual_free=${task.memory.toString().replaceAll(/[\sB]/,'')}"}
- containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
- ( workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g) --gpus all': null ) } }
- withLabel: basecall_gpus {
- accelerator = 1
- queue = 'mop-gpu'
- memory = '2G'
- clusterOptions = '-l gpu=1'
- containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
- ( workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g) --gpus all': null ) }
- }
-}
diff --git a/conf/ci.config b/conf/ci.config
index eb0a882..011b5e8 100644
--- a/conf/ci.config
+++ b/conf/ci.config
@@ -1,12 +1,13 @@
process {
+ shell = ['/bin/bash', '-euo', 'pipefail']
executor = 'local'
cpus = 2
- memory = '1.5GB'
+ memory = '1.5GB'
cache='lenient'
container = 'biocorecrg/mopprepr:0.7'
containerOptions = { workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g)': null}
withLabel: big_cpus_ignore {
errorStrategy = 'ignore'
-
+
}
}
diff --git a/conf/crg.config b/conf/crg.config
new file mode 100644
index 0000000..10e14bb
--- /dev/null
+++ b/conf/crg.config
@@ -0,0 +1,65 @@
+process {
+ shell = ['/bin/bash', '-euo', 'pipefail', '\n', 'hostname >&2', '\n']
+ executor = 'crg'
+ penv = 'smp'
+ queue = 'short-rocky9,long-centos79,biocore-el7,short-centos79'
+ cpus = 1
+ memory='12G'
+// clusterOptions = { "-l h_vmem=${task.memory.toString().replaceAll(/[\sB]/,'')}" }
+ cache='lenient'
+ container = 'biocorecrg/mopprepr:0.7'
+ containerOptions = { workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g)': null}
+ withLabel: big_mem_time {
+ memory = '60G'
+ time = '24h'
+ }
+ withLabel: big_time_cpus {
+ cpus = 8
+ memory = '60G'
+ time = '24h'
+ // clusterOptions = { "-l h_vmem=${task.memory.toString().replaceAll(/[\sB]/,'')}" }
+
+ }
+ withLabel: big_cpus {
+ cpus = 8
+ memory = '8G'
+ }
+ withLabel: big_mem {
+ cpus = 1
+ memory = '80G'
+ }
+ withLabel: big_cpus_retry {
+ errorStrategy = {task.attempt <= 3 ? 'retry' : 'ignore'}
+ cpus = 8
+ memory = {16.GB * task.attempt}
+ maxRetries = 3
+
+ }
+ withLabel: big_cpus_ignore {
+ errorStrategy = 'ignore'
+ cpus = 8
+ memory = '8G'
+ }
+ withLabel: big_mem_cpus {
+ time = '48h'
+ cpus = 8
+ memory = '60G'
+ }
+ withLabel: demulti_gpus {
+ queue = 'gpu'
+ memory = '12G'
+ // clusterOptions = {" -l gpu=1 -l virtual_free=${task.memory.toString().replaceAll(/[\sB]/,'')}"}
+ containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
+ ( workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g) --gpus all': null ) }
+ }
+ withLabel: basecall_gpus {
+ queue = 'gpu'
+ memory = '8G'
+ clusterOptions = '-l gpu=1'
+ containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
+ ( workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g) --gpus all': null ) }
+ }
+}
+singularity.enabled = true
+singularity.envWhitelist = "CUDA_VISIBLE_DEVICES,SGE_HGR_gpu"
+//singularity.cacheDir = "/software/bi/biocore_tools/singularity_containers/"
diff --git a/conf/local.config b/conf/local.config
index 52ccc40..b6f79ba 100644
--- a/conf/local.config
+++ b/conf/local.config
@@ -1,17 +1,14 @@
process {
+ shell = ['/bin/bash', '-euo', 'pipefail']
executor = 'local'
- cpus = 1
- memory = '1.5GB'
+ cpus = 3
+ memory = '4GB'
cache='lenient'
container = 'biocorecrg/mopprepr:0.7'
containerOptions = { workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g)': null}
withLabel: big_cpus_ignore {
errorStrategy = 'ignore'
-
- }
- withLabel: basecall_gpus {
- maxForks = 1
- containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
- ( workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g) --gpus all': null ) }
}
+
}
+singularity.cacheDir = "${projectDir}/../singularity"
diff --git a/conf/m1_apple.config b/conf/m1_apple.config
index b34451a..67e0d68 100644
--- a/conf/m1_apple.config
+++ b/conf/m1_apple.config
@@ -1,17 +1,17 @@
process {
executor = 'local'
cpus = 1
- memory = '1.5GB'
+ memory = '1.5GB'
cache='lenient'
container = 'biocorecrg/mopprepr:0.7'
containerOptions = { workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g) --platform linux/amd64': null}
withLabel: big_cpus_ignore {
errorStrategy = 'ignore'
-
+
}
withLabel: basecall_gpus {
maxForks = 1
containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
- ( workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g) --gpus all': null ) }
+ ( workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g) --gpus all': null ) }
}
}
diff --git a/conf/newcrg.config b/conf/newcrg.config
new file mode 100644
index 0000000..c5f3e12
--- /dev/null
+++ b/conf/newcrg.config
@@ -0,0 +1,67 @@
+process {
+ shell = ['/bin/bash', '-euo', 'pipefail', '\n', 'hostname >&2', '\n']
+ executor = "slurm"
+ cpus = 1
+ memory='12G'
+ queue='genoa64'
+ cache='lenient'
+ time = '6h'
+ container = 'biocorecrg/mopprepr:0.7'
+ containerOptions = { workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g)': null}
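+    // pick the SLURM QoS according to the requested walltime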
+ clusterOptions = { task.time <= 3.h ? '--qos=shorter' :
+ (task.time <= 6.h ? '--qos=short' :
+ (task.time <= 12.h ? ' --qos=normal' :
+ (task.time <= 24.h ? '--qos=long' :
+ (task.time <= 48.h ? '--qos=vlong' : '--qos=marathon' )))) }
+
+ withLabel: big_mem_time {
+ memory = '60G'
+ time = '24h'
+ }
+ withLabel: big_time_cpus {
+ cpus = 8
+ memory = '60G'
+ time = '24h'
+
+ }
+ withLabel: big_cpus {
+ cpus = 8
+ memory = '8G'
+ }
+ withLabel: big_mem {
+ cpus = 1
+ memory = '80G'
+ }
+ withLabel: big_cpus_retry {
+ errorStrategy = 'retry'
+ cpus = 8
+ memory = {8.GB * task.attempt}
+ maxRetries = 3
+
+ }
+ withLabel: big_cpus_ignore {
+ errorStrategy = 'ignore'
+ cpus = 8
+ memory = '8G'
+ }
+ withLabel: big_mem_cpus {
+ time = '6h'
+ cpus = 8
+ memory = '60G'
+ }
+ withLabel: demulti_gpus {
+ time = '6h'
+ memory = '12G'
+ containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
+ ( workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g) --gpus all': null ) }
+ }
+ withLabel: basecall_gpus {
+ time = '6h'
+ memory = '8G'
+ clusterOptions = '--gres=gpu:1g.10gb:1'
+ containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
+ ( workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g) --gpus all': null ) }
+ }
+}
+singularity.enabled = true
+singularity.envWhitelist = "CUDA_VISIBLE_DEVICES,SGE_HGR_gpu"
diff --git a/conf/sge.config b/conf/sge.config
index 1e68063..7c5f539 100644
--- a/conf/sge.config
+++ b/conf/sge.config
@@ -1,7 +1,8 @@
process {
+ shell = ['/bin/bash', '-euo', 'pipefail']
executor = 'sge'
penv = 'smp'
- queue = 'long-sl7,biocore-el7,short-sl7'
+ queue = 'short-rocky9,long-centos79,short-centos79'
cpus = 1
memory='12G'
clusterOptions = { "-l h_vmem=${task.memory.toString().replaceAll(/[\sB]/,'')}" }
@@ -10,7 +11,7 @@ process {
containerOptions = { workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g)': null}
withLabel: big_mem_time {
memory = '60G'
- time = '24h'
+ time = '24h'
}
withLabel: big_time_cpus {
cpus = 8
@@ -18,7 +19,7 @@ process {
time = '24h'
clusterOptions = { "-l h_vmem=${task.memory.toString().replaceAll(/[\sB]/,'')}" }
- }
+ }
withLabel: big_cpus {
cpus = 8
memory = '8G'
@@ -31,7 +32,6 @@ process {
errorStrategy = 'retry'
cpus = 8
memory = {8.GB * task.attempt}
- time = {6.h * task.attempt}
maxRetries = 3
}
@@ -40,17 +40,11 @@ process {
cpus = 8
memory = '8G'
}
- withLabel: big_mem_cpus {
+ withLabel: big_mem_cpus {
+ time = '6h'
cpus = 8
memory = '60G'
- }
-
- withLabel: big_mem_cpus_ret {
- cpus = 8
- memory = '60G'
- time = {6.h * task.attempt}
- }
-
+ }
withLabel: demulti_gpus {
queue = 'gpu'
memory = '20G'
@@ -58,7 +52,7 @@ process {
singularity.envWhitelist = "CUDA_VISIBLE_DEVICES,SGE_HGR_gpu"
containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
( workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g) --gpus all': null ) }
- }
+ }
withLabel: basecall_gpus {
queue = 'gpu'
memory = '8G'
@@ -66,5 +60,5 @@ process {
singularity.envWhitelist = "CUDA_VISIBLE_DEVICES,SGE_HGR_gpu"
containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
( workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g) --gpus all': null ) }
- }
+ }
}
diff --git a/conf/slurm.config b/conf/slurm.config
index 99b5526..a00fc0b 100644
--- a/conf/slurm.config
+++ b/conf/slurm.config
@@ -7,14 +7,14 @@ process {
containerOptions = { workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g)': null}
withLabel: big_mem_time {
memory = '60G'
- time = '24h'
+ time = '24h'
}
withLabel: big_time_cpus {
cpus = 8
memory = '60G'
time = '24h'
- }
+ }
withLabel: big_cpus {
cpus = 8
memory = '8G'
@@ -44,11 +44,12 @@ process {
singularity.envWhitelist = "CUDA_VISIBLE_DEVICES,SGE_HGR_gpu"
containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
( workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g) --gpus all': null ) }
- }
+ }
withLabel: basecall_gpus {
memory = '8G'
singularity.envWhitelist = "CUDA_VISIBLE_DEVICES,SGE_HGR_gpu"
containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
( workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g) --gpus all': null ) }
- }
+ }
}
+singularity.cacheDir = "$baseDir/../singularity"
diff --git a/conf/standard.config b/conf/standard.config
index 274fea2..f4a5525 100644
--- a/conf/standard.config
+++ b/conf/standard.config
@@ -1,4 +1,5 @@
process {
+ shell = ['/bin/bash', '-euo', 'pipefail']
cpus = 1
memory='8G'
cache='lenient'
@@ -13,7 +14,7 @@ process {
errorStrategy = 'ignore'
cpus = 8
memory = '8G'
- }
+ }
withLabel: big_mem_cpus {
time = '6h'
cpus = 8
@@ -27,6 +28,7 @@ process {
maxForks = 1
memory = '8G'
containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
- ( workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g) --gpus all': null ) }
+ ( workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g) --gpus all': null ) }
}
-}
\ No newline at end of file
+}
+singularity.cacheDir = "$baseDir/../singularity"
diff --git a/data/mod/batch_0.fast5 b/data/fast5/mod/batch_0.fast5
similarity index 100%
rename from data/mod/batch_0.fast5
rename to data/fast5/mod/batch_0.fast5
diff --git a/data/wt/batch_0.fast5 b/data/fast5/wt/batch_0.fast5
similarity index 100%
rename from data/wt/batch_0.fast5
rename to data/fast5/wt/batch_0.fast5
diff --git a/data/fast5_dem/test/FAT12104_2836aa20_0.fast5 b/data/fast5_dem/test/FAT12104_2836aa20_0.fast5
new file mode 100755
index 0000000..7f5d716
Binary files /dev/null and b/data/fast5_dem/test/FAT12104_2836aa20_0.fast5 differ
diff --git a/data/fast5_dem/test/FAT12104_2836aa20_1.fast5 b/data/fast5_dem/test/FAT12104_2836aa20_1.fast5
new file mode 100755
index 0000000..ad3b2e8
Binary files /dev/null and b/data/fast5_dem/test/FAT12104_2836aa20_1.fast5 differ
diff --git a/data/fast5_dna_dem/test/FAT12104_2836aa20_0.fast5 b/data/fast5_dna_dem/test/FAT12104_2836aa20_0.fast5
new file mode 100755
index 0000000..7f5d716
Binary files /dev/null and b/data/fast5_dna_dem/test/FAT12104_2836aa20_0.fast5 differ
diff --git a/data/fast5_dna_dem/test/FAT12104_2836aa20_1.fast5 b/data/fast5_dna_dem/test/FAT12104_2836aa20_1.fast5
new file mode 100755
index 0000000..ad3b2e8
Binary files /dev/null and b/data/fast5_dna_dem/test/FAT12104_2836aa20_1.fast5 differ
diff --git a/data/fast5_rna_dem/test/batch_0.fast5 b/data/fast5_rna_dem/test/batch_0.fast5
new file mode 100644
index 0000000..f308837
Binary files /dev/null and b/data/fast5_rna_dem/test/batch_0.fast5 differ
diff --git a/data/fastq/mod.fq.gz b/data/fastq/mod.fq.gz
new file mode 100644
index 0000000..5c9e85c
Binary files /dev/null and b/data/fastq/mod.fq.gz differ
diff --git a/data/fastq/wt.fq.gz b/data/fastq/wt.fq.gz
new file mode 100644
index 0000000..05511b2
Binary files /dev/null and b/data/fastq/wt.fq.gz differ
diff --git a/docker/Dockerfile_basecall b/docker/Dockerfile_basecall
index 2d4ef38..3053627 100644
--- a/docker/Dockerfile_basecall
+++ b/docker/Dockerfile_basecall
@@ -1,4 +1,5 @@
-FROM nvidia/cuda:10.1-base-ubuntu18.04
+#This will do biocorecrg/mopbasecall:0.2
+FROM nvidia/cuda:10.1-base-ubuntu18.04
MAINTAINER Leszek Pryszcz
MAINTAINER Luca Cozzuto
@@ -22,11 +23,11 @@ RUN pip install ont-fast5-api
RUN apt update && apt install -y p7zip-full libidn11 libgssapi-krb5-2
# Cleanup
-RUN apt-get clean && apt-get update && apt-get install -y locales && rm -rf /var/lib/apt/lists/*
+RUN apt-get clean && apt-get update && apt-get install -y locales && rm -rf /var/lib/apt/lists/*
RUN locale-gen en_US.UTF-8
ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:/usr/local/cuda-10.1/compat/
-ENV PYTHONPATH $PATH
+ENV PYTHONPATH $PATH
WORKDIR /
ENV LC_ALL=en_US.utf8
diff --git a/docker/Dockerfile_basecall_cuda11 b/docker/Dockerfile_basecall_cuda11
new file mode 100644
index 0000000..827bb42
--- /dev/null
+++ b/docker/Dockerfile_basecall_cuda11
@@ -0,0 +1,31 @@
+#This will do biocorecrg/mopbasecallc11:0.1
+FROM nvidia/cuda:11.5.1-base-ubuntu20.04
+
+MAINTAINER Leszek Pryszcz
+MAINTAINER Luca Cozzuto
+
+# Install Python3 & pip, curl
+RUN apt update && apt install -y python3-setuptools curl
+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 10
+RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && rm get-pip.py
+
+RUN mkdir -p /project
+WORKDIR /project
+
+#Installing ont_fast5_api
+RUN pip install --upgrade pip
+RUN pip install ont-fast5-api
+
+# Install guppy's dependencies
+RUN apt update && apt install -y p7zip-full libidn11 libgssapi-krb5-2
+
+# Cleanup
+RUN apt-get clean && apt-get update && apt-get install -y locales && rm -rf /var/lib/apt/lists/*
+RUN locale-gen en_US.UTF-8
+
+ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:/usr/local/cuda-11.5/compat/
+ENV PYTHONPATH $PATH
+WORKDIR /
+
+ENV LC_ALL=en_US.utf8
+ENV LANG=en_US.utf8
diff --git a/docker/Dockerfile_demulti b/docker/Dockerfile_demulti
index 54b4e63..491f8b6 100644
--- a/docker/Dockerfile_demulti
+++ b/docker/Dockerfile_demulti
@@ -14,7 +14,7 @@ RUN tar -zxf Python-${PYTHON_VERSION}.tgz; rm Python-${PYTHON_VERSION}.tgz ; cd
RUN ln -s /usr/local/bin/python3 /usr/local/bin/python; ln -s /usr/local/bin/pip3 /usr/local/bin/pip
RUN pip install --upgrade pip
-RUN pip install wheel biopython
+RUN pip install wheel biopython
RUN pip install parasail
RUN apt-get install -y git
RUN git clone https://github.com/rambaut/readucks.git
@@ -26,5 +26,5 @@ RUN pip install --upgrade pip setuptools
RUN pip install pyts numba==0.45.0 keras==2.2.4 scikit-learn pandas TensorFlow==1.13.1
# Cleanup
-RUN apt-get clean && apt-get update && apt-get install -y locales && rm -rf /var/lib/apt/lists/*
+RUN apt-get clean && apt-get update && apt-get install -y locales && rm -rf /var/lib/apt/lists/*
RUN locale-gen en_US.UTF-8
diff --git a/docker/Dockerfile_epinano b/docker/Dockerfile_epinano
index 1bf5a79..72e308e 100644
--- a/docker/Dockerfile_epinano
+++ b/docker/Dockerfile_epinano
@@ -1,10 +1,9 @@
-## this will do biocorecrg/mopepinano:0.2
FROM biocorecrg/centos-perlbrew-pyenv3-java:centos7
# File Author / Maintainer
-MAINTAINER Luca Cozzuto
+MAINTAINER Luca Cozzuto
-ARG EPINANO_VERSION=1.2.0
+ARG EPINANO_VERSION=1.1.1
#upgrade pip
@@ -12,22 +11,15 @@ RUN pip install -U pip
RUN pip install --upgrade setuptools
# Installing modules for EPINANO
-RUN pip install requests paho-mqtt
-RUN pip install atomicwrites==1.4.0 attrs==21.2.0 biopython==1.76 cloudpickle==1.3.0
-RUN pip install dask==2.5.2 fsspec==2021.6.1 future==0.17.1 h5py==2.10.0 importlib-metadata==4.6.1
-RUN pip install locket==0.2.1 more-itertools==8.8.0 numpy==1.17.2 pandas==0.24.2
-RUN pip install partd==1.2.0 pluggy==0.13.1 py==1.10.0 pysam==0.15.4 pytest==4.4.1 python-dateutil==2.8.1
-RUN pip install pytz==2021.1 scikit-learn==0.20.2 scipy==1.5.4 six==1.16.0 toolz==0.11.1 typing-extensions==3.10.0.0 zipp==3.5.0
-
-RUN yum install -y wget
-RUN wget https://github.com/enovoa/EpiNano/releases/download/Epinano${EPINANO_VERSION}/Epinano1.2.tgz
-RUN tar -zvxf *.tgz; rm *.tgz
-RUN chmod +x /project/Epinano1.2/*.py
-ENV PATH "${PATH}:/project/Epinano1.2/"
-ENV SAM2TSV "/project/Epinano1.2/misc/sam2tsv.jar"
+RUN pip install h5py==2.8.0 numpy==1.16.1 pandas==0.23.4 scikit-learn==0.20.2
+RUN yum install -y wget
+RUN wget https://github.com/enovoa/EpiNano/archive/refs/tags/epinano${EPINANO_VERSION}.tar.gz
+RUN tar -zvxf epinano${EPINANO_VERSION}.tar.gz; cd EpiNano-epinano${EPINANO_VERSION}/
+RUN chmod +x /project/EpiNano-epinano${EPINANO_VERSION}/scripts/*
+ENV PATH "${PATH}:/project/EpiNano-epinano${EPINANO_VERSION}/scripts/"
# Clean cache
-RUN yum clean all
+RUN yum clean all
#cleaning
RUN rm -fr *.tar.gz; rm -fr *.bz2; rm -fr ./tmp
@@ -35,4 +27,3 @@ RUN rm -rf /var/cache/yum
ENV LC_ALL=en_US.utf8
ENV LANG=en_US.utf8
-
diff --git a/docker/Dockerfile_htseq b/docker/Dockerfile_htseq
index fb64ab8..2dbcc02 100644
--- a/docker/Dockerfile_htseq
+++ b/docker/Dockerfile_htseq
@@ -2,7 +2,7 @@
FROM ubuntu:focal-20211006
# File Author / Maintainer
-MAINTAINER Luca Cozzuto
+MAINTAINER Luca Cozzuto
ARG HTSEQ_VERSION=30e9e9c
@@ -14,9 +14,9 @@ RUN apt install -y python3-pip
RUN pip install --upgrade pip
# Installing modules for EPINANO
-RUN pip install numpy pysam wheel
-RUN pip install matplotlib
-RUN pip install scipy
+RUN pip install numpy pysam wheel
+RUN pip install matplotlib
+RUN pip install scipy
RUN pip install anndata loompy cython
RUN apt install -y git
RUN git clone "https://github.com/htseq/htseq.git"; cd htseq; git checkout ${HTSEQ_VERSION}; python3 setup.py build; python3 setup.py install
@@ -24,11 +24,10 @@ RUN git clone "https://github.com/htseq/htseq.git"; cd htseq; git checkout ${HTS
# Clean cache
-#RUN yum clean all
+#RUN yum clean all
#cleaning
-RUN rm -fr htseq
+RUN rm -fr htseq
ENV LC_ALL=en_US.utf8
ENV LANG=en_US.utf8
-
diff --git a/docker/Dockerfile_nanocomp b/docker/Dockerfile_nanocomp
index 5f6d837..032fff0 100644
--- a/docker/Dockerfile_nanocomp
+++ b/docker/Dockerfile_nanocomp
@@ -1,7 +1,7 @@
FROM biocorecrg/centos-perlbrew-pyenv3-java:centos7
# File Author / Maintainer
-MAINTAINER Luca Cozzuto
+MAINTAINER Luca Cozzuto
ARG NANOCOMPORE_VERSION=v1.0.0rc3-1
ARG MEDAKA_VERSION=v0.11.5
@@ -16,11 +16,11 @@ RUN pip install nanocompore==${NANOCOMPORE_VERSION}
# Intaalling Medaka
RUN pip install medaka==${MEDAKA_VERSION}
-# Installing
+# Installing
RUN pip install NanopolishComp==${NANOPOLISHCOMP_VERSION}
# Clean cache
-RUN yum clean all
+RUN yum clean all
#cleaning
RUN rm -fr *.tar.gz; rm -fr *.bz2; rm -fr ./tmp
diff --git a/docker/Dockerfile_nanomod b/docker/Dockerfile_nanomod
index 4e625a6..5c136fb 100644
--- a/docker/Dockerfile_nanomod
+++ b/docker/Dockerfile_nanomod
@@ -1,11 +1,11 @@
-# This will do biocorecrg/mopmod:0.6.1
+# This will do biocorecrg/mopmod:0.6
FROM biocorecrg/centos-perlbrew-pyenv3-java:centos7
# File Author / Maintainer
-MAINTAINER Luca Cozzuto
+MAINTAINER Luca Cozzuto
ARG PICARD_VERSION=2.20.0
-ARG TOMBO_VERSION=1.5
+ARG TOMBO_VERSION=1.5
ARG SAMTOOLS_VERSION=1.4.1
ARG BEDTOOLS_VERSION=2.29.2
@@ -22,10 +22,10 @@ RUN pip install ont-tombo==${TOMBO_VERSION}
RUN yum install -y xz-devel.x86_64
RUN bash -c 'curl -k -L https://downloads.sourceforge.net/project/samtools/samtools/${SAMTOOLS_VERSION}/samtools-${SAMTOOLS_VERSION}.tar.bz2 > samtools.tar.bz2'
RUN tar -jvxf samtools.tar.bz2
-RUN cd samtools-${SAMTOOLS_VERSION}; ./configure; make; make install; cd ../
+RUN cd samtools-${SAMTOOLS_VERSION}; ./configure; make; make install; cd ../
RUN rm samtools.tar.bz2
-# install sam2tsv // version is last github
+# install sam2tsv // version is last github
RUN git clone "https://github.com/lindenb/jvarkit.git"; cd jvarkit; ./gradlew sam2tsv
# Installing Picard
@@ -35,7 +35,7 @@ RUN bash -c 'curl -k -L https://github.com/broadinstitute/picard/releases/downlo
RUN bash -c 'curl -k -L https://github.com/arq5x/bedtools2/releases/download/v${BEDTOOLS_VERSION}/bedtools.static.binary > /usr/local/bin/bedtools'
RUN chmod +x /usr/local/bin/bedtools
-RUN pip install h5py pandas pyarrow==0.12.1
+RUN pip install h5py pandas pyarrow
RUN pip install pyBigWig
# Add wigToBigWig
@@ -44,17 +44,17 @@ RUN chmod +x /usr/local/bin/wigToBigWig
# Clean cache
-RUN yum clean all
+RUN yum clean all
#cleaning
RUN rm -fr *.tar.gz; rm -fr *.bz2; rm -fr ./tmp
RUN rm -rf /var/cache/yum
-RUN rm -fr fastqc.zip samtools-*
+RUN rm -fr fastqc.zip samtools-*
#plugin
RUN yum install -y wget
RUN wget https://github.com/nanoporetech/vbz_compression/releases/download/v1.0.1/ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz; tar -zvxf ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz
-RUN mv ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/hdf5/ /usr/local/
+RUN mv ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/hdf5/ /usr/local/
ENV HDF5_PLUGIN_PATH "/usr/local/hdf5/lib/plugin"
#ENV LD_LIBRARY_PATH "/usr/local/hdf5/lib/plugin"
@@ -62,4 +62,3 @@ ENV SAM2TSV "java -jar /project/jvarkit/dist/sam2tsv.jar"
ENV PICARD "java -jar /usr/local/bin/picard.jar"
ENV LC_ALL=en_US.utf8
ENV LANG=en_US.utf8
-
diff --git a/docker/Dockerfile_nanomod2 b/docker/Dockerfile_nanomod2
new file mode 100644
index 0000000..3a00724
--- /dev/null
+++ b/docker/Dockerfile_nanomod2
@@ -0,0 +1,66 @@
+# This will do biocorecrg/mopmod:0.7
+FROM biocorecrg/centos-perlbrew-pyenv3-java:centos7
+
+# File Author / Maintainer
+MAINTAINER Luca Cozzuto
+
+ARG PICARD_VERSION=2.20.0
+ARG TOMBO_VERSION=1.5
+ARG SAMTOOLS_VERSION=1.4.1
+ARG BEDTOOLS_VERSION=2.29.2
+
+#upgrade pip
+RUN pip install -U pip
+RUN pip install --upgrade setuptools
+
+# Installing Tombo
+RUN pip install ont-fast5-api
+RUN pip install pyfaidx numpy
+RUN pip install ont-tombo==${TOMBO_VERSION}
+
+# Installing samtools
+RUN yum install -y xz-devel.x86_64
+RUN bash -c 'curl -k -L https://downloads.sourceforge.net/project/samtools/samtools/${SAMTOOLS_VERSION}/samtools-${SAMTOOLS_VERSION}.tar.bz2 > samtools.tar.bz2'
+RUN tar -jvxf samtools.tar.bz2
+RUN cd samtools-${SAMTOOLS_VERSION}; ./configure; make; make install; cd ../
+RUN rm samtools.tar.bz2
+
+# install sam2tsv // version is last github
+RUN git clone "https://github.com/lindenb/jvarkit.git"; cd jvarkit; ./gradlew sam2tsv
+
+# Installing Picard
+RUN bash -c 'curl -k -L https://github.com/broadinstitute/picard/releases/download/${PICARD_VERSION}/picard.jar > /usr/local/bin/picard.jar'
+
+# Installing BEDTOOLS
+RUN bash -c 'curl -k -L https://github.com/arq5x/bedtools2/releases/download/v${BEDTOOLS_VERSION}/bedtools.static.binary > /usr/local/bin/bedtools'
+RUN chmod +x /usr/local/bin/bedtools
+
+RUN pip install h5py pandas pyarrow
+RUN pip install pyBigWig
+RUN pip install polars
+RUN pip install duckdb==0.3.2
+
+# Add wigToBigWig
+RUN bash -c 'curl -k -L http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/wigToBigWig > /usr/local/bin/wigToBigWig'
+RUN chmod +x /usr/local/bin/wigToBigWig
+RUN yum install -y pigz
+
+# Clean cache
+RUN yum clean all
+
+#cleaning
+RUN rm -fr *.tar.gz; rm -fr *.bz2; rm -fr ./tmp
+RUN rm -rf /var/cache/yum
+RUN rm -fr fastqc.zip samtools-*
+
+#plugin
+RUN yum install -y wget
+RUN wget https://github.com/nanoporetech/vbz_compression/releases/download/v1.0.1/ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz; tar -zvxf ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz
+RUN mv ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/hdf5/ /usr/local/
+ENV HDF5_PLUGIN_PATH "/usr/local/hdf5/lib/plugin"
+#ENV LD_LIBRARY_PATH "/usr/local/hdf5/lib/plugin"
+
+ENV SAM2TSV "java -jar /project/jvarkit/dist/sam2tsv.jar"
+ENV PICARD "java -jar /usr/local/bin/picard.jar"
+ENV LC_ALL=en_US.utf8
+ENV LANG=en_US.utf8
diff --git a/docker/Dockerfile_nanopolish b/docker/Dockerfile_nanopolish
index ae6aef3..9b53939 100644
--- a/docker/Dockerfile_nanopolish
+++ b/docker/Dockerfile_nanopolish
@@ -1,7 +1,7 @@
FROM biocorecrg/centos-perlbrew-pyenv3-java:centos7
# File Author / Maintainer
-MAINTAINER Luca Cozzuto
+MAINTAINER Luca Cozzuto
ARG NANOPOLISH_VERSION=v0.13.3
ARG NANOPOLISHCOMP_VERSION=v0.6.2
@@ -11,32 +11,32 @@ ARG NANOPOLISHCOMP_VERSION=v0.6.2
RUN pip install -U pip
RUN pip install --upgrade setuptools
-# Installing
+# Installing
RUN pip install NanopolishComp==${NANOPOLISHCOMP_VERSION}
# Installing Nanopolish
-RUN yum install -y wget
+RUN yum install -y wget
RUN yum group install "Development Tools" -y
RUN git clone --recursive https://github.com/jts/nanopolish.git
RUN cd nanopolish; git checkout ${NANOPOLISH_VERSION}; make all
-RUN cd nanopolish; pip install -r scripts/requirements.txt
-ENV PATH "/project/nanopolish/:${PATH}"
+RUN cd nanopolish; pip install -r scripts/requirements.txt
+ENV PATH "/project/nanopolish/:${PATH}"
# install pigz
RUN yum install -y pigz
# Clean cache
-RUN yum clean all
+RUN yum clean all
#cleaning
RUN rm -fr *.tar.gz; rm -fr *.bz2; rm -fr ./tmp
RUN rm -rf /var/cache/yum
-RUN rm -fr fastqc.zip samtools-*
+RUN rm -fr fastqc.zip samtools-*
#plugin
RUN yum install -y wget
RUN wget https://github.com/nanoporetech/vbz_compression/releases/download/v1.0.1/ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz; tar -zvxf ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz
-RUN mv ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/hdf5/ /usr/local/
+RUN mv ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/hdf5/ /usr/local/
ENV HDF5_PLUGIN_PATH "/usr/local/hdf5/lib/plugin"
#ENV LD_LIBRARY_PATH "/usr/local/hdf5/lib/plugin"
diff --git a/docker/Dockerfile_nanotail b/docker/Dockerfile_nanotail
index 376fa95..6c0f734 100644
--- a/docker/Dockerfile_nanotail
+++ b/docker/Dockerfile_nanotail
@@ -1,7 +1,7 @@
FROM biocorecrg/centos-perlbrew-pyenv3-java:centos7
# File Author / Maintainer
-MAINTAINER Luca Cozzuto
+MAINTAINER Luca Cozzuto
ARG R_VERSION=3.6.0
ARG TAILFINDR_VERSION=af56cce
@@ -17,7 +17,7 @@ RUN Rscript -e "install.packages(c('data.table','flexdashboard','dplyr','plyr','
RUN Rscript -e "install.packages(c('devtools', 'futile.logger','readr','scales','viridis','yaml', 'ggplot2'), dependencies = TRUE, repos='http://cran.us.r-project.org')"
RUN Rscript -e "install.packages(c('BiocManager', 'snow'), repos='http://cran.us.r-project.org')"; Rscript -e "BiocManager::install()"
-# Installing tailfindr
+# Installing tailfindr
#plugin
RUN pip install --upgrade pip
@@ -26,7 +26,7 @@ RUN yum install -y hdf5-devel wget
RUN wget https://github.com/nanoporetech/vbz_compression/releases/download/v1.0.1/ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz; tar -zvxf ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz
RUN mv ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/hdf5/ /usr/local/
ENV HDF5_PLUGIN_PATH "/usr/local/hdf5/lib/plugin"
-#ENV LD_LIBRARY_PATH "/usr/local/hdf5/lib"
+#ENV LD_LIBRARY_PATH "/usr/local/hdf5/lib"
@@ -36,8 +36,8 @@ RUN Rscript -e "BiocManager::install(c('BiocGenerics', 'Biostrings', 'IRanges',
RUN bash -c 'curl -k -L https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-1.8/hdf5-1.8.21/src/hdf5-1.8.21.tar.gz > hdf5-1.8.21.tar.gz'
RUN tar -zvxf hdf5-1.8.21.tar.gz; cd hdf5-1.8.21; \
./configure --prefix=/usr/local/hdf5 --enable-fortran --enable-cxx; make; make install
-ENV LD_LIBRARY_PATH "/usr/local/hdf5/lib"
-ENV PATH "/usr/local/hdf5/bin":${PATH}
+ENV LD_LIBRARY_PATH "/usr/local/hdf5/lib"
+ENV PATH "/usr/local/hdf5/bin":${PATH}
RUN wget https://github.com/nanoporetech/vbz_compression/releases/download/v1.0.1/ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz; tar -zvxf ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz
RUN mv ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/hdf5/lib/plugin/* /usr/local/lib/plugin
@@ -59,7 +59,7 @@ RUN Rscript -e "install.packages(c('ggrepel', 'MASS', 'reshape2'), repos='http:/
# Clean cache
-RUN yum clean all
+RUN yum clean all
#cleaning
RUN rm -fr *.tar.gz; rm -fr *.bz2; rm -fr ./tmp
@@ -67,4 +67,3 @@ RUN rm -rf /var/cache/yum
ENV LC_ALL=en_US.utf8
ENV LANG=en_US.utf8
-
diff --git a/docker/Dockerfile_nanotail2 b/docker/Dockerfile_nanotail2
new file mode 100644
index 0000000..0351cd8
--- /dev/null
+++ b/docker/Dockerfile_nanotail2
@@ -0,0 +1,70 @@
+#This will do biocorecrg/moptail:1.3
+FROM biocorecrg/centos-perlbrew-pyenv3-java:centos7
+
+# File Author / Maintainer
+MAINTAINER Luca Cozzuto
+
+ARG R_VERSION=3.6.0
+ARG TAILFINDR_VERSION=v1.3
+
+#Installing R
+RUN yum install -y epel-release libxml2-devel libcurl-devel libcurl libcurl-devel
+RUN yum install R-${R_VERSION} -y
+RUN mkdir -p /usr/share/doc/R-${R_VERSION}/html
+# problem with curl 4.1 // must use 4.0
+RUN Rscript -e "install.packages('https://cran.r-project.org/src/contrib/Archive/curl/curl_4.0.tar.gz',repo=NULL,type='source')"
+
+RUN Rscript -e "install.packages(c('data.table','flexdashboard','dplyr','plyr','ggExtra','ggplot2','hexbin','knitr','optparse','RColorBrewer','reshape2'), repos='http://cran.us.r-project.org')"
+RUN Rscript -e "install.packages(c('devtools', 'futile.logger','readr','scales','viridis','yaml', 'ggplot2'), dependencies = TRUE, repos='http://cran.us.r-project.org')"
+RUN Rscript -e "install.packages(c('BiocManager', 'snow'), repos='http://cran.us.r-project.org')"; Rscript -e "BiocManager::install()"
+
+# Installing tailfindr
+
+#plugin
+RUN pip install --upgrade pip
+
+RUN yum install -y hdf5-devel wget
+RUN wget https://github.com/nanoporetech/vbz_compression/releases/download/v1.0.1/ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz; tar -zvxf ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz
+RUN mv ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/hdf5/ /usr/local/
+ENV HDF5_PLUGIN_PATH "/usr/local/hdf5/lib/plugin"
+#ENV LD_LIBRARY_PATH "/usr/local/hdf5/lib"
+
+
+
+RUN Rscript -e "BiocManager::install(c('BiocGenerics', 'Biostrings', 'IRanges', 'zlibbioc', 'XVector', 'bit', 'pryr'))"
+
+#RUN yum install -y hdf5-devel
+RUN bash -c 'curl -k -L https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-1.8/hdf5-1.8.21/src/hdf5-1.8.21.tar.gz > hdf5-1.8.21.tar.gz'
+RUN tar -zvxf hdf5-1.8.21.tar.gz; cd hdf5-1.8.21; \
+./configure --prefix=/usr/local/hdf5 --enable-fortran --enable-cxx; make; make install
+ENV LD_LIBRARY_PATH "/usr/local/hdf5/lib"
+ENV PATH "/usr/local/hdf5/bin":${PATH}
+
+RUN wget https://github.com/nanoporetech/vbz_compression/releases/download/v1.0.1/ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz; tar -zvxf ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz
+RUN mv ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/hdf5/lib/plugin/* /usr/local/lib/plugin
+ENV HDF5_PLUGIN_PATH "/usr/local/hdf5/lib/plugin"
+
+
+RUN Rscript -e "BiocManager::install(c('hdf5r'))"
+#RUN ciccio
+
+RUN Rscript -e "BiocManager::install(c('foreach', 'doParallel', 'doSNOW', 'rbokeh', 'psych', 'gistr', 'mnormt'))"
+
+RUN Rscript -e "BiocManager::install(c('Rsamtools'))"
+RUN Rscript -e "BiocManager::install(c('tidyr'))"
+RUN Rscript -e "devtools::install_github(\"adnaniazi/tailfindr@${TAILFINDR_VERSION}\", dependencies=FALSE, INSTALL_opts=c(\"--no-docs\", \"--no-help\"))"
+
+# Installing vennDiagram
+RUN Rscript -e "install.packages(c('argparse', 'gplots', 'VennDiagram'), dependencies = TRUE, repos='http://cran.us.r-project.org')"
+RUN Rscript -e "install.packages(c('ggrepel', 'MASS', 'reshape2'), repos='http://cran.us.r-project.org')"; Rscript -e "BiocManager::install()"
+
+
+# Clean cache
+RUN yum clean all
+
+#cleaning
+RUN rm -fr *.tar.gz; rm -fr *.bz2; rm -fr ./tmp
+RUN rm -rf /var/cache/yum
+
+ENV LC_ALL=en_US.utf8
+ENV LANG=en_US.utf8
diff --git a/docker/Dockerfile_nanotail3 b/docker/Dockerfile_nanotail3
new file mode 100644
index 0000000..54a15a9
--- /dev/null
+++ b/docker/Dockerfile_nanotail3
@@ -0,0 +1,71 @@
+#This will do biocorecrg/moptail:nano3p_4
+FROM biocorecrg/centos-perlbrew-pyenv3-java:centos7
+
+# File Author / Maintainer
+MAINTAINER Luca Cozzuto
+
+ARG R_VERSION=3.6.0
+ARG TAILFINDR_VERSION=v1.3
+
+#Installing R
+RUN yum install -y epel-release libxml2-devel libcurl-devel libcurl libcurl-devel
+RUN yum install R-${R_VERSION} -y
+RUN mkdir -p /usr/share/doc/R-${R_VERSION}/html
+# problem with curl 4.1 // must use 4.0
+RUN Rscript -e "install.packages('https://cran.r-project.org/src/contrib/Archive/curl/curl_4.0.tar.gz',repo=NULL,type='source')"
+
+RUN Rscript -e "install.packages(c('data.table','flexdashboard','dplyr','plyr','ggExtra','ggplot2','hexbin','knitr','optparse','RColorBrewer','reshape2'), repos='http://cran.us.r-project.org')"
+RUN Rscript -e "install.packages(c('devtools', 'futile.logger','readr','scales','viridis','yaml', 'ggplot2'), dependencies = TRUE, repos='http://cran.us.r-project.org')"
+RUN Rscript -e "install.packages(c('BiocManager', 'snow'), repos='http://cran.us.r-project.org')"; Rscript -e "BiocManager::install()"
+
+# Installing tailfindr
+
+#plugin
+RUN pip install --upgrade pip
+
+RUN yum install -y hdf5-devel wget
+RUN wget https://github.com/nanoporetech/vbz_compression/releases/download/v1.0.1/ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz; tar -zvxf ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz
+RUN mv ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/hdf5/ /usr/local/
+ENV HDF5_PLUGIN_PATH "/usr/local/hdf5/lib/plugin"
+#ENV LD_LIBRARY_PATH "/usr/local/hdf5/lib"
+
+
+
+RUN Rscript -e "BiocManager::install(c('BiocGenerics', 'Biostrings', 'IRanges', 'zlibbioc', 'XVector', 'bit', 'pryr'))"
+
+#RUN yum install -y hdf5-devel
+RUN bash -c 'curl -k -L https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-1.8/hdf5-1.8.21/src/hdf5-1.8.21.tar.gz > hdf5-1.8.21.tar.gz'
+RUN tar -zvxf hdf5-1.8.21.tar.gz; cd hdf5-1.8.21; \
+./configure --prefix=/usr/local/hdf5 --enable-fortran --enable-cxx; make; make install
+ENV LD_LIBRARY_PATH "/usr/local/hdf5/lib"
+ENV PATH "/usr/local/hdf5/bin":${PATH}
+
+RUN wget https://github.com/nanoporetech/vbz_compression/releases/download/v1.0.1/ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz; tar -zvxf ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz
+RUN mv ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/hdf5/lib/plugin/* /usr/local/lib/plugin
+ENV HDF5_PLUGIN_PATH "/usr/local/hdf5/lib/plugin"
+
+
+RUN Rscript -e "BiocManager::install(c('hdf5r'))"
+#RUN ciccio
+
+RUN Rscript -e "BiocManager::install(c('foreach', 'doParallel', 'doSNOW', 'rbokeh', 'psych', 'gistr', 'mnormt'))"
+
+RUN Rscript -e "BiocManager::install(c('Rsamtools'))"
+RUN Rscript -e "BiocManager::install(c('tidyr'))"
+RUN echo "force upgrade 4"
+RUN Rscript -e "devtools::install_github(\"adnaniazi/tailfindr\", ref=\"nano3p-seq\", dependencies=FALSE, INSTALL_opts=c(\"--no-docs\", \"--no-help\"))"
+
+# Installing vennDiagram
+RUN Rscript -e "install.packages(c('argparse', 'gplots', 'VennDiagram'), dependencies = TRUE, repos='http://cran.us.r-project.org')"
+RUN Rscript -e "install.packages(c('ggrepel', 'MASS', 'reshape2'), repos='http://cran.us.r-project.org')"; Rscript -e "BiocManager::install()"
+
+
+# Clean cache
+RUN yum clean all
+
+#cleaning
+RUN rm -fr *.tar.gz; rm -fr *.bz2; rm -fr ./tmp
+RUN rm -rf /var/cache/yum
+
+ENV LC_ALL=en_US.utf8
+ENV LANG=en_US.utf8
diff --git a/docker/Dockerfile_nanotail4 b/docker/Dockerfile_nanotail4
new file mode 100644
index 0000000..26e9b9e
--- /dev/null
+++ b/docker/Dockerfile_nanotail4
@@ -0,0 +1,46 @@
+FROM rocker/verse:4.2.3
+
+RUN R -e 'install.packages("remotes")'
+RUN R -e 'remotes::install_cran("rlang")'
+RUN apt-get update && apt-get install -y libcurl4-openssl-dev libhdf5-dev libicu-dev libssl-dev make pandoc zlib1g-dev && rm -rf /var/lib/apt/lists/*
+RUN mkdir -p /usr/local/lib/R/etc/ /usr/lib/R/etc/
+RUN echo "options(repos = c(CRAN = 'https://cran.rstudio.com/'), download.file.method = 'libcurl', Ncpus = 4)" | tee /usr/local/lib/R/etc/Rprofile.site | tee /usr/lib/R/etc/Rprofile.site
+RUN Rscript -e 'remotes::install_version("rbokeh")'
+
+RUN apt-get update && apt-get install patch
+#RUN Rscript -e "remotes::install_github('adnaniazi/tailfindr', ref='master')"
+
+RUN Rscript -e "install.packages(c('data.table','flexdashboard','dplyr','plyr','ggExtra','ggplot2','hexbin','knitr','optparse','RColorBrewer','reshape2'), repos='http://cran.us.r-project.org')"
+RUN Rscript -e "install.packages(c('devtools', 'futile.logger','readr','scales','viridis','yaml', 'ggplot2'), dependencies = TRUE, repos='http://cran.us.r-project.org')"
+RUN Rscript -e "install.packages(c('BiocManager', 'snow'), repos='http://cran.us.r-project.org')"; Rscript -e "BiocManager::install()"
+RUN Rscript -e "BiocManager::install(c('BiocGenerics', 'Biostrings', 'IRanges', 'zlibbioc', 'XVector', 'bit', 'pryr'))"
+RUN Rscript -e "BiocManager::install(c('hdf5r'))"
+RUN Rscript -e "BiocManager::install(c('foreach', 'doParallel', 'doSNOW', 'rbokeh', 'psych', 'gistr', 'mnormt'))"
+RUN Rscript -e "BiocManager::install(c('Rsamtools'))"
+RUN Rscript -e "BiocManager::install(c('tidyr'))"
+RUN Rscript -e "install.packages(c('remotes'), repos='http://cran.us.r-project.org')"
+RUN Rscript -e 'remotes::install_version("rbokeh", repos = "http://cran.us.r-project.org")'
+RUN Rscript -e "remotes::install_github(\"adnaniazi/tailfindr\", ref=\"nano3p-seq-r10\", dependencies=FALSE, INSTALL_opts=c(\"--no-docs\", \"--no-help\"))"
+
+
+
+RUN Rscript -e "install.packages(c('argparse', 'gplots', 'VennDiagram'), dependencies = TRUE, repos='http://cran.us.r-project.org')"
+RUN Rscript -e "install.packages(c('ggrepel', 'MASS', 'reshape2'), repos='http://cran.us.r-project.org')"; Rscript -e "BiocManager::install()"
+
+
+
+# Install VBZ plugin
+RUN apt-get update && apt-get install -y wget
+#RUN wget https://github.com/nanoporetech/vbz_compression/releases/download/v1.0.1/ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz && \
+ #tar -xzf ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz -C /usr/local/
+#ENV HDF5_PLUGIN_PATH "/usr/local/ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/hdf5/lib/plugin"
+
+#RUN yum install -y hdf5-devel wget
+RUN wget https://github.com/nanoporetech/vbz_compression/releases/download/v1.0.1/ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz; tar -zvxf ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz
+RUN mv ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/hdf5/ /usr/local/
+ENV HDF5_PLUGIN_PATH "/usr/local/hdf5/lib/plugin"
+ENV LD_LIBRARY_PATH "/usr/local/hdf5/lib"
+ENV PATH "/usr/local/hdf5/bin":${PATH}
+ENV HDF5_PLUGIN_PATH "/usr/local/hdf5/lib/plugin"
+ENV LC_ALL=en_US.utf8
+ENV LANG=en_US.utf8
diff --git a/docker/Dockerfile_preprocessing b/docker/Dockerfile_preprocessing
index bab1ee7..dc98e4f 100644
--- a/docker/Dockerfile_preprocessing
+++ b/docker/Dockerfile_preprocessing
@@ -2,7 +2,7 @@
FROM biocorecrg/centos-perlbrew-pyenv23-java:centos7
# File Author / Maintainer
-MAINTAINER Luca Cozzuto
+MAINTAINER Luca Cozzuto
ARG MULTIQC_VERSION=1.8
ARG SAMTOOLS_VERSION=1.10
@@ -39,14 +39,14 @@ RUN yum install -y xz-devel.x86_64
RUN bash -c 'curl -k -L https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VERSION}/samtools-${SAMTOOLS_VERSION}.tar.bz2 > samtools.tar.bz2'
#RUN bash -c 'curl -k -L https://downloads.sourceforge.net/project/samtools/samtools/${SAMTOOLS_VERSION}/samtools-${SAMTOOLS_VERSION}.tar.bz2 > samtools.tar.bz2'
RUN tar -jvxf samtools.tar.bz2
-RUN cd samtools-${SAMTOOLS_VERSION}; ./configure; make; make install; make install install-htslib; cd ../
+RUN cd samtools-${SAMTOOLS_VERSION}; ./configure; make; make install; make install install-htslib; cd ../
RUN rm samtools.tar.bz2
# Installing bcftools
RUN yum install -y xz-devel.x86_64
RUN bash -c 'curl -k -L https://github.com/samtools/bcftools/releases/download/${BCFTOOLS_VERSION}/bcftools-${BCFTOOLS_VERSION}.tar.bz2 > bcftools.tar.bz2'
RUN tar -jvxf bcftools.tar.bz2
-RUN cd bcftools-${BCFTOOLS_VERSION}; ./configure; make; make install; cd ../
+RUN cd bcftools-${BCFTOOLS_VERSION}; ./configure; make; make install; cd ../
RUN rm bcftools.tar.bz2
# install sam2tsv // version is github tag
@@ -85,7 +85,7 @@ RUN Rscript -e "install.packages(c('BiocManager', 'snow'), repos='http://cran.us
# Installing POREquality // version is github tag
RUN yum install -y pandoc
-RUN git clone https://github.com/carsweshau/POREquality; cd POREquality/; git checkout ${POREQUALITY_VERSION}; cd ../
+RUN git clone https://github.com/carsweshau/POREquality; cd POREquality/; git checkout ${POREQUALITY_VERSION}; cd ../
# Installing MinIONQC
RUN bash -c 'curl -k -L https://github.com/roblanf/minion_qc/releases/download/v${MINIONQC_VERSION}/MinIONQC.R > /usr/local/bin/MinIONQC.R'
@@ -95,10 +95,10 @@ RUN chmod +x /usr/local/bin/MinIONQC.R
RUN pip install ont-fast5-api==${ONTFAST5API_VERSION}
#install Nanoplot
-RUN pip install pandas
+RUN pip install pandas
RUN pip install NanoPlot==${NANOPLOT_VERSION}
-# Installing MULTIQC_VERSION
+# Installing MULTIQC_VERSION
RUN pip install multiqc==${MULTIQC_VERSION}
# Installing GraphicsMagick
@@ -117,14 +117,14 @@ RUN pip install HTSeq==${HTSEQ_VERSION}
RUN pip install medaka==${MEDAKA_VERSION}
# Clean cache
-RUN yum clean all
+RUN yum clean all
#cleaning
RUN rm -fr *.tar.gz; rm -fr *.bz2; rm -fr ./tmp
RUN rm -rf /var/cache/yum
-RUN rm -fr fastqc.zip samtools-*
+RUN rm -fr fastqc.zip samtools-*
-#ENV PATH "/project/ont-guppy_3.1.5_linux64/ont-guppy/bin/:${PATH}"
+#ENV PATH "/project/ont-guppy_3.1.5_linux64/ont-guppy/bin/:${PATH}"
ENV SAM2TSV "java -jar /project/jvarkit/dist/sam2tsv.jar"
ENV PICARD "java -jar /usr/local/bin/picard.jar"
ENV LC_ALL=en_US.utf8
diff --git a/docker/demxufastq/Dockerfile b/docker/demxufastq/Dockerfile
new file mode 100644
index 0000000..1b02f6d
--- /dev/null
+++ b/docker/demxufastq/Dockerfile
@@ -0,0 +1,13 @@
+#This will do biocorecrg/demxufastq:0.1
+FROM biocorecrg/centos-perlbrew-pyenv23-java:centos7
+
+# File Author / Maintainer
+MAINTAINER Luca Cozzuto
+
+#upgrade pip
+RUN pip install --upgrade pip
+RUN pip install pandas
+RUN pip install biopython
+RUN pip install datetime
+
+COPY *.py /usr/local/bin/
diff --git a/mop_preprocess/bin/extract_sequence_from_fastq.py b/docker/demxufastq/extract_sequence_from_fastq.py
similarity index 97%
rename from mop_preprocess/bin/extract_sequence_from_fastq.py
rename to docker/demxufastq/extract_sequence_from_fastq.py
index 6061c50..1f0c96e 100755
--- a/mop_preprocess/bin/extract_sequence_from_fastq.py
+++ b/docker/demxufastq/extract_sequence_from_fastq.py
@@ -1,8 +1,8 @@
#!/usr/bin/env python
+import sys
import gzip
-import os
import pprint
-import sys
+import os
usage = '''
created by Huanle and Luca for Master of Pores! :)
@@ -31,7 +31,7 @@ def fopen (f):
outprefix = os.path.splitext(sys.argv[2])[0]
outext = os.path.splitext(sys.argv[2])[1]
-if (outext == '.gz'):
+if (outext == '.gz'):
outprefix = os.path.splitext(outprefix)[0]
fh = fopen (sys.argv[2])
diff --git a/docker/demxufastq/fastq_split_by_barcode.py b/docker/demxufastq/fastq_split_by_barcode.py
new file mode 100755
index 0000000..89d6415
--- /dev/null
+++ b/docker/demxufastq/fastq_split_by_barcode.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+desc="""Split FastQ by barcode
+"""
+epilog="""Author: l.p.pryszcz+git@gmail.com
+Modified by Luca Cozzuto for MoP3
+"""
+
+import gzip, os, sys
+import pandas as pd
+from datetime import datetime
+from Bio import SeqIO
+
+def split_by_barcode(demux, fastq, outname, minbaseQ):
+ """Split FastQ file by barcode"""
+ outdir = os.path.dirname(outname)
+ if outdir and not os.path.isdir(outdir):
+ os.makedirs(outdir)
+
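+ # The demux TSV must provide "read_id", "barcode" and "baseQ" columns;
+ # keep only reads whose demux quality passes the threshold (read_id -> barcode).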
+ df = pd.read_csv(demux, sep="\t")
+ read2bc = {r: b for r, b in df.loc[df["baseQ"]>=minbaseQ, ["read_id", "barcode"]].to_numpy()}
+
+ outs = {}
+ b2c = {}
+ i = k = 0
+ info = "{:,} processed reads; saved {:,} reads with {} barcodes: {}\r"
+ for fn in fastq:
+ handle = gzip.open(fn, "rt") if fn.endswith(".gz") else open(fn, 'rt')
+ for i, r in enumerate(SeqIO.parse(handle, 'fastq'), i+1):
+ read_id = r.id
+ if not i%10000: sys.stderr.write(info.format(i, k, len(b2c), b2c))
+ if read_id not in read2bc: continue
+ bc = read2bc[read_id]
+ if(str(bc) != 'nan'):
+ if bc not in outs:
+ outs[bc] = gzip.open("%s.bc_%d.fastq.gz"%(outname, bc), "wt")
+ b2c[bc] = 0
+ outs[bc].write(r.format('fastq'))
+ b2c[bc] += 1
+ k += 1
+ sys.stderr.write(info.replace('\r','\n').format(i, k, len(b2c), b2c))
+ for bc, out in outs.items():
+ out.close()
+
+def main():
+ import argparse
+ usage = "%(prog)s -v" #usage=usage,
+ parser = argparse.ArgumentParser(description=desc, epilog=epilog, \
+ formatter_class=argparse.RawTextHelpFormatter)
+
+ parser.add_argument('--version', action='version', version='1.0a')
+ parser.add_argument("-v", "--verbose", action="store_true", help="verbose")
+ parser.add_argument("-i", "--demux", required=True, help="demux file name")
+ parser.add_argument("-f", "--fastq", nargs="+", help="input FastQ file(s)")
+ parser.add_argument("-o", "--outname", required=True,
+ help="output basename (.bc_?.fq.gz will be added)")
+ parser.add_argument("-b", "--minbaseQ", default=50, type=int,
+ help="minimum demux quality [%(default)s]")
+
+ o = parser.parse_args()
+ if o.verbose:
+ sys.stderr.write("Options: %s\n"%str(o))
+
+ split_by_barcode(o.demux, o.fastq, o.outname, o.minbaseQ)
+
+if __name__=='__main__':
+ t0 = datetime.now()
+ try:
+ main()
+ except KeyboardInterrupt:
+ sys.stderr.write("\nCtrl-C pressed! \n")
+ #except IOError as e:
+ # sys.stderr.write("I/O error({0}): {1}\n".format(e.errno, e.strerror))
+ dt = datetime.now()-t0
+ sys.stderr.write("#Time elapsed: %s\n"%dt)
diff --git a/docs/about.rst b/docs/about.rst
index ce4c075..86f016c 100644
--- a/docs/about.rst
+++ b/docs/about.rst
@@ -1,25 +1,42 @@
.. _home-page-about:
*******************
-About Master of Pores
+About Master of Pores 3
*******************
.. autosummary::
:toctree: generated
-Master of Pores is a pipeline writte in Nextflow DSL2 for the analysis of Nanopore data. It can handle reads from direct RNAseq, cDNAseq, DNAseq etc.
-The pipeline is composed by four modules:
- - mop_preprocess: preprocessing
+.. |docker| image:: https://img.shields.io/badge/Docker-v20.10.8-blue
+.. |status| image:: https://github.com/biocorecrg/master_of_pores/actions/workflows/build.yml/badge.svg
+.. |license| image:: https://img.shields.io/badge/License-MIT-yellow.svg
+.. |nver| image:: https://img.shields.io/badge/Nextflow-21.04.1-brightgreen
+.. |sing| image:: https://img.shields.io/badge/Singularity-v3.2.1-green.svg
+
+.. list-table::
+ :widths: 10 10 10 10 10
+ :header-rows: 0
+
+ * - |docker|
+ - |status|
+ - |license|
+ - |nver|
+ - |sing|
+
+`Master of Pores 3 `_ is a collection of pipelines written in Nextflow DSL2 for the analysis of Nanopore data. It can handle reads from direct RNAseq, cDNAseq, DNAseq etc.
+
+The software is composed of four pipelines:
+
+ - mop_preprocess: preprocessing of input data. Basecalling, demultiplexing, alignment, read counts, and more!
- mop_mod: detecting chemical modifications. It reads the output directly from mop_preprocess
- mop_tail: estimating polyA tail size. It reads the output directly from mop_preprocess
- mop_consensus: it generates a consensus from the predictions from mop_mod. It reads the output directly from mop_mod
-
The name is inspired by Metallica's `Master Of Puppets `_
-.. image:: ../img/master_red.jpg
- :width: 400
+.. image:: ../img/goku3.png
+ :width: 600
This is a joint project between `CRG bioinformatics core `_ and `Epitranscriptomics and RNA Dynamics research group `_.
@@ -27,9 +44,8 @@ This is a joint project between `CRG bioinformatics core `_ Luca Cozzuto, Huanle Liu, Leszek P. Pryszcz, Toni Hermoso Pulido, Anna Delgado-Tejedor, Julia Ponomarenko, Eva Maria Novoa. Front. Genet., 17 March 2020.
-
+If you use this tool, please cite our papers:
+`"Nanopore Direct RNA Sequencing Data Processing and Analysis Using MasterOfPores" `__ Cozzuto L, Delgado-Tejedor A, Hermoso Pulido T, Novoa EM, Ponomarenko J. N. Methods Mol Biol. 2023;2624:185-205. doi: 10.1007/978-1-0716-2962-8_13.
+`"MasterOfPores: A Workflow for the Analysis of Oxford Nanopore Direct RNA Sequencing Datasets" `_ Luca Cozzuto, Huanle Liu, Leszek P. Pryszcz, Toni Hermoso Pulido, Anna Delgado-Tejedor, Julia Ponomarenko, Eva Maria Novoa. Front. Genet., 17 March 2020.
diff --git a/docs/benchmark.rst b/docs/benchmark.rst
index 734c1c5..d16faa7 100644
--- a/docs/benchmark.rst
+++ b/docs/benchmark.rst
@@ -3,25 +3,25 @@ Benchmark
*******************
We tested MoP on two minION runs using the CRG's HPC where we can run up to 100 jobs in parallel (maximum 8 CPUs each) and using up to 10 GPU cards (GeForce RTX 2080 Ti). The test dataset was published at `ENA `_ with the accession `ERR5296640 `__ for pU samples and `ERR5303454 `__ for Nm samples.
-
+
.. list-table:: Dataset
- * -
+ * -
- MOP_PREPROCESS
- MOP_MOD
- MOP_TAIL
- MOP_CONSENSUS
* - Input data
- - 95 Gb
- - 137 Gb
- - 137 Gb
+ - 95 Gb
+ - 137 Gb
+ - 137 Gb
- 14 Mb
* - Execution time
- 10 hours
- 6 hours
- - 2.5 hours
+ - 2.5 hours
- 3 mins
* - Work folder
- 382 Gb
diff --git a/docs/changelog.rst b/docs/changelog.rst
index a9d20e6..d06a81e 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -7,6 +7,23 @@ CHANGELOG
.. autosummary::
:toctree: generated
+Version 3.0
+================
+* mop_preprocess
+ * We added a custom model for m6A basecalling. It is automatically installed when running INSTALL.sh. To use it you need to indicate ``--pars_tools "drna_tool_splice_m6A_opt.tsv"``
+ * We added support for CUDA 11 for guppy versions > 4.4.1.
+ * Added readucks for improving demultiplexing with guppy (optional).
+ * New parameter "barcodes" where you can specify a file with the barcodes to be kept. Example in **keep_barcodes.txt**
+ * Added a `new model for direct RNA basecalling `__.
+ * Added support for dorado basecalling. Demultiplexing is not yet supported.
+ * Guppy versions >= 6.5.x are also supported. There is no need to indicate different command lines for different guppy versions inside tool_opts: the pipeline will detect the version and act accordingly.
+ * pod5 files are supported for dorado and guppy >= 6.5.x. In this case no fast5 or stats files will be output, which limits the use of the other pipelines.
+
+* mop_tail
+ * We upgraded tailfindR to version 1.3.
+ * TailfindR can be used either in standard mode or in Nano3P-seq mode (R9 and R10 chemistry) by setting *tailfindr_mode* to: standard, n3ps_r9 or n3ps_r10.
+
+
Version 2.0
================
@@ -21,22 +38,21 @@ Version 2.0
* can demultiplex fast5 using guppy too
* deeplexicon can be run on GPU too
* Parameters of each tool are stored in a tsv file. We have different ones already pre-set for cDNA, DNA and dRNA (option **--pars_tools**)
- * Added new process **discovery** with bambu / isoquant for discovering and quantifying new transcripts.
+ * Added new process **discovery** with bambu / isoquant for discovering and quantifying new transcripts.
* demultiplexing, filtering, mapping, counting and discovery can be switched off by setting "NO" as a parameter
* saveSpace can be set to "YES" to reduce the amount of disk space required. **WARNING This will prevent the possibility to resume!**
* Merged old NanoPreprocess and NanoPreprocessSimple in **mop_preprocess**. Using fastq or fast5 will switch among the two executions.
* Htseq-count now accepts alignments generated by minimap2. https://github.com/htseq/htseq/issues/33
- * We can specify a **final_summary_**.txt** for extracting kit and flowcell info in the params.config file. If not present we should specify those info or a custom model via extra parameters in one of the **\*_opt.tsv** files or guppy will trigger an error.
+ * We can specify a **final_summary_**.txt** for extracting kit and flowcell info in the params.config file. If not present we should specify those info or a custom model via extra parameters in one of the **\*_opt.tsv** files or guppy will trigger an error.
* This module can be run in AWS BATCH using the profile **awsbatch**
* demultiplexing of fast5 with deeplexicon is now faster thanks to multithreading and parallelization
- * A new test dataset allows to perform CI on mop_preprocess, mop_mod (excluding nanocompore) and mop_tail
* mop_tail (formerly known as nanoTail)
* now you can launch each analysis independently
* Fine tuning of parameter for each step in tools_opt.tsv
* mop_mod (formerly known as nanoMod)
- *
+ * coming SOON!
Version 1.1
=================
diff --git a/docs/ci.rst b/docs/ci.rst
new file mode 100644
index 0000000..29fa25b
--- /dev/null
+++ b/docs/ci.rst
@@ -0,0 +1,18 @@
+.. _home-page-about:
+
+*******************
+Continuous integration
+*******************
+
+.. autosummary::
+ :toctree: generated
+
+The following pipelines are continuously checked using GitHub actions:
+
+* mop_preprocess
+* mop_mod
+* mop_tail
+
+.. image:: https://github.com/biocorecrg/master_of_pores/actions/workflows/build.yml/badge.svg
+ :target: https://github.com/biocorecrg/master_of_pores
+ :alt: pipeline status
diff --git a/docs/conf.py b/docs/conf.py
index 5bb8151..0ff48c8 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -17,7 +17,7 @@
# -- Project information -----------------------------------------------------
-project = 'MoP2'
+project = 'MoP3'
# -- General configuration ---------------------------------------------------
@@ -62,7 +62,6 @@
# The URL which points to the root of the HTML documentation.
# It is used to indicate the location of document like canonical_url
-html_baseurl = 'https://biocorecrg.github.io/MoP2/docs/'
+html_baseurl = 'https://biocorecrg.github.io/master_of_pores'
# html_logo = html_favicon = '_images/sphinx-notes.png'
-
diff --git a/docs/index.rst b/docs/index.rst
index 7f4c58c..940d71f 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,27 +1,25 @@
.. _home-page-index:
*******************
-Welcome to the documentation of Master Of Pores 2
+Welcome to the documentation of Master Of Pores 3
*******************
.. autosummary::
:toctree: generated
-.. image:: ../img/master_red.jpg
- :width: 400
+.. image:: ../img/goku3.png
+ :width: 600
Master of Pores is a pipeline written in Nextflow DSL2 for the analysis of Nanopore data. It can handle reads from direct RNAseq, cDNAseq, DNAseq etc.
-The pipeline is composed by theee modules:
+The pipeline is composed of four modules:
- mop_preprocess: preprocessing
- - mop_mod: detecting chemical modifications. It reads the output directly from mop_preprocess.
+ - mop_mod: detecting chemical modifications. It reads the output directly from mop_preprocess
- mop_tail: estimating polyA tail size. It reads the output directly from mop_preprocess
- mop_consensus: it generates a consensus from the predictions from mop_mod. It reads the output directly from mop_mod
-
-
-.. MoP2 documentation master file, created by
+.. MoP3 documentation master file, created by
Luca Cozzuto.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
@@ -34,12 +32,12 @@ Contents:
about
install
mop_preprocess
- mop_tail
mop_mod
mop_consensus
+ mop_tail
reporting
awsbatch
benchmark
changelog
-
-
+ ci
+ troubleshooting
diff --git a/docs/install.rst b/docs/install.rst
index bac435e..c00b572 100644
--- a/docs/install.rst
+++ b/docs/install.rst
@@ -1,7 +1,7 @@
.. _home-page-install:
**************
-How to install
+Get Started
**************
.. autosummary::
@@ -9,37 +9,42 @@ How to install
Please install nextflow `Nextflow `_ and either `Singularity `_ or `Docker `_ before.
-For installing Nextflow you need a POSIX compatible system (Linux, OS X, etc). It requires Bash 3.2 (or later) and Java 8 (or later, up to 17). Windows system is supported through WSL. For the installation of Nextflow just run:
+For installing Nextflow you need a POSIX compatible system (Linux, OS X, etc). It requires Bash 3.2 (or later) and Java 11 (or later, up to 17). Windows system is supported through WSL. For the installation of Nextflow just run:
.. code-block:: console
curl -s https://get.nextflow.io | bash
-For installing the pipeline you need to download the repo:
+To install the pipeline you need to download the repo:
.. code-block:: console
- git clone --depth 1 --recurse-submodules https://github.com/biocorecrg/MOP2.git
+ git clone --depth 1 --recurse-submodules https://github.com/biocorecrg/master_of_pores.git
+Installing Guppy
+================
-You can use **INSTALL.sh** to download the **guppy 3.4.5** or you can download the version you prefer by adding an extra parameter.
+You can use **INSTALL.sh** and indicate the version of Guppy you want to download.
.. note::
-
- Please consider that the support of VBZ compression of fast5 started with version 3.4.X.
+
+ Please note that support for VBZ compression of fast5 files started with version 3.4.X.
.. code-block:: console
-
- cd MoP2; bash INSTALL.sh
-or
+ cd master_of_pores; bash INSTALL.sh 6.0.1
+
+or, to install the default version 3.4.5:
.. code-block:: console
- cd MoP2; bash INSTALL.sh 4.0.15
-
-
+ cd master_of_pores; bash INSTALL.sh
+
+Guppy custom models for RNA basecalling will be downloaded from our repository https://biocore.crg.eu/public/mop3_pub/models.tar and placed automatically in the right path inside the pipeline.
+
+You can install several versions of Guppy, but only one will be used during the pipeline execution. To switch among them, re-run INSTALL.sh with the version you prefer, as shown in the example below.
+
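+For example, to switch back to version 6.0.1 after having installed another one (the version number is just an example), simply re-run:
+
+.. code-block:: console
+
+ cd master_of_pores; bash INSTALL.sh 6.0.1
+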
Testing
============
@@ -47,23 +52,21 @@ Testing
cd mop_preprocess
- nextflow run mop_preprocess.nf -with-singularity -bg -profile local > log
+ nextflow run mop_preprocess.nf -params-file params.f5.yaml -with-singularity -bg -profile local > log
.. tip::
You can replace ```-with-singularity``` with ```-with-docker``` if you want to use the docker engine.
-
-Apple M1 processor
-====================
-
-Use the profile **m1mac** for running on machines with Apple M1 processor.
-
-
-.. code-block:: console
-
- cd mop_preprocess
-
- nextflow run mop_preprocess.nf -with-singularity -bg -profile m1mac > log
-
+Profiles
+============
+Some Nextflow configuration files are stored within the folder **conf** and can be selected using different profiles (see the example after this list). Currently, we have:
+
+- ci: for continuous integration testing (low resources)
+- local: for running on a laptop without GPU support
+- m1mac: for running the containers in emulation on Apple M1/M2/M3 processors
+- sge: for running on an HPC with Sun Grid Engine
+- cluster or crg: for running in the custom HPC environment at CRG
+- slurm: for running on an HPC with SLURM
+- awsbatch: for running on the Amazon AWS cloud infrastructure
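+
+For example, a minimal sketch of selecting the **slurm** profile (the params file name is just an example):
+
+.. code-block:: console
+
+ cd mop_preprocess
+ nextflow run mop_preprocess.nf -params-file params.f5.yaml -with-singularity -bg -profile slurm > log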
diff --git a/docs/mop_consensus.rst b/docs/mop_consensus.rst
index f7ba32e..8371cfc 100644
--- a/docs/mop_consensus.rst
+++ b/docs/mop_consensus.rst
@@ -7,11 +7,57 @@ MOP_CONSENSUS
.. autosummary::
:toctree: generated
-This module takes as input the output from MOP_MOD with all the four worklows. It outputs the consensus of the diferent predictions running the tool `Nanoconsensus `__ in parallel on each transcript for each comparison.
+This pipeline takes as input the output from MOP_MOD run with all four workflows. It outputs the consensus of the different predictions by running the tool `Nanoconsensus `__ in parallel on each transcript for each comparison.
-Here an example of a result:
-.. image:: ../img/nanocons.png
- :width: 800
+Input Parameters
+======================
+
+The input parameters are stored in yaml files like the one represented here:
+
+.. literalinclude:: ../mop_consensus/params.yaml
+ :language: yaml
+
+
+How to run the pipeline
+=============================
+
+Before launching the pipeline, the user should:
+
+1. Decide which containers to use - either docker or singularity **[-with-docker / -with-singularity]**.
+2. Fill in both **params.yaml** and **tools_opt.tsv** files.
+
+To launch the pipeline, please use the following command:
+
+.. code-block:: console
+
+ nextflow run mop_consensus.nf -params-file params.yaml -with-singularity > log.txt
+
+
+You can run the pipeline in the background by adding the nextflow parameter **-bg**:
+
+.. code-block:: console
+
+ nextflow run mop_consensus.nf -params-file params.yaml -with-singularity -bg > log.txt
+
+You can change the parameters either by editing the **params.yaml** file or by feeding the parameters via the command line:
+
+.. code-block:: console
+
+ nextflow run mop_consensus.nf -params-file params.yaml -with-singularity -bg --output test2 > log.txt
+
+
+You can specify a different working directory with temporary files:
+
+.. code-block:: console
+
+ nextflow run mop_consensus.nf -params-file params.yaml -with-singularity -bg -w /path/working_directory > log.txt
+
+
+Results
+====================
+
+Here is an example of a result:
+
+.. image:: ../img/nanocons.png
+ :width: 800
diff --git a/docs/mop_mod.rst b/docs/mop_mod.rst
index d37da49..597280c 100644
--- a/docs/mop_mod.rst
+++ b/docs/mop_mod.rst
@@ -7,49 +7,29 @@ MOP_MOD
.. autosummary::
:toctree: generated
-This module takes as input the output from MOP_PREPROCESS: basecalled fast5 reads, together with their respective fastq files and unspliced alignments to the transcriptome . It runs four different RNA detection algorithms (Epinano, Nanopolish, Tombo and Nanocompore) and it outputs the predictions generated by each one of them as individual tab-delimited files.
-
+This pipeline takes as input the output from MOP_PREPROCESS: basecalled fast5 reads, together with their respective fastq files and unspliced alignments to the transcriptome. It runs four different RNA modification detection algorithms (Epinano, Nanopolish, Tombo and Nanocompore) and outputs the predictions generated by each of them as individual tab-delimited files.
+
+.. image:: ../img/flow_mod.png
+ :width: 600
+ :alt: mop_mod graph
Input Parameters
======================
-.. list-table::
- :widths: 25 75
- :header-rows: 1
-
- * - Parameter name
- - Description
- * - **input_path**
- - Output folder generated by mop_preprocess
- * - **comparison**
- - TSV file with two fields, each one will indicate the ID of the sample that has to be compared 1 vs 1
- * - **reference**
- - reference sequences
- * - **output**
- - Output folder
- * - **pars_tools**
- - TSV file with optional extra command line parameters for the tool indicated in the first field.
- * - **epinano**
- - It (in)activate the corresponding flow. It can be YES or NO
- * - **nanocompore**
- - It (in)activate the corresponding flow. It can be YES or NO
- * - **tombo_lsc**
- - It (in)activate the corresponding flow. It can be YES or NO
- * - **tombo_msc**
- - It (in)activate the corresponding flow. It can be YES or NO
- * - **epinano_plots**
- - If YES will produce a plot for each sample for each transcript.
- * - **email**
- - Email for pipeline reporting.
-
+The input parameters are stored in yaml files like the one represented here:
+
+.. literalinclude:: ../mop_mod/params.yaml
+ :language: yaml
+
+
How to run the pipeline
=============================
-Before launching the pipeline,user should:
+Before launching the pipeline, the user should:
1. Decide which containers to use - either docker or singularity **[-with-docker / -with-singularity]**.
-2. Fill in both **params.config** and **tools_opt.tsv** files.
+2. Fill in both **params.yaml** and **tools_opt.tsv** files.
3. Fill in **comparison.tsv** file - please see example below:
.. code-block:: console
@@ -62,52 +42,35 @@ To launch the pipeline, please use the following command:
.. code-block:: console
- nextflow run mop_mod.nf -with-singularity > log.txt
+ nextflow run mop_mod.nf -params-file params.yaml -with-singularity > log.txt
You can run the pipeline in the background adding the nextflow parameter **-bg**:
.. code-block:: console
- nextflow run mop_mod.nf -with-singularity -bg > log.txt
+ nextflow run mop_mod.nf -params-file params.yaml -with-singularity -bg > log.txt
You can change the parameters either by changing **params.config** file or by feeding the parameters via command line:
.. code-block:: console
- nextflow run mop_mod.nf -with-singularity -bg --output test2 > log.txt
+ nextflow run mop_mod.nf -params-file params.yaml -with-singularity -bg --output test2 > log.txt
You can specify a different working directory with temporary files:
.. code-block:: console
- nextflow run mop_mod.nf -with-singularity -bg -w /path/working_directory > log.txt
-
-
-.. note::
-
- * In case of errors you can troubleshoot seeing the log file (log.txt) for more details. Furthermore, if more information is needed, you can also find the working directory of the process in the file. Then, access that directory indicated by the error output and check both the **.command.log** and **.command.err** files.
-
-
-.. tip::
-
- Once the error has been solved or if you change a specific parameter, you can resume the execution with the **Netxtlow** parameter **- resume** (only one dash!). If there was an error, the pipeline will resume from the process that had the error and proceed with the rest. If a parameter was changed, only processes affected by this parameter will be re-run.
-
-
-.. code-block:: console
-
- nextflow run mop_mod.nf -with-singularity -bg -resume > log_resumed.txt
+ nextflow run mop_mod.nf -params-file params.yaml -with-singularity -bg -w /path/working_directory > log.txt
-To check whether the pipeline has been resumed properly, please check the log file. If previous correctly executed process are found as *Cached*, resume worked!
-
Results
====================
Several folders are created by the pipeline within the output directory specified by the **output** parameter:
-1. **Epinano** results are stored in **epinano_flow** directory. It contains two files per sample: one containing data at position level and the other, at 5-mer level. Different features frequencies as well as quality data are included in the results. See example below:
+1. **Epinano** results are stored in **epinano_flow** directory. It contains two files per sample: one containing data at position level and the other, at 5-mer level. Different features frequencies as well as quality data are included in the results. See example below:
.. code-block:: console
@@ -117,14 +80,13 @@ Several folders are created by the pipeline within the output directory specifie
gene_A,2517,C,45529.0,6.92130,5.00000,5.04250,0.06165301236574491,0.1505633771881658,0.13540820136616222
gene_A,2518,A,45545.0,6.49821,5.00000,5.47485,0.10802503018992206,0.10855198155670216,0.2082775277198375
gene_A,2519,T,45557.0,6.51247,5.00000,4.81853,0.09386043857145993,0.14792457800118533,0.2033057488421099
-
+
Here an example of a plot from Epinano:
.. image:: ../img/epinano.png
- :width: 600
-
+ :width: 350
-2. **Tombo** results are stored in **tombo_flow** directory. It contains one file per comparison. It reports the p-value per position, the sum of p-values per 5-mer and coverage in both WT and KO. See example below:
+2. **Tombo** results are stored in **tombo_flow** directory. It contains one file per comparison. It reports the p-value per position, the sum of p-values per 5-mer and coverage in both WT and KO. See example below:
.. code-block:: console
@@ -135,9 +97,9 @@ Here an example of a plot from Epinano:
"gene_A_6" "gene_A" "6" "0.0000" "93" "88" 0.0014
"gene_A_7" "gene_A" "7" "0.0000" "95" "89" 0.0027
"gene_A_8" "gene_A" "8" "0.0014" "95" "89" 0.004
-
-3. **Nanopolish** results are stored in **nanopolish-compore_flow** directory. It contains two files per sample: raw eventalign output (gzipped) and another with the median raw current per position and transcript (**sample_processed_perpos_median.tsv.gz**). See example below:
+
+3. **Nanopolish** results are stored in **nanopolish-compore_flow** directory. It contains two files per sample: raw eventalign output (gzipped) and another with the median raw current per position and transcript (**sample_processed_perpos_median.tsv.gz**). See example below:
.. code-block:: console
@@ -150,3 +112,16 @@ Here an example of a plot from Epinano:
gene_A 5 GAAGA 1 104.25 471
4. **Nanocompore** results are stored in **nanopolish-compore_flow** directory. It contains one file per comparison (**wt_1_vs_ko_1_nanocompore_results.tsv**). Default output from Nanocompore (see Nanocompore's repository for a more detailed explanation).
+
+Encoding of modification information from m6A-aware basecalled data using modPhred
+=====================================================================================
+
+Once the data has been basecalled with our m6A modification-aware basecalling model, the modification information has to be encoded for later downstream analysis. This step is performed by **modPhred**, another tool included in the **mop_mod** pipeline.
+
+To run this tool, specify ``modphred: "YES"`` in the ``params.yaml`` file and run the commands below:
+
+.. code-block:: console
+
+ cd mop_mod
+ nextflow run mop_mod.nf -params-file params.yaml -with-singularity -bg > yourlog.txt
+
diff --git a/docs/mop_preprocess.rst b/docs/mop_preprocess.rst
index 446b301..2c6f51f 100644
--- a/docs/mop_preprocess.rst
+++ b/docs/mop_preprocess.rst
@@ -7,144 +7,194 @@ MOP_PREPROCESS
.. autosummary::
:toctree: generated
-This module takes as input the raw fast5 reads - single or multi - and produces a number of outputs (basecalled fast5, sequences in fastq format, aligned reads in BAM format etc). The pre-processing module is able to perform base-calling, demultiplexing (optional), filtering, quality control, mapping to a genome / transcriptome reference, feature counting, discovery of novel transcripts and it generates a final report of the performance and results of each of the steps performed. It automatically detects the kind of input fast5 file (single or multi sequence).
+This pipeline takes as input the raw fast5 reads - single or multi - and produces several outputs (basecalled fast5, sequences in fastq format, aligned reads in BAM format, etc.). The pre-processing pipeline can perform base-calling, demultiplexing (optional), filtering, quality control, mapping to a reference (either a genome or a transcriptome), feature counting, discovery of novel transcripts, and it generates a final report with the performance and results of each of the steps performed.
+
+It automatically detects the kind of input fast5 file (single or multi-sequence). It also supports the new pod5 format, but in that case it won't output the basecalled fast5 files needed by the other pipelines. The basecalling can be performed with guppy or dorado, and the demultiplexing with either guppy or deeplexicon. Basecalled fastq and fast5 files can be demultiplexed as well. You can restrict the barcodes to keep by providing a file with the barcode list via the **barcodes** parameter.
+
+
+.. image:: ../img/flow_preproc.png
+ :width: 600
+ :alt: mop_preprocess graph
-.. note::
- For using the Apple's M1 processor you should use the custom profile **m1mac** and **docker**.
-
Input Parameters
======================
-.. list-table::
- :widths: 25 75
- :header-rows: 1
-
- * - Parameter name
- - Description
- * - **conffile**
- - Configuration file produced by the Nanopore instrument. It can be omitted but in that case the user must specify either the guppy parameters "--kit" and "--flowcell" or the custom model via **[NAME_tool_opt.tsv]** file
- * - **fast5 files**
- - Path to fast5 input files (single or multi-fast5 files). They should be inside folders that will be used as sample name. **[/Path/\*\*/*.fast5]**. If empty it will search for fastq files and skip basecalling
- * - **fastq files**
- - Path to fastq input files. They should be inside folders that will be used as sample name. Must be empty if you want to perform basecalling **[/Path/\*\*/*.fastq]**.
- * - **reference**
- - File in fasta format. **[Reference_file.fa]**
- * - **ref_type**
- - Specify if the reference is a genome or a transcriptome. **[genome / transcriptome]**
- * - **annotation**
- - Annotation file in GTF format. It is optional and needed only in case of mapping to the genome and when interested in gene counts. Can be gzipped. **[Annotation_file.gtf]**.
- * - **pars_tools**
- - Parameters of tools. It is ha tab separated file with custom parameters for each tool **[NAME_tool_opt.tsv]**
- * - **output**
- - Output folder name. **[/Path/to_output_folder]**
- * - **qualityqc**
- - Quality threshold for QC. **[5]**
- * - **granularity**
- - indicates the number of input fast5 files analyzed in a single process.
- * - **basecalling**
- - Tool for basecalling **[guppy / NO ]**
- * - **GPU**
- - Allow the pipeline to run with GPU. **[OFF / ON]**
- * - **demultiplexing**
- - Tool for demultiplexing algorithm. **[deeplexicon / guppy / NO ]**
- * - **demulti_fast5**
- - If performing demultiplexing generate demultiplexed multifast5 files too. **[YES / NO]**
- * - **filtering**
- - Tool for filtering fastq files. **[nanofilt / NO]**
- * - **mapping**
- - Tool for mapping reads. **[minimap2 / graphmap / graphmap2 / bwa / NO ]**
- * - **counting**
- - Tool for gene or transcripts counts **[htseq / nanocount / NO]**
- * - **discovery**
- - Tool for generating novel transcripts. **[bambu / NO]**
- * - **cram_conv**
- - Converting bam in cram. **[YES / NO]**
- * - **subsampling_cram**
- - Subsampling BAM before CRAM conversion. **[YES / NO]**
- * - **saveSpace**
- - Remove intermediate files (**beta**) **[YES / NO]**
- * - **email**
- - Users email for receving the final report when the pipeline is finished. **[user_email]**
-
-
-
-You can change them by editing the **params.config** file or using the command line - please, see next section.
+The input parameters are stored in yaml files like the one represented here:
+
+.. literalinclude:: ../mop_preprocess/params.f5.demrna.yaml
+ :language: yaml
+
+You can change them by editing this file or using the command line as explained in the next section.
+
+
+.. tip::
+
+ In the case of pod5 input files, you can use them as if they were fast5 with dorado or guppy >= 6.5.x. The only limitation is that you cannot obtain basecalled fast5, so you cannot use the other pipelines that need fast5 as input files.
+
+
How to run the pipeline
=============================
-Before launching the pipeline, user should decide which containers to use - either docker or singularity **[-with-docker / -with-singularity]**.
+Before launching the pipeline, the user can decide which containers to use: either docker or singularity **[-with-docker / -with-singularity]**.
-Then, to launch the pipeline, please use the following command:
+Then, to launch the pipeline, please use the following command by specifying the path of the yaml parameter file:
.. code-block:: console
- nextflow run mop_preprocess.nf -with-singularity > log.txt
+ nextflow run mop_preprocess.nf -with-singularity -params-file params.yaml > log.txt
-You can run the pipeline in the background adding the nextflow parameter **-bg**:
+You can run the pipeline in the background by adding the nextflow parameter **-bg**:
.. code-block:: console
- nextflow run mop_preprocess.nf -with-singularity -bg > log.txt
+ nextflow run mop_preprocess.nf -params-file params.yaml -with-singularity -bg > log.txt
-You can change the parameters either by changing **params.config** file or by feeding the parameters via command line:
+You can change the parameters either by changing the yaml config file or by feeding the parameters via command line:
.. code-block:: console
- nextflow run mop_preprocess.nf -with-singularity -bg --output test2 > log.txt
+ nextflow run mop_preprocess.nf -with-singularity -params-file params.yaml -bg --output test2 > log.txt
You can specify a different working directory with temporary files:
.. code-block:: console
- nextflow run mop_preprocess.nf -with-singularity -bg -w /path/working_directory > log.txt
+ nextflow run mop_preprocess.nf -with-singularity -params-file params.yaml -bg -w /path/working_directory > log.txt
+
+You can use different profiles for running the pipeline in different environments. We have one set up for HPC using the SGE scheduler:
+
+.. code-block:: console
+
+ nextflow run mop_preprocess.nf -with-singularity -bg -params-file params.yaml -w /path/working_directory -profile cluster > log.txt
+
+One for HPC using the SLURM scheduler:
+
+.. code-block:: console
+
+ nextflow run mop_preprocess.nf -with-singularity -bg -params-file params.yaml -w /path/working_directory -profile slurm > log.txt
-You can use different profiles specifying the different environments. We have one set up for HPC using the SGE scheduler:
+One for running in emulation on Apple M1 processors:
.. code-block:: console
- nextflow run mop_preprocess.nf -with-singularity -bg -w /path/working_directory -profile cluster > log.txt
+ nextflow run mop_preprocess.nf -with-singularity -bg -params-file params.yaml -w /path/working_directory -profile m1mac > log.txt
+
or you can run the pipeline locally:
.. code-block:: console
- nextflow run mop_preprocess.nf -with-singularity -bg -w /path/working_directory -profile local > log.txt
+ nextflow run mop_preprocess.nf -with-singularity -bg -params-file params.yaml -w /path/working_directory -profile local > log.txt
.. note::
-
- * In case of errors you can troubleshoot seeing the log file (log.txt) for more details. Furthermore, if more information is needed, you can also find the working directory of the process in the file. Then, access that directory indicated by the error output and check both the `.command.log` and `.command.err` files.
+ * In case of errors you can troubleshoot by checking the log file (log.txt) for more details. Furthermore, if more information is needed, you can also go to the intermediate directory indicated in the log and check both the `.command.log` and `.command.err` files.
.. tip::
- Once the error has been solved or if you change a specific parameter, you can resume the execution with the **Netxtlow** parameter **- resume** (only one dash!). If there was an error, the pipeline will resume from the process that had the error and proceed with the rest. If a parameter was changed, only processes affected by this parameter will be re-run.
+ Once the error has been solved or if you change a specific parameter, you can resume the execution with the **Nextflow** parameter **-resume** (only one dash!). If there was an error, the pipeline will resume from the process that had the error and proceed with the rest. If you change a parameter, only the processes affected by this parameter will be re-run.
.. code-block:: console
- nextflow run mop_preprocess.nf -with-singularity -bg -resume > log_resumed.txt
+ nextflow run mop_preprocess.nf -with-singularity -params-file params.yaml -bg -resume > log_resumed.txt
- To check whether the pipeline has been resumed properly, please check the log file. If previous correctly executed process are found as *Cached*, resume worked!
+ To check whether the pipeline has been resumed properly, please check the log file. If the previously executed processes are reported as *Cached*, the resume worked!
.. code-block:: console
...
[warm up] executor > crg
- [e8/2e64bd] Cached process > baseCalling (RNA081120181_1)
- [b2/21f680] Cached process > QC (RNA081120181_1)
- [c8/3f5d17] Cached process > mapping (RNA081120181_1)
+ [9d/82eeaa] Cached process > checkRef (Checking yeast_rRNA_ref.fa.gz)
+ [33/b8d053] Submitted process > BASECALL:GUPPY_VERSION:getVersion
+ [e5/e5c990] Submitted process > BASECALL:GUPPY65_BASECALL:baseCallNew (mod---2)
+ [b5/0997da] Submitted process > BASECALL:GUPPY65_BASECALL:baseCallNew (wt---1)
+ [fb/6353d6] Submitted process > SEQFILTER:NANOQ_FILTER:filter (mod---2)
...
+.. note::
+ To resume the execution, temporary files generated previously by the pipeline must be kept. Otherwise, the pipeline will re-start from the beginning.
+
+tool_opts
+====================
+
+The command line options for each tool used in the pipeline are stored in dedicated tsv files within the *tool_opts* folder. Here is an example:
+
+.. literalinclude:: ../mop_preprocess/tool_opts/drna_tool_m6A_splice_opt.tsv
+
+The first column indicates the processing step, such as **basecalling** or **demultiplexing**. Some tools, such as Guppy, can be used for more than one processing step. Several pre-compiled tool_opts files are provided within the **tool_opts** folder.
+
+.. note::
+ Readucks is run after guppy demultiplexing. It refines the demultiplexing, generating different fastq files.
+
+
+Model libraries for specific tools
+==================================
+
+The following folders are available for the respective tools. Some models are already pre-installed:
+
+* deeplexicon_models
+ * resnet20-final.h5
+ * pAmps-final-actrun_newdata_nanopore_UResNet20v2_model.030.h5
+ * pAmps-rep2-4-train1_newdata_nanopore_UResNet20v2_model.039.h5
+* dorado_models
+ * rna002_70bps_hac@v3
+
+.. note::
+ You need to download the models you want to use in case they are not already available. For instance, if you need another model for dorado, run:
+
+.. code-block:: console
+
+ dorado download --model MODELNAME
+
+
+You also need to add the dedicated parameter for the specific tool within the tool_opts file, as in:
+
+.. code-block:: console
+
+ basecalling dorado "rna002_70bps_hac@v3"
+ demultiplexing deeplexicon "-f multi -m resnet20-final.h5"
+
+.. note::
+ You need to copy the model into the corresponding folder and indicate just the model name. You don't need the absolute path. See the sketch below.
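+
+A minimal sketch (the model file name and the relative folder location are assumptions; adjust them to your installation):
+
+.. code-block:: console
+
+ cp /path/to/my_model.h5 deeplexicon_models/
+ # then reference it in the tool_opts file as:
+ # demultiplexing deeplexicon "-f multi -m my_model.h5"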
+
+
+
+Barcodes
+===================
+
+You can select the barcodes you are interested in by writing them down in a text file as in this example. The format is *samplename---barcodeID*
+
+.. literalinclude:: ../mop_preprocess/keep_barcodes.txt
+
+The sample id is given by either the folder containing the fast5 files or the basename of the fastq files. So, if your files are in a folder named **myfiles**, it will be:
+
+.. code-block:: console
+
+ myfiles---bc_1
+ myfiles---bc_2
+ myfiles---bc_3
.. note::
- To resume the execution, temporary files generated previously by the pipeline must be kept. Otherwise, pipeline will re-start from the beginning.
+ The naming convention of the different barcodes is decided by each tool, so guppy will produce **barcode01**, **barcode02**, etc.
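+
+For example, with guppy demultiplexing and fast5 files stored in a folder named **myfiles**, the entries to keep would be:
+
+.. code-block:: console
+
+ myfiles---barcode01
+ myfiles---barcode02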
+
+
+Basecalling with the m6A-aware model
+=========================================
+
+For m6A basecalling, in your ``params.f5.yaml`` file you should specify ``basecalling: "guppy"`` and ``pars_tools: "tool_opts/drna_tool_m6A_splice_opt.tsv"`` so that guppy will use the m6A model. In your output folder you will have the ``fast5_files`` folder containing the m6A basecalled fast5 files for downstream analysis. Then run:
+
+.. code-block:: console
+
+ cd mop_preprocess
+ nextflow run mop_preprocess.nf -params-file params.f5.yaml -with-singularity -bg > yourlog.txt
+
Results
@@ -153,30 +203,16 @@ Results
Several folders are created by the pipeline within the output directory specified by the **output** parameter:
-* **fast5_files**: Contains the basecalled multifast5 files. Each batch contains 4000 sequences.
+* **fast5_files**: Contains the basecalled multifast5 files. Each batch contains 4000 sequences.
* **fastq_files**: Contains one or, in case of demultiplexing, more fastq files.
* **QC_files**: Contains each single QC produced by the pipeline.
* **alignment**: Contains the bam file(s).
* **cram_files**: Contains the cram file(s).
* **counts**: Contains read counts per gene / transcript if counting was performed.
* **assigned**: Contains assignment of each read to a given gene / transcript if counting was performed.
-* **report**: Contains the final multiqc report.
+* **report**: Contains the final multiqc report.
* **assembly**: It contains assembled transcripts.
-
-Here an example of a final report:
-
-.. image:: ../img/multiqc.png
- :width: 800
-
-
-
.. note::
- Newer versions of guppy automatically separate the reads depending on the quality. You need to disable this via custom options for being used in MoP3. This is also to avoid losing interesting signals since the modified bases have often low qualities. GUPPY 6 seems to require singularity 3.7.0 or higher.
-
-.. tip::
- You can pass via parameter a custom NAME_tool_opt.tsv file with custom guppy options to disable the qscore filtering. Some custom files are already available in this package, like **drna_tool_unsplice_guppy6_opt.tsv**
-
-
-
+ MOP3 will automatically detect the version of guppy and modify the parameters accordingly. You don't need to add any extra parameters as you did in MOP2.
diff --git a/docs/mop_tail.rst b/docs/mop_tail.rst
index 780a8e9..fa795d6 100644
--- a/docs/mop_tail.rst
+++ b/docs/mop_tail.rst
@@ -6,35 +6,23 @@ MOP_TAIL
.. autosummary::
:toctree: generated
-
-This module takes as input the output from MOP_PREPROCESS: basecalled fast5 reads, together with their respective fastq files, alignment and assignment read ID to gene/transcript. It outputs the estimation of poly(A) tail length at read level provided by **Tailfindr**, **Nanopolish** or both.
+
+This pipeline takes as input the output from MOP_PREPROCESS: basecalled fast5 reads, together with their respective fastq files, alignments and the assignment of read IDs to genes/transcripts. It outputs the estimation of poly(A) tail length at read level provided by **Tailfindr**, **Nanopolish** or both. TailfindR can be run in three modes: standard, Nano3P-seq protocol with R9 chemistry, and Nano3P-seq protocol with R10 chemistry.
+
+.. image:: ../img/flow_tail.png
+ :width: 600
+ :alt: mop_tail graph
+
Input Parameters
======================
-.. list-table::
- :widths: 25 75
- :header-rows: 1
-
- * - Parameter name
- - Description
- * - **input_path**
- - Output folder generated by mop_preprocess
- * - **reference**
- - reference sequences
- * - **output**
- - Output folder
- * - **pars_tools**
- - TSV file with optional extra command line parameters for the tool indicated in the first field.
- * - **tailfindr**
- - It (in)activate the corresponding flow. It can be YES or NO
- * - **nanopolish**
- - It (in)activate the corresponding flow. It can be YES or NO
- * - **email**
- - Email for pipeline reporting.
-
-
-
+The input parameters are stored in yaml files like the one represented here:
+
+.. literalinclude:: ../mop_tail/params.yaml
+ :language: yaml
+
+
How to run the pipeline
=============================
@@ -47,45 +35,29 @@ To launch the pipeline, please use the following command:
.. code-block:: console
- nextflow run mop_tail.nf -with-singularity > log.txt
+ nextflow run mop_tail.nf -params-file params.yaml -with-singularity > log.txt
You can run the pipeline in the background adding the nextflow parameter **-bg**:
.. code-block:: console
- nextflow run mop_tail.nf -with-singularity -bg > log.txt
+ nextflow run mop_tail.nf -params-file params.yaml -with-singularity -bg > log.txt
You can change the parameters either by changing **params.config** file or by feeding the parameters via command line:
.. code-block:: console
- nextflow run mop_tail.nf -with-singularity -bg --output test2 > log.txt
+ nextflow run mop_tail.nf -params-file params.yaml -with-singularity -bg --output test2 > log.txt
You can specify a different working directory with temporary files:
.. code-block:: console
- nextflow run mop_tail.nf -with-singularity -bg -w /path/working_directory > log.txt
-
-
-.. note::
-
- * In case of errors you can troubleshoot seeing the log file (log.txt) for more details. Furthermore, if more information is needed, you can also find the working directory of the process in the file. Then, access that directory indicated by the error output and check both the **.command.log** and **.command.err** files.
-
+ nextflow run mop_tail.nf -params-file params.yaml -with-singularity -bg -w /path/working_directory > log.txt
-.. tip::
- Once the error has been solved or if you change a specific parameter, you can resume the execution with the **Netxtlow** parameter **- resume** (only one dash!). If there was an error, the pipeline will resume from the process that had the error and proceed with the rest. If a parameter was changed, only processes affected by this parameter will be re-run.
-
-
-.. code-block:: console
-
- nextflow run mop_tail.nf -with-singularity -bg -resume > log_resumed.txt
-
-To check whether the pipeline has been resumed properly, please check the log file. If previous correctly executed process are found as *Cached*, resume worked!
-
Results
====================
@@ -101,13 +73,9 @@ Several folders are created by the pipeline within the output directory specifie
"013a5dde-9c52-4de1-83eb-db70fb2cd130" 52.16 49.39 "YKR072C"
"01119f62-ca68-458d-aa1f-cf8c8c04cd3b" 231.64 274.28 "YDR133C"
"0154ce9c-fe6b-4ebc-bbb1-517fdc524207" 24.05 24.24 "YFL044C"
- "020cde28-970d-4710-90a5-977e4b4bbc46" 41.27 56.79 "YGL238W"
-
-If both softwares are run, an additional plot which shows the correlation of their results is generated.
-
-
-Here an example of a result:
+ "020cde28-970d-4710-90a5-977e4b4bbc46" 41.27 56.79 "YGL238W"
-.. image:: ../img/moptail.png
- :width: 800
+If both programs are run, an additional plot that shows the correlation of their results is generated.
+
+.. image:: ../img/mod_corr.png
+ :width: 400
diff --git a/docs/reporting.rst b/docs/reporting.rst
index 93810d6..5d5efc9 100644
--- a/docs/reporting.rst
+++ b/docs/reporting.rst
@@ -22,28 +22,28 @@ For enabling the generation of a report just run each pipeline with the paramete
Once the pipeline is finished you will have a comprehensive report with information about CPU, disk and memory usage for each execution and as a distribution for each process. Here an example:
.. image:: ../img/res_report.png
- :width: 800
-
+ :width: 800
+
Live reporting
====================
-For having a live reporting your pipeline should run in an environment that has access to internet. Then you have to log-in to the `Tower.nf `_ website that is developed and mantained by the good people at `Seqera Labs `__.
+For live reporting, your pipeline should run in an environment that has access to the internet. Then you have to log in to the `Tower.nf `_ website that is developed and maintained by the good people at `Seqera Labs `__.
.. image:: ../img/tower.png
- :width: 800
-
+ :width: 800
+
We suggest you to use either the GitHub or the Google authentication:
.. image:: ../img/tower_eli1.png
- :width: 800
+ :width: 800
You can generate your token at `https://tower.nf/tokens `__ and
.. image:: ../img/tower2.png
- :width: 800
+ :width: 800
then you can export as an environmental variable.
@@ -52,8 +52,8 @@ then you can export as an environmental variable.
export TOWER_ACCESS_TOKEN=*******YOUR***TOKEN*****HERE*******
.. tip::
- We recommend you to add this line to either your .bashrc or .bash_profile file.
-
+ We recommend you to add this line to either your .bashrc or .bash_profile file.
+
Finally we can then launch the pipeline adding the parameter `-with-tower`.
.. code-block:: console
@@ -64,9 +64,3 @@ Going back to the website you can see now your pipeline running and have a nice
.. image:: ../img/tower.gif
:width: 800
-
-
-
-
-
-
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 483a4e9..6c5d5d4 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1 +1 @@
-sphinx_rtd_theme
+sphinx-rtd-theme
diff --git a/docs/troubleshooting.rst b/docs/troubleshooting.rst
new file mode 100644
index 0000000..1c76785
--- /dev/null
+++ b/docs/troubleshooting.rst
@@ -0,0 +1,39 @@
+.. _home-page-troubleshooting:
+
+*****************
+Troubleshooting
+*****************
+
+.. autosummary::
+ :toctree: generated
+
+Demultiplexing with Guppy
+================================
+Error:
+
+.. code-block:: console
+
+ Init time: 2515 ms
+
+ 0% 10 20 30 40 50 60 70 80 90 100%
+ |----|----|----|----|----|----|----|----|----|----|
+ ***************************************************
+ Caller time: 32995 ms, Samples called: 159196780, samples/s: 4.82488e+06
+ Finishing up any open output files.
+ Basecalling completed successfully.
+
+ Command error:
+ rm: cannot remove '*_out/*/*.fastq': No such file or directory
+
+Solution:
+Check your barcode kit! You must indicate **--barcode_kits** in the tool_opts file in the row **demultiplexing guppy**. Example:
+
+.. code-block:: console
+
+ demultiplexing guppy "--flowcell FLO-MIN114 --kit SQK-LSK114 --barcode_kits SQK-NBD114-24"
+
+
+Memory failures in mop_preprocess and mop_mod
+================================================================
+
+Sometimes FastQC, Epinano, or other tools can run out of memory because the input data contain low-quality reads. It is always a good idea to double-check the input and, if needed, filter the reads by size / quality using either **nanoq** or **nanofilt**, for instance as sketched below.
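+
+A minimal sketch with **NanoFilt** run outside the pipeline (the thresholds and file names are just examples):
+
+.. code-block:: console
+
+ gunzip -c sample.fastq.gz | NanoFilt -q 7 -l 100 | gzip > sample.filtered.fastq.gz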
diff --git a/img/flow_mod.png b/img/flow_mod.png
new file mode 100644
index 0000000..c4fd291
Binary files /dev/null and b/img/flow_mod.png differ
diff --git a/img/flow_preproc.png b/img/flow_preproc.png
new file mode 100644
index 0000000..5825e61
Binary files /dev/null and b/img/flow_preproc.png differ
diff --git a/img/flow_tail.png b/img/flow_tail.png
new file mode 100644
index 0000000..c134066
Binary files /dev/null and b/img/flow_tail.png differ
diff --git a/img/goku3.png b/img/goku3.png
new file mode 100644
index 0000000..d227000
Binary files /dev/null and b/img/goku3.png differ
diff --git a/img/logo_small.png b/img/logo_small.png
index 5a003f5..b6ef44e 100644
Binary files a/img/logo_small.png and b/img/logo_small.png differ
diff --git a/img/mod_corr.png b/img/mod_corr.png
new file mode 100644
index 0000000..1c9bda9
Binary files /dev/null and b/img/mod_corr.png differ
diff --git a/img/moptail.png b/img/moptail.png
deleted file mode 100644
index f56fe88..0000000
Binary files a/img/moptail.png and /dev/null differ
diff --git a/img/multiqc.png b/img/multiqc.png
deleted file mode 100644
index f65e91d..0000000
Binary files a/img/multiqc.png and /dev/null differ
diff --git a/img/wt_corr.png b/img/wt_corr.png
new file mode 100644
index 0000000..e7a8aa3
Binary files /dev/null and b/img/wt_corr.png differ
diff --git a/launch_nf.sh b/launch_nf.sh
new file mode 100644
index 0000000..ef4f447
--- /dev/null
+++ b/launch_nf.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+#SBATCH --no-requeue
+#SBATCH --mem 8192M
+#SBATCH -p genoa64
+#SBATCH --qos='pipelines'
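+# Usage sketch (an assumption): submit any long-running command through SLURM, e.g.
+#   sbatch launch_nf.sh nextflow run mop_preprocess.nf -params-file params.yaml -with-singularity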
+set -x
+pid=""
+ kill_func() {
+ echo TRAP;
+ kill $pid ;
+ wait $pid
+}
+trap kill_func INT
+trap kill_func EXIT
+
+"$@" & pid=$! ; echo "waiting for ${pid}" ; wait $pid
diff --git a/local_modules.nf b/local_modules.nf
index 783464e..b5a1013 100644
--- a/local_modules.nf
+++ b/local_modules.nf
@@ -4,97 +4,36 @@ params.LABEL = ""
params.OUTPUT = ""
params.saveSpace = "NO"
-// MODULES
+// MODULES
// MOP_PREPROCESS
-process extracting_demultiplexed_fastq {
- label (params.LABEL)
- tag "${ idfile }"
-
- input:
- tuple val(idfile), path(demux), path(fastq)
-
-
- output:
- tuple val(idfile), path ("*.fastq.gz")
-
- script:
- """
- extract_sequence_from_fastq.py ${demux} ${fastq}
- for i in *.fastq; do gzip \$i; done
- """
-}
-
-process preparing_demultiplexing_fast5_deeplexicon {
-
- label (params.LABEL)
- tag "${ idfile }"
-
- input:
- tuple val(idfile), path("demux_*")
-
- output:
- tuple val(idfile), path("*.list")
-
- script:
- """
- cat demux_* | grep -v ReadID >> dem.files
- awk '{print \$2 > \$3".list" }' dem.files
- """
-}
-
-process extracting_demultiplexed_fast5_deeplexicon {
- label (params.LABEL)
- container 'lpryszcz/deeplexicon:1.2.0'
- tag "${ idfile } on ${ idlist }"
- if (params.saveSpace == "YES") publishDir(params.OUTPUTF5, mode:'move', pattern: '*-*')
- else publishDir(params.OUTPUTF5, mode:'copy', pattern: '*-*')
-
- publishDir(params.OUTPUTST, mode:'copy', pattern: 'summaries/*_final_summary.stats', saveAs: { file -> "${file.split('\\/')[-1]}" })
-
-
- input:
- tuple val(idfile), path(idlist), file("*")
- output:
- path("${idfile}-*"), type: "dir", emit: dem_fast5
- path("summaries/*_final_summary.stats"), emit: dem_summaries
-
- script:
- """
- mkdir ${idfile}---`basename ${idlist} .list`; fast5_subset --input ./ --save_path ${idfile}---`basename ${idlist} .list`/ --read_id_list ${idlist} --batch_size 4000 -c vbz -t ${task.cpus}
- mkdir summaries
- for i in */filename_mapping.txt; do awk 'BEGIN{print "filename\tread_id"}{print \$2"\t"\$1}' \$i > `echo \$i | awk -F"/" '{print "summaries/"\$1"_final_summary.stats"}'`; done
- rm */filename_mapping.txt;
- """
-}
+process extract_demultiplexed_fast5_readucks {
-process extracting_demultiplexed_fast5_guppy {
tag "${ idfile }"
label (params.LABEL)
- if (params.saveSpace == "YES") publishDir(params.OUTPUT, mode:'move')
- else publishDir(params.OUTPUT, mode:'copy')
+ if (params.saveSpace == "YES") publishDir(params.OUTPUT, mode:'move')
+ else publishDir(params.OUTPUT, mode:'copy')
container "quay.io/biocontainers/ont-fast5-api:4.0.0--pyhdfd78af_0"
-
+
input:
tuple val(idfile), path("summaries_*"), file("*")
-
+
output:
path("${idfile}-*")
script:
"""
if [ -f "summaries_" ]; then
- ln -s summaries_ final_summary.stats
- else
- head -n 1 summaries_1 > final_summary.stats
- for i in summaries_*; do grep -v "filename" \$i | awk -F"\t" -v id=${idfile} '{OFS="\t"; \$19 = id"---"\$19; print \$0}' >> final_summary.stats; done
+ ln -s summaries_ summaries_1
fi
+ head -n 1 summaries_1 > final_summary.stats
+ for i in summaries_*; do grep -v "filename" \$i | awk -F"\t" -v id=${idfile} '{OFS="\t"; \$19 = id"---"\$21; print \$0}' >> final_summary.stats; done
- demux_fast5 -c vbz -t ${task.cpus} --input ./ --save_path ./ --summary_file final_summary.stats
- rm -fr barcode_arrangement
+ demux_fast5 -c vbz -t ${task.cpus} --input ./ --save_path ./ --summary_file final_summary.stats
+ rm -fr barcode_arrangement
"""
}
@@ -105,14 +44,14 @@ process extracting_demultiplexed_fast5_guppy {
process cleanFile {
tag "${id}"
label (params.LABEL)
-
+
input:
tuple val(id), path(file_to_remove)
val(file_to_wait1)
val(extension)
when: params.saveSpace == "YES"
-
+
script:
"""
for i in *${extension}; do rm \$(readlink -f \$i); done
@@ -126,14 +65,14 @@ process cleanFile {
process concatenateFastQFiles {
tag "${idfile}"
label (params.LABEL)
- publishDir(params.OUTPUT, mode:'copy')
+ publishDir(params.OUTPUT, mode:'copy')
input:
tuple val(idfile), path(demultifq)
output:
- tuple val(idfile), path("${idfile}.fq.gz")
-
+ tuple val(idfile), path("${idfile}.fq.gz")
+
script:
"""
@@ -150,21 +89,21 @@ process MinIONQC {
label (params.LABEL)
container 'biocorecrg/mopprepr:0.7'
errorStrategy 'ignore'
- if (params.OUTPUT != "") publishDir(params.OUTPUT, mode:'copy', pattern: '*.stats')
+ if (params.OUTPUT != "") publishDir(params.OUTPUT, mode:'copy', pattern: '*.stats')
+
-
input:
- tuple val(folder_name), path("summaries_*")
+ tuple val(folder_name), path("summaries_*")
output:
- tuple val(folder_name), path ("${folder_name}_QC"), emit: QC_folder
- tuple val(folder_name), path ("*_summary.stats"), emit: stats
+ tuple val(folder_name), path ("${folder_name}_QC"), emit: QC_folder
+ tuple val(folder_name), path ("*_summary.stats"), emit: stats
script:
"""
if [ -f "summaries_" ]; then
- ln -s summaries_ ${folder_name}_final_summary.stats
- else
+ cp summaries_ ${folder_name}_final_summary.stats
+ else
head -n 1 summaries_1 > ${folder_name}_final_summary.stats
for i in summaries_*; do grep -v "filename" \$i >> ${folder_name}_final_summary.stats; done
fi
@@ -173,18 +112,18 @@ process MinIONQC {
}
/*
-* Perform bam2stats QC
+* Perform bam2stats QC
*/
process bam2stats {
label (params.LABEL)
- tag "${id}"
-
+ tag "${id}"
+
input:
tuple val(id), path(bamfile)
output:
tuple val(id), path ("${id}.stat")
-
+
script:
"""
bam2stats.py ${bamfile} > ${id}.stat
@@ -197,7 +136,7 @@ process bam2stats {
process AssignReads {
tag "${id}"
- publishDir(params.OUTPUT, mode:'copy')
+ publishDir(params.OUTPUT, mode:'copy')
label (params.LABEL)
input:
@@ -206,7 +145,7 @@ process AssignReads {
output:
tuple val(id), path ("${id}.assigned")
-
+
script:
if (tool == "nanocount")
"""
@@ -214,9 +153,9 @@ process AssignReads {
"""
else if(tool == "htseq")
"""
- samtools view ${input} | awk '{gsub(/XF:Z:/,"",\$NF); print \$1"\t"\$NF}' |grep -v '__' > ${id}.assigned
+ samtools view ${input} | awk '{if (\$NF>1) {gsub(/XF:Z:/,"",\$NF); print \$1"\011"\$NF} }' | awk '{if (\$0!~"__") print }' > ${id}.assigned
"""
- else
+ else
error "Invalid alignment mode: ${tool}"
}
@@ -227,13 +166,13 @@ process AssignReads {
process countStats {
tag "${id}"
label (params.LABEL)
-
+
input:
tuple val(id), path(input)
output:
tuple val(id), path ("${id}.count.stats")
-
+
script:
"""
wc -l ${input} |sed s@.assigned@@g | awk '{print \$2"\t"\$1}' > ${id}.count.stats
@@ -241,18 +180,19 @@ process countStats {
}
/*
-* Join AlnStats
+* Join AlnStats
*/
process joinAlnStats {
label (params.LABEL)
tag "joining aln stats"
-
+ shell '/bin/bash'
+
input:
- file "alnqc_*"
+ file "alnqc_*"
output:
- path("alnQC_mqc.txt")
-
+ path("alnQC_mqc.txt")
+
script:
"""
echo '# id: alnQC
@@ -264,42 +204,42 @@ process joinAlnStats {
}
/*
-* Join Count Stats
+* Join Count Stats
*/
process joinCountStats {
tag "joining count stats"
label (params.LABEL)
-
+
input:
- file "stats_*"
+ file "stats_*"
output:
path("counts_mqc.txt")
-
+
script:
"""
echo '# id: Assigned reads
# plot_type: \'table\'
- # section_name: Assigned counts
- File name \'Counts\' ' > counts_mqc.txt
- cat stats_* >> counts_mqc.txt
+ # section_name: Assigned counts
+ File name \'Counts\' ' > counts_mqc.txt
+ cat stats_* >> counts_mqc.txt
"""
-}
+}
process bam2Cram {
- tag "${idfile}"
-
- publishDir(params.OUTPUT, mode:'copy')
+ tag "${idfile}"
+
+ publishDir(params.OUTPUT, mode:'copy')
label (params.LABEL)
input:
path(reference)
val(subsampling_val)
tuple val(idfile), path(aln), path(index)
-
+
output:
- file("*.sorted.cram*") optional true
-
+ file("*.sorted.cram*") optional true
+
script:
def downcmd = ""
def input = aln
@@ -323,21 +263,21 @@ process joinCountStats {
process checkRef {
tag "Checking ${ reference }"
label (params.LABEL)
-
+
input:
path(reference)
-
+
output:
path("reference.fa")
-
+
script:
"""
- if [ `echo ${reference} | grep ".gz"` ]; then
+ if [ `echo ${reference} | grep ".gz"` ]; then
zcat ${reference} > reference.fa
- else
+ else
ln -s ${reference} reference.fa
fi
- """
+ """
}
// MOP_MOD and MOP_TAIL
@@ -371,7 +311,7 @@ process splitBams {
script:
"""
- samtools faidx ${ref_piece}
+ samtools faidx ${ref_piece}
awk '{OFS=" "}{print \$1, "1", \$2}' ${ref_piece}.fai > ${ref_piece}.bed
samtools view -@ ${task.cpus} ${bams} -L ${ref_piece}.bed -S | samtools view -Sb -t ${ref_piece}.fai -@ ${task.cpus} -o ${combid}.bam
samtools sort -@ ${task.cpus} -o ${combid}_s.bam ${combid}.bam
@@ -388,13 +328,14 @@ process indexReference {
input:
path(reference)
-
+
output:
tuple val("${reference.simpleName}"), path(reference), path("*.dict"), path ("*.fai")
-
+
script:
"""
- \$PICARD CreateSequenceDictionary R=${reference} O=${reference}.dict
+ mkdir tmp
+ \$PICARD CreateSequenceDictionary TMP_DIR=./tmp R=${reference} O=${reference}.dict
samtools faidx ${reference}
"""
}
@@ -403,30 +344,29 @@ process joinEpinanoRes {
label (params.LABEL)
container 'biocorecrg/mopmod:0.6.2'
tag "joining on ${id}"
- publishDir(params.OUTPUT, mode:'copy')
+ publishDir(params.OUTPUT, mode:'copy')
input:
tuple val(id), path(epinanores)
-
+
output:
- tuple val(id), path("*.plus_strand.per.site.csv.gz"), emit: plusepi
- tuple val(id), path("*.plus_strand.per.site.csv.gz"), emit: minusepi
+ tuple val(id), path("*.plus_strand.per.site.csv.gz"), optional: true, emit: plusepi
+ tuple val(id), path("*.minus_strand.per.site.csv.gz"), optional: true, emit: minusepi
-
script:
"""
- if compgen -G "*.plus_strand.per.site.csv.gz" > /dev/null; then
- zcat *pieces*.plus_strand.per.site.csv.gz | awk '!(NR>1 && /#Ref/)' | gzip >> ${id}.plus_strand.per.site.csv.gz
+ if compgen -G "*.fwd.per.site.csv.gz" > /dev/null; then
+ zcat *pieces*.fwd.per.site.csv.gz | awk '!(NR>1 && /#Ref/)' | gzip >> ${id}.plus_strand.per.site.csv.gz
+ fi
+ if compgen -G "*.rev.per.site.csv.gz" > /dev/null; then
+ zcat *pieces*.rev.per.site.csv.gz | awk '!(NR>1 && /#Ref/)' | gzip >> ${id}.minus_strand.per.site.csv.gz
fi
- if compgen -G "*.minus_strand.per.site.csv.gz" > /dev/null; then
- zcat *pieces*.minus_strand.per.site.csv.gz | awk '!(NR>1 && /#Ref/)' | gzip >> ${id}.minus_strand.per.site.csv.gz
- fi
"""
}
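+// Note: the per-piece Epinano files are now matched as *.fwd.per.site.csv.gz / *.rev.per.site.csv.gz,
+// while the joined, published outputs keep the previous plus_strand / minus_strand naming.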
/*
-*
+*
*/
/*
@@ -437,17 +377,17 @@ process mean_per_pos {
container 'biocorecrg/mopmod:0.7'
label (params.LABEL)
- tag "${idsample}"
-
+ tag "${idsample}"
+
input:
- tuple val(idsample), path(event_align)
-
+ tuple val(idsample), path(event_align)
+
output:
tuple val(idsample), path("*_perpos_median.parquet")
script:
-
+
"""
mean_per_pos.py -i ${event_align} -o `basename ${event_align} .fast5_event_align.tsv.gz`
#gzip *_processed_perpos_median.tsv
@@ -461,11 +401,11 @@ process concat_mean_per_pos {
container 'biocorecrg/mopmod:0.7'
label (params.LABEL)
- tag "${idsample} on ${chr_file}"
-
+ tag "${idsample} on ${chr_file}"
+
input:
tuple val(idsample), path(event_align), path(chr_file)
-
+
output:
tuple val(idsample), path("${idsample}.gz")
@@ -478,19 +418,19 @@ process concat_mean_per_pos {
/*
-* CONCAT CSV FILES
+* CONCAT CSV FILES
*/
process concat_csv_files {
container 'biocorecrg/mopmod:0.7'
label (params.LABEL)
- tag "${idsample}"
-
- publishDir(params.OUTPUT, mode:'copy')
-
+ tag "${idsample}"
+
+ publishDir(params.OUTPUT, mode:'copy')
+
input:
tuple val(idsample), path("files_*")
-
+
output:
tuple val(idsample), path("${idsample}.csv.gz")
@@ -506,16 +446,16 @@ process concat_csv_files {
*/
process callVariants {
- tag "${sampleID}"
+ tag "${sampleID}"
container 'biocorecrg/mopmod:0.6'
label (params.LABEL)
-
+
input:
- tuple val(sampleID), path(alnfile), path(reference), path(dict_index), path(faiidx)
+ tuple val(sampleID), path(alnfile), path(reference), path(dict_index), path(faiidx)
output:
tuple val(sampleID), path("${sampleID}.tsv")
-
+
script:
"""
samtools view -h ${alnfile} -F 256 | \$SAM2TSV -R ${reference} | cut -f 3 --complement > ${sampleID}.tsv
@@ -527,19 +467,19 @@ process makeEpinanoPlots {
container "biocorecrg/mopnanotail:0.3"
label (params.LABEL)
- tag {"${sampleIDA}--${sampleIDB} ${mode}"}
-
+ tag {"${sampleIDA}--${sampleIDB} ${mode}"}
+
input:
path(rscript)
- tuple val(sampleIDA), val(sampleIDB), path(per_site_varA), path(per_site_varB)
+ tuple val(sampleIDA), val(sampleIDB), path(per_site_varA), path(per_site_varB)
val(mode)
-
+
output:
path("*.pdf")
-
+
script:
"""
- Rscript --vanilla ${rscript} ${per_site_varA} ${sampleIDA} ${per_site_varB} ${sampleIDB} ${mode}
+ Rscript --vanilla ${rscript} ${per_site_varA} ${sampleIDA} ${per_site_varB} ${sampleIDB} ${mode}
"""
}
@@ -547,19 +487,19 @@ process multiToSingleFast5 {
container 'biocorecrg/mopmod:0.6'
label (params.LABEL)
- tag "${idsample}"
-
+ tag "${idsample}"
+
input:
tuple val(idsample), path(fast5)
-
+
output:
tuple val(idsample), path("${idsample}-single")
-
+
script:
"""
mkdir ${idsample}-single;
- multi_to_single_fast5 -i ./ -s ./ -t ${task.cpus};
- rm ./filename_mapping.txt;
+ multi_to_single_fast5 -i ./ -s ./ -t ${task.cpus};
+ rm ./filename_mapping.txt;
mv ./*/*.fast5 ${idsample}-single;
"""
}
@@ -569,16 +509,16 @@ process multiToSingleFast5 {
*/
process bedGraphToWig {
container 'biocorecrg/mopmod:0.6'
- tag "${idsample}"
+ tag "${idsample}"
errorStrategy 'ignore'
-
+
input:
path(chromsizes)
tuple val(idsample), path(bedgraph)
-
+
output:
tuple val(idsample), path("*.bw")
-
+
script:
def ofname = "${bedgraph.baseName}.wig"
"""
@@ -592,7 +532,7 @@ process bedGraphToWig {
*/
process mergeTomboWigs {
label (params.LABEL)
- tag "${combID}"
+ tag "${combID}"
publishDir params.OUTPUT, pattern: "*_Tombo_Output.tsv.gz", mode: 'copy'
container "biocorecrg/mopmod:0.6"
@@ -601,8 +541,8 @@ process mergeTomboWigs {
tuple val(combID), path(coverage), path(covcontrol), path(statistic)
output:
- path("*_Tombo_Output.tsv.gz") optional true
-
+ path("*_Tombo_Output.tsv.gz") optional true
+
script:
"""
Merge_Tombo.py ${statistic} ${covcontrol} ${coverage} ${combID}.${strand}
@@ -614,14 +554,14 @@ process mergeTomboWigs {
*/
process RNA2DNA {
label (params.LABEL)
- tag "${id}"
+ tag "${id}"
input:
tuple val(id), path(rnafqfile)
output:
- tuple val(id), path("*_RNA.fq.gz")
-
+ tuple val(id), path("*_RNA.fq.gz")
+
script:
def ofname = "${rnafqfile.baseName}_RNA.fq"
@@ -636,7 +576,7 @@ process RNA2DNA {
*/
process wigToBigWig {
label (params.LABEL)
- tag "${id}"
+ tag "${id}"
container "biocorecrg/mopmod:0.6"
//errorStrategy 'ignore'
@@ -645,8 +585,8 @@ process wigToBigWig {
tuple val(id), path(bedgraph)
output:
- tuple val(id), path("*.bw") optional true
-
+ tuple val(id), path("*.bw") optional true
+
script:
def ofname = "${bedgraph.baseName}.bw"
@@ -666,15 +606,15 @@ process wigToBigWig {
process collect_tailfindr_results {
publishDir params.OUTPUT, pattern: "*_findr.csv.gz", mode: 'copy'
- tag "${ sampleID }"
+ tag "${ sampleID }"
label (params.LABEL)
-
+
input:
tuple val(sampleID), path("tailfin_*")
-
+
output:
- tuple val(sampleID), path("${sampleID}.findr.len.gz"), emit: length
- tuple val(sampleID), file ("*_findr.csv.gz"), emit: csv
+ tuple val(sampleID), path("${sampleID}.findr.len.gz"), emit: length
+ tuple val(sampleID), file ("*_findr.csv.gz"), emit: csv
script:
"""
@@ -692,20 +632,20 @@ process join_nanotail_results {
tag "joining nanotail results"
publishDir params.OUTPUT, mode: 'copy'
- tag { sampleID }
-
+ tag { sampleID }
+
input:
tuple val(sampleID), path(nanopol), path(tailfindr), path(genes)
file(joinScript)
-
+
output:
file("${sampleID}_*")
-
+
script:
"""
Rscript --vanilla ${joinScript} ${tailfindr} ${nanopol} ${genes} ${sampleID}
"""
-
+
}
@@ -715,7 +655,7 @@ process join_nanotail_results {
process filter_bam {
tag "${ sampleID }"
label (params.LABEL)
-
+
input:
file(reference)
tuple val(sampleID), path(alignment)
@@ -725,8 +665,8 @@ process filter_bam {
script:
"""
- #to keep only mapped reads and remove secondary alignments
- samtools view -@ {task.cpus} -bF 260 ${alignment} > ${sampleID}_filt.bam
+ #to keep only mapped reads and remove secondary alignments
+ samtools view -@ ${task.cpus} -bF 260 ${alignment} > ${sampleID}_filt.bam
"""
}
@@ -734,14 +674,14 @@ process filter_bam {
process indexFasta {
label (params.LABEL)
- tag "${reference}"
-
+ tag "${reference}"
+
input:
path(reference)
-
+
output:
- stdout
-
+ stdout
+
script:
"""
samtools faidx ${reference}
@@ -752,18 +692,20 @@ process indexFasta {
process getChromInfo {
label (params.LABEL)
- tag "${reference}"
-
+ tag "${reference}"
+
input:
path(reference)
-
+
output:
- path("chrom.sizes")
-
+ path("chrom.sizes"), emit: sizes
+ stdout emit: chromosomes
+
script:
"""
samtools faidx ${reference}
cut -f 1,2 ${reference}.fai > chrom.sizes
+ cut -f 1 chrom.sizes
"""
}
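+// getChromInfo now emits two channels: the chrom.sizes file (emit: sizes) and the chromosome
+// names printed to stdout, one per line (emit: chromosomes).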
@@ -774,17 +716,18 @@ process nanoConsensus {
label (params.LABEL)
errorStrategy 'ignore'
- tag "${sampleIDs} on ${chrName}"
-
+ tag "${sampleIDs} on ${chrName}"
+
input:
path(nanoConScript)
path(nanoScripts)
path(reference)
+ val(extraparams)
tuple val(sampleIDs), path(Epi_Sample), path(Epi_IVT), path(NP_Sample), path(NP_IVT), path(Tombo), path(Nanocomp), val(chrName), val(chrStart), val(chrEnd)
-
+
output:
path("*")
-
+
script:
"""
Rscript --vanilla ${nanoConScript} -Epi_Sample ${Epi_Sample} \
@@ -797,7 +740,7 @@ process nanoConsensus {
-ini_pos ${chrStart} -fin_pos ${chrEnd} \
-output ${sampleIDs} \
-fasta ${reference} \
- --nanocomp_stat GMM_logit_pvalue
+ --nanocomp_stat GMM_logit_pvalue ${extraparams}
"""
}
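+// extraparams is appended verbatim to the NanoConsensus.R command line; an illustrative value
+// would be "--bed mods.bed --ablines" (hypothetical file name) to mark annotated modified sites in the plots.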
@@ -806,10 +749,28 @@ process nanoConsensus {
* COMMON FUNCTIONS
*/
-// Create a channel for tool options
-def getParameters(pars_tools_file) {
- def pars_tools = file(pars_tools_file)
- if( !pars_tools.exists() ) exit 1, "Missing tools options config: '$pars_tools'"
+// Check the input of mop_preprocess
+def checkInput(fast5_par, fastq_par) {
+ def type = ""
+ if (fast5_par != "" && fastq_par == "") {
+ type = "fast5"
+ } else if(fast5_par == "" && fastq_par != "") {
+ type = "fastq"
+ } else {
+ println "ERROR ################################################################"
+ println "Please choose one between fast5 and fastq as input!!!"
+ println "ERROR ################################################################"
+ println "Exiting ..."
+ System.exit(0)
+ }
+ return (type)
+}
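+// Illustrative behaviour: checkInput("/data/*.fast5", "") returns "fast5",
+// checkInput("", "/data/*.fastq") returns "fastq"; setting both or neither stops the pipeline.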
+
+
+// Create a hash for tool options
+def getParameters (pars_tools_file) {
+ pars_tools = file(pars_tools_file)
+ if( !pars_tools.exists() ) exit 1, "Missing tools options config: '$pars_tools_file'"
def progPars = [:]
def allLines = pars_tools.readLines()
@@ -817,12 +778,12 @@ def getParameters(pars_tools_file) {
for( line : allLines ) {
def list = line.split("\t")
if (list.length <3) {
- error "ERROR!!! Tool option file has to be tab separated\n"
+ error "ERROR!!! Tool option file has to be tab separated\n"
}
if (!(list[0] =~ /#/ )) {
progPars["${list[0]}--${list[1]}"] = list[2].replace("\"", "").replace('$baseDir', "${baseDir}").replace('${baseDir}', "${baseDir}")
- }
- }
+ }
+ }
return(progPars)
}
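+// Illustrative tool-options file (tab-separated; lines starting with # are skipped):
+//   mapping<TAB>minimap2<TAB>"-x map-ont"
+// which getParameters() turns into the map entry ["mapping--minimap2": "-x map-ont"] (quotes stripped).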
@@ -839,17 +800,32 @@ def parseFinalSummary(final_summary_file) {
if (list[0] == "protocol") {
def vals = list[1].split(":")
outstring = "--flowcell ${vals[1]} --kit ${vals[2]}"
- }
- }
+ }
+ }
} else {
log.info '***No configuration file found!!. You must specify kit and flowcell in the parameters!!***\n'
- }
+ }
} else {
log.info '***No configuration file given!!. You must specify kit and flowcell in the parameters!!***\n'
}
return(outstring)
}
+// Filter a channel of barcoded data, keeping only the sample---barcode ids listed in mybarcodes
+def filterPerBarcodes (mybarcodes, barcoded_data) {
+ reshaped_barcoded_data = barcoded_data.map {
+ def id = it[0].split("---")[0]
+ def bc_id = it[0].split("\\.")[1]
+ def ori_id = "${id}---${bc_id}"
+ [ori_id, it]
+ }
+
+ filtered_data = reshaped_barcoded_data.combine(mybarcodes, by: 0).map {
+ it[1]
+ }
+ return(filtered_data)
+}
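+// Illustrative reshaping: an id such as "sampleA---3.barcode01" (assumed format) becomes
+// "sampleA---barcode01" before being joined with the mybarcodes channel.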
+
def reshapeDemuxSamples(inputChannel) {
def reshapedChannel = inputChannel.map {
@@ -868,6 +844,13 @@ def reshapeSamples(inputChannel) {
return(reshapedChannel)
}
+def homogenizeVals(value) {
+ new_value = value
+ if (value == "ON" || value == "YES" ) new_value = "ON"
+ if (value == "OFF" || value == "NO" ) new_value = "NO"
+ return(new_value)
+}
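+// e.g. homogenizeVals("YES") returns "ON" and homogenizeVals("OFF") returns "NO";
+// any other value is passed through unchanged.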
+
def mapIDPairs (ids, values) {
def combs = ids.combine(values, by:0).map{
[it[1], it[0], it[2]]
@@ -877,15 +860,53 @@ def mapIDPairs (ids, values) {
return(combs)
}
+
+// Create a channel of barcode ids to include
+def get_barcode_list (barcodes) {
+ if (barcodes != "") {
+ barcodes_to_include = Channel.fromPath( barcodes, checkIfExists: true ).splitText(){ it.trim() }
+ } else {
+ barcodes_to_include = Channel.empty()
+ }
+ return(barcodes_to_include)
+}
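+// The barcodes file is expected to contain one barcode id per line (e.g. "barcode01");
+// an empty parameter yields an empty channel.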
+
+// Create a channel of fast5 files, grouped per folder and split into chunks of params.granularity
+def getFast5 (fast5_string_path) {
+
+ fast5_files = Channel.fromPath( fast5_string_path, checkIfExists: true)
+
+ fast5_per_folder = fast5_files.map {
+ def filepath = file(it)
+ def file_parts = "${filepath}".tokenize("/")
+ def folder_name = filepath[-2]
+ [folder_name, it]
+ }.groupTuple()
+
+ def num = 0
+ fast5_4_analysis = fast5_per_folder.map{
+ def folder_name = it[0]
+ def buffer_files = it[1].flatten().collate(params.granularity)
+ [folder_name, buffer_files]
+ }.transpose().map{
+ num++
+ [ "${it[0]}---${num}", it[1] ]
+ }
+
+ return(fast5_4_analysis)
+}
+
+
+
def checkTools(tool_names, tool_lists) {
println "----------------------CHECK TOOLS -----------------------------"
- tool_names.each{ key, value ->
+ tool_names.each{ key, value ->
if (value == "NO" ) {
println "> ${key} will be skipped"
} else {
def combid = "${key}--${value}".toString()
if (tool_lists.containsKey(combid)) {
- println "${key} : ${value}"
+ println "${key} : ${value}"
} else {
println "ERROR ################################################################"
println "${value} is not a valid program for ${key}"
diff --git a/mop_consensus/bin/NanoConsensus.R b/mop_consensus/bin/NanoConsensus.R
index f248c5c..6eee06c 100644
--- a/mop_consensus/bin/NanoConsensus.R
+++ b/mop_consensus/bin/NanoConsensus.R
@@ -28,19 +28,21 @@ parser$add_argument("-output", "--Output_name", type="character", help="Output(s
parser$add_argument("-fasta", "--Fasta_file", type="character", help="Genome fasta file.")
parser$add_argument("-ini_pos", "--Initial_position", type="integer", default=50, help="Initial position [default %(default)].")
parser$add_argument("-fin_pos", "--Final_position", type="integer", help="Final position.")
-#parser$add_argument("-plot", "--Plotting", action="store_true", help="Plot significant positions for all methods.")
parser$add_argument("-chr", "--Chr", type="character", help="Character to study.")
-parser$add_argument("--MZS_thr", default=5, type="double",
+parser$add_argument("--MZS_thr", default=5, type="double",
help="Modified Z-Score threshold for all results [default %(default)]")
-parser$add_argument("--NC_thr", default=5, type="double",
+parser$add_argument("--NC_thr", default=5, type="double",
help="NanoConsensus score threshold for all results [default %(default)]")
parser$add_argument("-exclude", "--Exclude", nargs='+', type="integer", help="Exclude these positions from the analysis (SNPs) - it will exclude the 17-mer.")
-parser$add_argument("--model_score", default="global", type="character",
+parser$add_argument("--model_score", default="global", type="character",
help="Model used to calculate NanoConsensus score [default %(default)]")
-parser$add_argument("--coverage", default=1, type="integer",
+parser$add_argument("--coverage", default=1, type="integer",
help="Minimum coverage per position to be included in the analysis [default %(default)]")
-parser$add_argument("--nanocomp_stat", default="GMM_logit_pvalue_context_2", type="character",
+parser$add_argument("--nanocomp_stat", default="GMM_logit_pvalue_context_2", type="character",
help="Stat from Nanocompore output to be used [default %(default)]")
+parser$add_argument("--bed", help="Path to RNA modification annotation (*.bed)")
+parser$add_argument("--ablines", action='store_true', help="Plot reported modified sites from the bed file.")
+
#EPINANO:
parser$add_argument("-Epi_Sample", "--Epinano_Sample", nargs=1, type="character", help="Path to Epinano features sample results.")
@@ -70,13 +72,13 @@ write(paste('Z score threshold: ',args$MZS_thr, sep=""), file = paste("NanoConse
write(paste('NanoConsensus score threshold: ',args$NC_thr, "*median(NanoConsensus Score)", sep=""), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
write('Step 1: Processing data from individual softwares', file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
-##EPINANO processing:
+##EPINANO processing:
epinano_data <- epinano_processing(args$Epinano_Sample, args$Epinano_IVT, args$Initial_position, args$Final_position, args$MZS_thr, args$Chr, args$Exclude, args$coverage)
-##NANOPOLISH processing:
+##NANOPOLISH processing:
nanopolish_data <- nanopolish_processing(args$Nanopolish_Sample, args$Nanopolish_IVT, args$Initial_position, args$Final_position, args$MZS_thr, args$Chr, args$Exclude, args$coverage)
-##TOMBO processing:
+##TOMBO processing:
tombo_data <- tombo_processing(args$Tombo_Sample, args$thr_tombo_pos, args$thr_tombo_kmer, args$Initial_position, args$Final_position, args$MZS_thr, args$Chr, args$Exclude, args$coverage)
##NANOCOMPORE processing:
@@ -87,11 +89,17 @@ nanocompore_data <- nanocomp_processing(args$Nanocomp_Sample, args$nanocomp_metr
list_plotting <- list(epinano_data[[1]], nanopolish_data[[1]], tombo_data[[1]], nanocompore_data[[1]])
list_significant <- list(epinano_data[[2]], nanopolish_data[[2]], tombo_data[[2]], nanocompore_data[[2]])
+#If there is annotation, process it:
+if (length(args$bed)!=0){
+ annotation <- process_bed(args$bed, args$Chr)
+} else {
+ annotation <- c()
+}
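+#When --ablines is set and the bed file has entries for this chromosome, the plotting
+#functions below draw dashed vertical lines at the annotated positions (column V3).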
+
#Create Z-Scores plotting object:
write('Step 2: Plotting ZScores from individual softwares', file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
-barplot_4soft <- barplot_plotting(list_plotting, list_significant, args$Output_name, args$MZS_thr, args$Autoscaling, args$Initial_position, args$Final_position)
+barplot_4soft <- barplot_plotting(list_plotting, list_significant, args$Output_name, args$MZS_thr, args$Autoscaling, args$Initial_position, args$Final_position, annotation, args$ablines)
##Analysis of SIGNIFICANT POSITIONS across methods:
write('Step 3: Overlapping analysis and generation of Venn diagram', file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
-analysis_significant_positions(list_significant, list_plotting, args$Fasta_file, args$Output_name, args$Initial_position, args$Final_position, args$MZS_thr, args$NC_thr, args$model_score, barplot_4soft)
-
+analysis_significant_positions(list_significant, list_plotting, args$Fasta_file, args$Output_name, args$Initial_position, args$Final_position, args$MZS_thr, args$NC_thr, args$model_score, barplot_4soft, annotation, args$ablines)
diff --git a/mop_consensus/bin/scripts/Accessory_functions_consensusNanoMod.R b/mop_consensus/bin/scripts/Accessory_functions_consensusNanoMod.R
index b0feeea..63fb8c1 100644
--- a/mop_consensus/bin/scripts/Accessory_functions_consensusNanoMod.R
+++ b/mop_consensus/bin/scripts/Accessory_functions_consensusNanoMod.R
@@ -1,79 +1,79 @@
-###Script which contains multiple R functions used to generate consensus putative modified positions from NanoMod results.
+###Script which contains multiple R functions used to generate consensus putative modified positions from NanoMod results.
#Read gzipped or flat files
read_gzipped <- function(input_file) {
input <- input_file
if (file_ext(input_file)=="gz") {
- input = gzfile(input_file)
+ input = gzfile(input_file)
}
return(input)
}
read_tab_file <- function(input_file) {
- file_content <- read.delim(read_gzipped(input_file))
+ file_content <- read.delim(read_gzipped(input_file))
return(file_content)
}
read_csv_file <- function(input_file) {
- file_content <- read.csv(read_gzipped(input_file), stringsAsFactors = FALSE)
+ file_content <- read.csv(read_gzipped(input_file), stringsAsFactors = FALSE)
return(file_content)
}
#Processing Epinano results:
epinano_processing <- function(sample_file, ivt_file, initial_position, final_position, MZS_thr, chr, exclude_SNP, Coverage) {
-
+
#Import and clean data:
sample <- read_csv_file(sample_file)
sample <- subset(sample, cov>Coverage)
- sample <- subset(sample, pos>=initial_position)
+ sample <- subset(sample, pos>=initial_position)
sample <- subset(sample, pos<=final_position)
sample$reference <- paste(sample$X.Ref, sample$pos, sep='_')
sample$Difference <- as.numeric(sample$mis)+as.numeric(sample$ins)+as.numeric(sample$del)
- sample <- sample[,c(1,2,13,12)]
+ sample <- sample[,c(1,2,14,13)]
colnames(sample) <- c('Reference', 'Position', 'Difference_sample', 'Merge')
-
+
ivt <- read_csv_file(ivt_file)
ivt <- subset(ivt, cov>Coverage)
- ivt <- subset(ivt, pos>=initial_position)
+ ivt <- subset(ivt, pos>=initial_position)
ivt <- subset(ivt, pos<=final_position)
ivt$reference <- paste(ivt$X.Ref, ivt$pos, sep='_')
ivt$Difference <- as.numeric(ivt$mis)+as.numeric(ivt$ins)+as.numeric(ivt$del)
- ivt <- ivt[,c(1,2,13,12)]
+ ivt <- ivt[,c(1,2,14,13)]
colnames(ivt) <- c('Reference', 'Position', 'Difference_IVT', 'Merge')
-
+
if (nrow(sample)!=0 && nrow(ivt)!=0) {
#Join both dataframes and clean unecessary columns:
plotting_positions <- join(sample, ivt, by="Merge")
plotting_positions <- subset(plotting_positions, Reference == chr)
-
+
#Exclude SNPs and 10 positions before and after (21mer):
if (length(exclude_SNP)!=0) {
excluded_positions <- c()
-
+
for (single_position in exclude_SNP){
excluded_positions <- c(excluded_positions, seq(single_position-10,single_position+10))
}
-
+
plotting_positions <- subset(plotting_positions, !Position %in% unique(excluded_positions))
- }
-
+ }
+
plotting_positions$Difference <- abs(as.numeric(plotting_positions$Difference_sample) - as.numeric(plotting_positions$Difference_IVT))
plotting_positions$Feature <- "Epinano"
plotting_positions <- plotting_positions[,c(4,2,8,9)]
-
+
#Calculate the threshold:
threshold <- median(plotting_positions$Difference, na.rm = TRUE)
-
+
#Calculate fold change and re-order:
plotting_positions$Score <- plotting_positions$Difference/threshold
plotting_positions$Modified_ZScore <- (plotting_positions$Score-median(plotting_positions$Score, na.rm = TRUE))/sd(plotting_positions$Score, na.rm = TRUE)
-
+
plotting_positions <- plotting_positions[,c(1,2,5,4,6)]
colnames(plotting_positions) <- c('Reference', 'Position', 'Score', 'Feature', 'Modified_ZScore')
-
+
#Extract significant positions based on the specific threshold:
significant_positions <- subset(plotting_positions, Modified_ZScore>MZS_thr)
-
+
} else {
plotting_positions <- data.frame(Reference= character(), Position=integer(), Difference=double(), Feature=character())
significant_positions <- data.frame(Reference= character(), Position=integer(), Difference=double(), Feature=character())
@@ -85,56 +85,61 @@ epinano_processing <- function(sample_file, ivt_file, initial_position, final_po
nanopolish_processing <- function(sample_file, ivt_file, initial_position, final_position, MZS_thr, chr, exclude_SNP, Coverage) {
#Import data:
sample <- read_csv_file(sample_file)
-
- #Add sample information:
- sample$feature <- 'Nanopolish'
- sample <- subset(sample, coverage>Coverage)
- colnames(sample)<- c("contig_wt","position","reference_kmer_wt", "event_level_median_wt", "coverage", "feature_wt")
- sample<- subset(sample, contig_wt == chr)
- sample$reference <- paste(sample$contig_wt, sample$position, sep='_')
-
- #Import KO:
- raw_data_ivt <-read_csv_file(ivt_file)
- raw_data_ivt <- subset(raw_data_ivt, coverage>Coverage)
- colnames(raw_data_ivt)<- c("contig_ko","position","reference_kmer_ko", "event_level_median_ko", 'coverage')
- raw_data_ivt <- subset(raw_data_ivt, contig_ko == chr)
- raw_data_ivt$reference <- paste(raw_data_ivt$contig_ko, raw_data_ivt$position, sep='_')
-
- #Join tables, calculate differences between means/medians:
- plotting_data <- join(sample, raw_data_ivt, by="reference", type='inner')
- plotting_data$diff <- abs(plotting_data$event_level_median_ko-plotting_data$event_level_median_wt)
- plotting_positions <- data.frame(plotting_data$reference, plotting_data$position, plotting_data$diff, plotting_data$feature_wt)
- colnames(plotting_positions) <- c('Reference', 'Position', 'Difference', 'Feature')
-
- plotting_positions <- subset(plotting_positions, Position>=initial_position)
- plotting_positions <- subset(plotting_positions, Position<=final_position)
-
- #Exclude SNPs and 10 positions before and after (21mer):
- if (length(exclude_SNP)!=0) {
- excluded_positions <- c()
-
- for (single_position in exclude_SNP){
- excluded_positions <- c(excluded_positions, seq(single_position-10,single_position+10))
+ if (nrow(sample)>0) {
+ #Add sample information:
+ sample$feature <- 'Nanopolish'
+ sample <- subset(sample, coverage>Coverage)
+ colnames(sample)<- c("contig_wt","position","reference_kmer_wt", "event_level_median_wt", "coverage", "feature_wt")
+ sample<- subset(sample, contig_wt == chr)
+ sample$reference <- paste(sample$contig_wt, sample$position, sep='_')
+
+ #Import KO:
+ raw_data_ivt <- read_csv_file(ivt_file)
+ raw_data_ivt <- subset(raw_data_ivt, coverage>Coverage)
+ colnames(raw_data_ivt)<- c("contig_ko","position","reference_kmer_ko", "event_level_median_ko", 'coverage')
+ raw_data_ivt <- subset(raw_data_ivt, contig_ko == chr)
+ raw_data_ivt$reference <- paste(raw_data_ivt$contig_ko, raw_data_ivt$position, sep='_')
+
+ #Join tables, calculate differences between means/medians:
+ plotting_data <- join(sample, raw_data_ivt, by="reference", type='inner')
+ plotting_data$diff <- abs(plotting_data$event_level_median_ko-plotting_data$event_level_median_wt)
+ plotting_positions <- data.frame(plotting_data$reference, plotting_data$position, plotting_data$diff, plotting_data$feature_wt)
+ colnames(plotting_positions) <- c('Reference', 'Position', 'Difference', 'Feature')
+
+ plotting_positions <- subset(plotting_positions, Position>=initial_position)
+ plotting_positions <- subset(plotting_positions, Position<=final_position)
+
+ #Exclude SNPs and 10 positions before and after (21mer):
+ if (length(exclude_SNP)!=0) {
+ excluded_positions <- c()
+
+ for (single_position in exclude_SNP){
+ excluded_positions <- c(excluded_positions, seq(single_position-10,single_position+10))
+ }
+
+ plotting_positions <- subset(plotting_positions, !Position %in% unique(excluded_positions))
}
-
- plotting_positions <- subset(plotting_positions, !Position %in% unique(excluded_positions))
- }
-
- #Calculate the threshold:
- threshold <- median(plotting_positions$Difference, na.rm = TRUE)
-
- #Calculate fold change:
- plotting_positions$Score <- plotting_positions$Difference/threshold
- plotting_positions$Modified_ZScore <- (plotting_positions$Score-median(plotting_positions$Score, na.rm = TRUE))/sd(plotting_positions$Score, na.rm = TRUE)
-
- #Format data for plotting:
- plotting_positions <- plotting_positions[,c(1,2,5,4,6)]
-
- #Extract significant positions:
- significant_positions <- subset(plotting_positions, Modified_ZScore>MZS_thr)
-
+
+ #Calculate the threshold:
+ threshold <- median(plotting_positions$Difference, na.rm = TRUE)
+
+ #Calculate fold change:
+ plotting_positions$Score <- plotting_positions$Difference/threshold
+ plotting_positions$Modified_ZScore <- (plotting_positions$Score-median(plotting_positions$Score, na.rm = TRUE))/sd(plotting_positions$Score, na.rm = TRUE)
+
+ #Format data for plotting:
+ plotting_positions <- plotting_positions[,c(1,2,5,4,6)]
+
+ #Extract significant positions:
+ significant_positions <- subset(plotting_positions, Modified_ZScore>MZS_thr)
+
+ } else {
+ plotting_positions <- data.frame(Reference= character(), Position=integer(), Difference=double(), Feature=character())
+ significant_positions <- data.frame(Reference= character(), Position=integer(), Difference=double(), Feature=character())
+ }
+
return(list(plotting_positions,significant_positions))
-
+
}
tombo_processing <- function(sample_file, t_position, t_kmer, initial_position, final_position, MZS_thr, chr, exclude_SNP, Coverage) {
@@ -147,47 +152,47 @@ tombo_processing <- function(sample_file, t_position, t_kmer, initial_position,
sample <- subset(sample, Coverage_Sample>Coverage & Coverage_IVT>Coverage)
colnames(sample) <- c('Reference', 'Chr', 'Position', 'Difference', 'Coverage_Sample', 'Coverage_IVT', 'Statistic_kmer',
'Feature')
-
+
sample <- subset(sample, Chr == chr)
sample <- subset(sample, Position >= initial_position)
sample <- subset(sample, Position <= final_position)
-
+
#Exclude SNPs and 10 positions before and after (21mer):
if (length(exclude_SNP)!=0) {
excluded_positions <- c()
-
+
for (single_position in exclude_SNP){
excluded_positions <- c(excluded_positions, seq(single_position-10,single_position+10))
}
-
+
sample <- subset(sample, !Position %in% unique(excluded_positions))
- }
-
+ }
+
#Calculate the thresholds:
threshold_position <- median(sample$Difference, na.rm = TRUE)
threshold_kmer <- median(sample$Statistic_kmer, na.rm = TRUE)
-
+
#Calculate fold change:
sample$Score <- sample$Difference/threshold_position
sample$Score_kmer <- sample$Statistic_kmer/threshold_kmer
sample$Modified_ZScore <- (sample$Score-median(sample$Score, na.rm = TRUE))/sd(sample$Score, na.rm = TRUE)
sample$Modified_ZScore_kmer <- (sample$Score_kmer-median(sample$Score_kmer, na.rm = TRUE))/sd(sample$Score_kmer, na.rm = TRUE)
-
- #Filter columns to get data in plotting format:
+
+ #Filter columns to get data in plotting format:
plotting_positions <- sample[,c(1,3,9,8,11)]
-
+
#Extract significant positions and kmers and then perform the intersection:
positions <- subset(sample, Modified_ZScore > MZS_thr)
kmer <- subset(sample, Modified_ZScore_kmer > MZS_thr)
-
+
significant_positions <- join(kmer, positions, by = 'Reference', type = "inner")
significant_positions <- significant_positions[,c(1,3,9,8,11)]
-
+
} else {
plotting_positions <- data.frame(Reference= character(), Position=integer(), Difference=double(), Feature=character())
significant_positions <- data.frame(Reference= character(), Position=integer(), Difference=double(), Feature=character())
}
-
+
return(list(plotting_positions, significant_positions))
}
@@ -195,32 +200,32 @@ nanocomp_processing <- function(sample_file, nanocomp_metric, t_nanocomp, initia
#Import data:
sample <- read_tab_file(sample_file)
if (nrow(sample)>0) {
-
+
#Transform metric:
sample$stat <- log(sample[[nanocomp_stat]])
sample$log_stat <- (sample$stat)*(-1)
-
+
#Prepare plotting data:
sample$reference <- paste(sample$ref_id, sample$pos, sep='_')
sample$Feature <- 'Nanocompore'
-
+
sample <- sample[which(sample$ref_id==chr),]
plotting_data <- sample[,c(ncol(sample)-1, 1, ncol(sample)-2, ncol(sample))]
colnames(plotting_data) <- c('Reference', 'Position', 'Difference', 'Feature')
plotting_data <- subset(plotting_data, Position>=initial_position)
plotting_data <- subset(plotting_data, Position <= final_position)
-
+
#Exclude SNPs and 10 positions before and after (21mer):
if (length(exclude_SNP)!=0) {
excluded_positions <- c()
-
+
for (single_position in exclude_SNP){
excluded_positions <- c(excluded_positions, seq(single_position-10,single_position+10))
}
-
+
plotting_data <- subset(plotting_data, !Position %in% unique(excluded_positions))
- }
-
+ }
+
#Calculate the thresholds:
threshold <- median(plotting_data$Difference, na.rm = TRUE)
@@ -233,19 +238,26 @@ nanocomp_processing <- function(sample_file, nanocomp_metric, t_nanocomp, initia
#Extract significant positions:
significant_positions <- subset(plotting_data, Modified_ZScore > MZS_thr)
-
+
} else {
plotting_data <- data.frame(Reference= character(), Position=integer(), Difference=double(), Feature=character())
significant_positions <- data.frame(Reference= character(), Position=integer(), Difference=double(), Feature=character())
}
-
-
+
+
return(list(plotting_data, significant_positions))
}
-barplot_plotting <- function (list_plotting, list_significant, output_name, MZS_thr, autoscaling, initial_pos, final_pos){
-
- #Rbind all data - already in long format:
+process_bed <- function(bed_file, chr) {
+ whole_bed <- read.delim(bed_file, header=FALSE)
+ chr_bed <- subset(whole_bed, V1==chr)
+
+ return(chr_bed)
+}
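+#The bed file is read without a header; only rows whose first column (V1) matches the target
+#chromosome are kept, and columns V3/V4 are used later for positions and modification labels.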
+
+barplot_plotting <- function (list_plotting, list_significant, output_name, MZS_thr, autoscaling, initial_pos, final_pos, annotation, ablines){
+
+ #Rbind all data - already in long format:
initial_join <- TRUE
for (i in 2:length(list_plotting)){
if (initial_join==TRUE){
@@ -257,73 +269,109 @@ barplot_plotting <- function (list_plotting, list_significant, output_name, MZS_
putative_positions <- rbind(putative_positions, list_significant[[i]])
}
}
-
+
#Set Feature into a factor for plotting purposes:
initial_df$sample_f <- factor(initial_df$Feature, levels = c('Epinano', 'Nanopolish', 'Tombo', 'Nanocompore'))
putative_positions$sample_f <- factor(putative_positions$Feature, levels = c('Epinano', 'Nanopolish', 'Tombo', 'Nanocompore'))
-
- #Plotting:
- barplot_4soft <- ggplot(initial_df, aes(x=Position, y=Modified_ZScore, fill=sample_f)) + ggtitle(output_name) +
- geom_bar(data=subset(initial_df, Modified_ZScore < MZS_thr), stat= "identity", width=4, fill = "#dcdcdd") +
- new_scale_color() + xlim(initial_pos, final_pos) + ylab('Z-Score ((x-median)/sd)') + xlab("") +
- geom_bar(data=subset(initial_df, Modified_ZScore >= MZS_thr), stat = "identity", width=4) +
- scale_fill_manual(values = c("#00A651", "#662D91", "#00AEEF", "#F59364")) +
- theme_bw() +theme(plot.title = element_text(face = "bold", hjust = 0.5), text = element_text(size=25),
- axis.text = element_text(size = 25), strip.text.y = element_text(size = 25),
- legend.text=element_text(size=22), legend.position = "none") +
- facet_grid(sample_f ~ . , scales="fixed")
+
+ ##Plotting:
+ #If there are annotated positions:
+ if (nrow(annotation)!=0 && ablines){
+ barplot_4soft <- ggplot(initial_df, aes(x=Position, y=Modified_ZScore, fill=sample_f)) + ggtitle(output_name) +
+ geom_bar(data=subset(initial_df, Modified_ZScore < MZS_thr), stat= "identity", width=4, fill = "#dcdcdd") +
+ new_scale_color() + xlim(initial_pos, final_pos) + ylab('Z-Score ((x-median)/sd)') + xlab("") +
+ geom_bar(data=subset(initial_df, Modified_ZScore >= MZS_thr), stat = "identity", width=4) +
+ scale_fill_manual(values = c("#00A651", "#662D91", "#00AEEF", "#F59364"), breaks = c("Epinano", "Nanopolish", "Tombo", "Nanocompore")) +
+ geom_vline(xintercept=as.numeric(annotation$V3), linetype="dashed") +
+ theme_bw() +theme(plot.title = element_text(face = "bold", hjust = 0.5), text = element_text(size=25),
+ axis.text = element_text(size = 25), strip.text.y = element_text(size = 25),
+ legend.text=element_text(size=22), legend.position = "none") +
+ facet_grid(sample_f ~ . , scales="fixed")
+
+ } else {
+ barplot_4soft <- ggplot(initial_df, aes(x=Position, y=Modified_ZScore, fill=sample_f)) + ggtitle(output_name) +
+ geom_bar(data=subset(initial_df, Modified_ZScore < MZS_thr), stat= "identity", width=4, fill = "#dcdcdd") +
+ new_scale_color() + xlim(initial_pos, final_pos) + ylab('Z-Score ((x-median)/sd)') + xlab("") +
+ geom_bar(data=subset(initial_df, Modified_ZScore >= MZS_thr), stat = "identity", width=4) +
+ scale_fill_manual(values = c("#00A651", "#662D91", "#00AEEF", "#F59364"), breaks = c("Epinano", "Nanopolish", "Tombo", "Nanocompore")) +
+ theme_bw() +theme(plot.title = element_text(face = "bold", hjust = 0.5), text = element_text(size=25),
+ axis.text = element_text(size = 25), strip.text.y = element_text(size = 25),
+ legend.text=element_text(size=22), legend.position = "none") +
+ facet_grid(sample_f ~ . , scales="fixed")
+ }
+
+
return(barplot_4soft)
}
-Nanoconsensus_plotting <- function(data, supported_kmers, output_name, barplot_4soft) {
-
+Nanoconsensus_plotting <- function(data, supported_kmers, output_name, barplot_4soft, initial_pos, final_pos, annotation, ablines) {
+
#Extracting supported kmers:
supported_positions <- c()
kmers_limits <- c()
barplot_4soft <- barplot_4soft
-
+
#Format data:
data$Position <- data$Start+2
data$Feature <- "NanoConsensus"
-
+
#Generate NanoConsensus track:
if (nrow(supported_kmers)!=0) {
#If supported kmers, include them in the plot object:
for (i in seq(1, nrow(supported_kmers))) {
supported_positions <- c(supported_positions, seq(supported_kmers[i,2], supported_kmers[i,3]))
kmers_limits <- c(kmers_limits, supported_kmers[i,2], supported_kmers[i,3])
-
+
}
-
+
#Retrieve borders of supported kmers:
limits_supp_kmers <- subset(data[,c(16,17,18)], Position %in% kmers_limits)
-
- #Adding end of the transcript border if needed:
- #if (nrow(limits_supp_kmers)!=length(kmers_limits)) {
- #
- #}
-
- #Create plot object:
- nanoconsensus_plot <- ggplot(data, aes(x=Position, y=Merged_Score)) +
- geom_bar(stat= "identity", width=4, fill = "#dcdcdd") + ylim(0,1) +
- geom_bar(data=subset(data, Position %in% supported_positions), stat= "identity", width=4, fill = "#BE1E2D") +
- geom_label_repel(data=limits_supp_kmers,aes(label = Position, x=Position, y = Merged_Score), size = 8, label.size = 0.75) +
- ylab('NanoConsensus Score') +
- theme_bw() +theme(plot.title = element_text(face = "bold", hjust = 0.5), text = element_text(size=25),
- axis.text = element_text(size = 25), strip.text.y = element_text(size = 25),
- legend.text=element_text(size=22)) +
- facet_grid(Feature ~ . , scales="fixed")
-
+
+ #If there are annotated positions:
+ if (nrow(annotation)!=0 && ablines){
+ #Create plot object:
+ nanoconsensus_plot <- ggplot(data, aes(x=Position, y=Merged_Score)) +
+ geom_bar(stat= "identity", width=4, fill = "#dcdcdd") + ylim(0,1) +
+ geom_bar(data=subset(data, Position %in% supported_positions), stat= "identity", width=4, fill = "#BE1E2D") +
+ geom_label_repel(data=limits_supp_kmers,aes(label = Position, x=Position, y = Merged_Score), size = 8, label.size = 0.75) +
+ ylab('NanoConsensus Score') + xlim(initial_pos, final_pos) +
+ geom_vline(xintercept=as.numeric(annotation$V3), linetype="dashed") +
+ theme_bw() +theme(plot.title = element_text(face = "bold", hjust = 0.5), text = element_text(size=25),
+ axis.text = element_text(size = 25), strip.text.y = element_text(size = 25),
+ legend.text=element_text(size=22)) +
+ facet_grid(Feature ~ . , scales="fixed")
+ } else {
+ nanoconsensus_plot <- ggplot(data, aes(x=Position, y=Merged_Score)) +
+ geom_bar(stat= "identity", width=4, fill = "#dcdcdd") + ylim(0,1) +
+ geom_bar(data=subset(data, Position %in% supported_positions), stat= "identity", width=4, fill = "#BE1E2D") +
+ geom_label_repel(data=limits_supp_kmers,aes(label = Position, x=Position, y = Merged_Score), size = 8, label.size = 0.75) +
+ ylab('NanoConsensus Score') + xlim(initial_pos, final_pos) +
+ theme_bw() +theme(plot.title = element_text(face = "bold", hjust = 0.5), text = element_text(size=25),
+ axis.text = element_text(size = 25), strip.text.y = element_text(size = 25),
+ legend.text=element_text(size=22)) +
+ facet_grid(Feature ~ . , scales="fixed")
+ }
+
} else {
#Create plot object if there arent any supported kmers:
- nanoconsensus_plot <- ggplot(data, aes(x=Position, y=Merged_Score)) + geom_bar(stat= "identity", width=4, fill = "#dcdcdd") + ylim(0,1) +
- ylab('NanoConsensus Score') +
- theme_bw() +theme(plot.title = element_text(face = "bold", hjust = 0.5), text = element_text(size=25),
- axis.text = element_text(size = 25), strip.text.y = element_text(size = 25),
- legend.text=element_text(size=22)) + facet_grid(Feature ~ . , scales="fixed")
+ if (nrow(annotation)!=0 && ablines){
+ nanoconsensus_plot <- ggplot(data, aes(x=Position, y=Merged_Score)) + geom_bar(stat= "identity", width=4, fill = "#dcdcdd") + ylim(0,1) +
+ ylab('NanoConsensus Score') + xlim(initial_pos, final_pos) +
+ geom_vline(xintercept=as.numeric(annotation$V3), linetype="dashed") +
+ theme_bw() +theme(plot.title = element_text(face = "bold", hjust = 0.5), text = element_text(size=25),
+ axis.text = element_text(size = 25), strip.text.y = element_text(size = 25),
+ legend.text=element_text(size=22)) + facet_grid(Feature ~ . , scales="fixed")
+ } else {
+ nanoconsensus_plot <- ggplot(data, aes(x=Position, y=Merged_Score)) + geom_bar(stat= "identity", width=4, fill = "#dcdcdd") + ylim(0,1) +
+ ylab('NanoConsensus Score') + xlim(initial_pos, final_pos) +
+ theme_bw() +theme(plot.title = element_text(face = "bold", hjust = 0.5), text = element_text(size=25),
+ axis.text = element_text(size = 25), strip.text.y = element_text(size = 25),
+ legend.text=element_text(size=22)) + facet_grid(Feature ~ . , scales="fixed")
+ }
+
}
-
+
#Plot both plots in the same pdf file:
pdf(file=paste(output_name,"NanoConsensus_Scores.pdf", sep = "-"), bg = "transparent", width = 26, height = 15.75 )
g2 <- ggplotGrob(barplot_4soft)
@@ -332,9 +380,9 @@ Nanoconsensus_plotting <- function(data, supported_kmers, output_name, barplot_4
g$widths <- unit.pmax(g2$widths, g3$widths)
#grid.newpage()
grid.draw(g)
-
+
dev.off()
-
+
}
extract_length_from_GRobjects <- function(GRange_object) {
@@ -344,150 +392,150 @@ extract_length_from_GRobjects <- function(GRange_object) {
} else {
n_length <- length(GRange_object)
}
-
+
return(n_length)
-
+
}
overlapping_GRobjects <- function(GRange_object_1, GRange_object_2, length_object1, length_object2) {
-
+
#Only perform the intersection if both GRange are valid:
if (is.null(GRange_object_1)==FALSE & is.null(GRange_object_2)==FALSE) {
-
+
#Perform the intersections:
if (length_object1 >= length_object2) {
intersect_object <- subsetByOverlaps(GRange_object_2, GRange_object_1, minoverlap=1)
} else {
intersect_object <- subsetByOverlaps(GRange_object_1, GRange_object_2, minoverlap=1)
}
-
+
} else {
-
+
intersect_object <- GRanges()
}
-
+
return(intersect_object)
-
+
}
draw_pairwise_venn_diagram <- function (group_1, group_2, intersect_12, groups, output_name){
-
+
#Draw Venn Diagram:
grid.newpage()
- venn.plot <- draw.pairwise.venn(group_1, group_2, intersect_12,
+ venn.plot <- draw.pairwise.venn(group_1, group_2, intersect_12,
category = groups, fill = c("darksalmon", "dodgerblue"), cat.pos = c(0, 0), alpha = 0.5
)
-
+
# Writing to file
png(filename = paste(output_name,'VennDiagram.png', sep="_"))
grid.draw(venn.plot)
dev.off()
-
+
}
draw_triple_venn_diagram <- function (group_1, group_2, group_3, intersect_12, intersect_13, intersect_23, intersect_123, groups, output_name){
-
+
#Draw Venn Diagram:
grid.newpage()
- venn.plot <- draw.triple.venn(group_1, group_2, group_3, intersect_12, intersect_23, intersect_13,
+ venn.plot <- draw.triple.venn(group_1, group_2, group_3, intersect_12, intersect_23, intersect_13,
intersect_123, category = groups, fill = c("darksalmon", "dodgerblue", "lightseagreen"), cat.pos = c(-45, 0, 45), alpha = 0.5
)
-
+
# Writing to file
png(filename = paste(output_name,'VennDiagram.png', sep="_"))
grid.draw(venn.plot)
dev.off()
-
+
}
draw_venn_diagram <- function (group_1, group_2, group_3, group_4, intersect_12, intersect_13, intersect_14, intersect_23, intersect_24,
intersect_34, intersect_123, intersect_124, intersect_134, intersect_234, intersect_1234, groups, output_name){
-
+
#Draw Venn Diagram:
grid.newpage()
venn.plot <- draw.quad.venn(group_1, group_2, group_3, group_4, intersect_12, intersect_13, intersect_14, intersect_23, intersect_24,
- intersect_34, intersect_123, intersect_124, intersect_134, intersect_234, intersect_1234,
+ intersect_34, intersect_123, intersect_124, intersect_134, intersect_234, intersect_1234,
category = groups, fill = c("darksalmon", "dodgerblue", "lightseagreen", "darkorange"), cat.pos = c(0, 0, 0, 0), alpha = 0.5
)
-
+
# Writing to file
png(filename = paste(output_name,'VennDiagram.png', sep="_"))
grid.draw(venn.plot)
dev.off()
-
+
}
extract_kmers <- function (bedfile, fasta) {
#Create Temp files:
a.file=tempfile()
out=tempfile()
-
+
#Format the ranges to obtain 9-mers, centered in the 5mer identified by NanoMod - REMEMBER: bedtools understands bed files as 0-based!
bedfile$Start <- bedfile$Start-3
bedfile$End <- bedfile$End+2
-
+
#Write formatted dataframes to tempfile
write.table(bedfile,file=a.file,quote=F,sep="\t",col.names=F,row.names=F)
-
+
#Create the command for the bedtools command and execute it:
command=paste("bedtools getfasta -fi", fasta, "-bed",a.file,"-tab >",out,sep=" ")
try(system(command))
-
+
#Save results into a dataframe:
res=read.table(out,header=F)
-
- #Check if there is the RRACH motif using regular expressions:
+
+ #Check if there is the RRACH motif using regular expressions:
motif <- c()
pattern <- "([A|G]{2})AC([A|C|T]{1})"
-
+
for (i in 1:nrow(res)) {
motif <- c(motif,str_detect(res$V2[i], pattern))
}
-
+
#Add new columns:
colnames(res) <- c('Data', 'Kmer')
res$RRACH_motif <- motif
-
+
return(list(res$Kmer, res$RRACH_motif))
-
+
}
overwrite_NaNs <- function (input) {
-
+
if (is.nan(input) == TRUE || is.infinite(input) == TRUE || length(input) == 0) {
out_value <- NA
} else {
out_value <- input
}
-
+
return(out_value)
}
extracting_status <- function (positions_df, list_number, summit, MZS_thr) {
-
+
##Declaring initial variables:
soft_rawScore <- c()
soft_modifiedScore <- c()
soft_status <- c()
-
- #Loop across kmers:
+
+ #Loop across kmers:
for (i in seq(1:length(positions_df$Start))){
initial_position <- positions_df$Start[i]
final_position <- positions_df$End[i]
-
+
##Searching for the highest value - summit:
if (summit == TRUE) {
#Looping within the kmer to find the highest value - summit:
for (x in seq(initial_position, final_position)){
-
+
if (x == initial_position){
highest_rawScore <- overwrite_NaNs(list_plotting[[list_number]][which(list_plotting[[list_number]]$Position == x), 3])
highest_modifiedScore <- overwrite_NaNs(list_plotting[[list_number]][which(list_plotting[[list_number]]$Position == x), 5])
-
+
} else {
new_rawScore <- overwrite_NaNs(list_plotting[[list_number]][which(list_plotting[[list_number]]$Position == x), 3])
new_modifiedScore <- overwrite_NaNs(list_plotting[[list_number]][which(list_plotting[[list_number]]$Position == x), 5])
-
+
##Checking for higher score - rawScore:
if (is.na(highest_rawScore) == TRUE || is.nan(highest_rawScore) == TRUE){
if (is.na(new_rawScore) == FALSE & is.nan(new_rawScore) == FALSE) {
@@ -495,14 +543,14 @@ extracting_status <- function (positions_df, list_number, summit, MZS_thr) {
}
} else if (is.na(new_rawScore) == TRUE || is.nan(new_rawScore) == TRUE) {
next
-
+
} else {
if (new_rawScore > highest_rawScore){
highest_rawScore <- new_rawScore
- }
+ }
}
-
-
+
+
##Checking for higher score - modified score:
if (is.na(highest_modifiedScore) == TRUE || is.nan(highest_modifiedScore) == TRUE){
if (is.na(new_modifiedScore) == FALSE & is.nan(new_modifiedScore) == FALSE) {
@@ -510,25 +558,25 @@ extracting_status <- function (positions_df, list_number, summit, MZS_thr) {
}
} else if (is.na(new_modifiedScore) == TRUE || is.nan(new_modifiedScore) == TRUE) {
next
-
+
} else {
if (new_modifiedScore > highest_modifiedScore){
highest_modifiedScore <- new_modifiedScore
- }
+ }
}
-
+
}
-
+
}
-
+
##Adding high score to final output:
soft_rawScore[i] <- highest_rawScore
soft_modifiedScore[i] <- highest_modifiedScore
-
+
#Check if a specific software identified it:
if (is.na(highest_modifiedScore) == TRUE || is.nan(highest_modifiedScore) == TRUE) {
soft_status <- c(soft_status, 'NO')
-
+
} else {
if(soft_modifiedScore[i] >= MZS_thr){
soft_status <- c(soft_status, 'YES')
@@ -536,11 +584,11 @@ extracting_status <- function (positions_df, list_number, summit, MZS_thr) {
soft_status <- c(soft_status, 'NO')
}
}
-
+
} else {
##Searching for position 0 value:
position <- initial_position + 2
-
+
##Software - extract values:
rawScore <- list_plotting[[list_number]][which(list_plotting[[list_number]]$Position == position), 3]
if (length(rawScore)==0 || is.infinite(rawScore) == TRUE){
@@ -548,9 +596,9 @@ extracting_status <- function (positions_df, list_number, summit, MZS_thr) {
} else {
soft_rawScore <- c(soft_rawScore, rawScore)
}
-
+
modifiedScore <- list_plotting[[list_number]][which(list_plotting[[list_number]]$Position == position), 5]
-
+
if (length(modifiedScore)==0 || is.infinite(modifiedScore) == TRUE){
soft_modifiedScore <- c(soft_modifiedScore, NA)
} else {
@@ -563,7 +611,7 @@ extracting_status <- function (positions_df, list_number, summit, MZS_thr) {
single_pos_2 <- overwrite_NaNs(list_plotting[[list_number]][which(list_plotting[[list_number]]$Position == initial_position+2), 5])
single_pos_3 <- overwrite_NaNs(list_plotting[[list_number]][which(list_plotting[[list_number]]$Position == initial_position+3), 5])
single_pos_4 <- overwrite_NaNs(list_plotting[[list_number]][which(list_plotting[[list_number]]$Position == initial_position+4), 5])
-
+
#Loop over the kmer to find if specific softwares identified it:
kmer_positions <- c(single_pos_0, single_pos_1, single_pos_2, single_pos_3, single_pos_4)
if(any(kmer_positions >= MZS_thr, na.rm = TRUE)){
@@ -578,7 +626,7 @@ extracting_status <- function (positions_df, list_number, summit, MZS_thr) {
#Create final dataframe:
final <- data.frame(soft_rawScore, soft_modifiedScore, soft_status)
colnames(final) <- c('rawScore', 'modifiedScore', 'status')
-
+
return(final)
}
@@ -587,120 +635,118 @@ calcNanoConsensusScore <- function(data, type) {
w <- c(0.36,0.1,0.21,0.33)
processed_data <- sweep(data, MARGIN=2, w, "*")
return(apply(processed_data,1,sum,na.rm = TRUE))
-
+
} else if (type=="m5C") {
w <- c(0.25,0.05,0.70,0)
processed_data <- sweep(data, MARGIN=2, w, "*")
return(apply(processed_data,1,sum,na.rm = TRUE))
-
+
} else if (type=="m7G") {
w <- c(0.33,0.25,0.42,0)
processed_data <- sweep(data, MARGIN=2, w, "*")
return(apply(processed_data,1,sum,na.rm = TRUE))
-
+
} else if (type=="Am") {
w <- c(0.23,0.07,0.55,0.15)
processed_data <- sweep(data, MARGIN=2, w, "*")
return(apply(processed_data,1,sum,na.rm = TRUE))
-
+
} else if (type=="Um") {
w <- c(0.27,0.12,0.5,0.11)
processed_data <- sweep(data, MARGIN=2, w, "*")
return(apply(processed_data,1,sum,na.rm = TRUE))
-
+
} else if (type=="pU") {
w <- c(0.22,0.19,0.55,0.04)
processed_data <- sweep(data, MARGIN=2, w, "*")
return(apply(processed_data,1,sum,na.rm = TRUE))
-
+
} else {
return(apply(data,1,median,na.rm = TRUE))
}
}
extracting_modified_ZScores <- function (GRange_supported_kmers, list_plotting, MZS_thr, summit, Consensus_score, model_score) {
-
+
#Create vectors to store software data:
epinano_rawScore <- c()
nanopolish_rawScore <- c()
tombo_rawScore <- c()
nanocompore_rawScore <- c()
-
+
epinano_modifiedScore <- c()
nanopolish_modifiedScore <- c()
tombo_modifiedScore <- c()
nanocompore_modifiedScore <- c()
-
+
epinano_status <- c()
nanopolish_status <- c()
tombo_status <- c()
nanocompore_status <- c()
-
+
#Parse data into a data frame:
positions_df <- data.frame(start(GRange_supported_kmers), end(GRange_supported_kmers))
colnames(positions_df) <- c('Start', 'End')
positions_df$Chr <- seqlevels(GRange_supported_kmers)
positions_df <- positions_df[,c(3,1,2)]
-
+
#Extracting scores and software status:
epinano_data <- extracting_status(positions_df, 1, summit, MZS_thr)
nanopolish_data <- extracting_status(positions_df, 2, summit, MZS_thr)
tombo_data <- extracting_status(positions_df, 3, summit, MZS_thr)
nanocompore_data <- extracting_status(positions_df, 4, summit, MZS_thr)
-
+
#Add data to the final dataframe:
positions_df$Epinano_RawScore <- epinano_data$rawScore
positions_df$Nanopolish_RawScore <- nanopolish_data$rawScore
positions_df$Tombo_RawScore <- tombo_data$rawScore
positions_df$Nanocompore_RawScore <- nanocompore_data$rawScore
-
+
positions_df$Epinano_Score <- epinano_data$modifiedScore
positions_df$Nanopolish_Score <- nanopolish_data$modifiedScore
positions_df$Tombo_Score <- tombo_data$modifiedScore
positions_df$Nanocompore_Score <- nanocompore_data$modifiedScore
-
+
positions_df$Epinano_Status <- epinano_data$status
positions_df$Nanopolish_Status <- nanopolish_data$status
positions_df$Tombo_Status <- tombo_data$status
positions_df$Nanocompore_Status <- nanocompore_data$status
-
+
positions_NanoConsensus <- c()
-
+
##Calculate the merged_score:
#Re-scaling:
if (summit == F){
data <- data.frame(positions_df$Epinano_Score, positions_df$Nanopolish_Score, positions_df$Tombo_Score, positions_df$Nanocompore_Score)
-
+
#Re-scale Modified Z-Scores between 0 and 1:
for (i in seq(1:length(data))) {
data[,i] <- rescale(unlist(data[i]), to=c(0,1), na.rm=TRUE)
-
+
}
-
- #data[is.na(data)] <- 0
-
+
#Rescale outputs 0.5 when the software gives the same MZS for all positions - correcting for it if needed:
if (length(unique(data$positions_df.Epinano_Score)) == 1 || all(is.na(unique(data$positions_df.Epinano_Score)))) {
- data$positions_df.Epinano_Score <- 0
- }
-
+ data$positions_df.Epinano_Score <- 0
+ }
+
if (length(unique(data$positions_df.Nanopolish_Score)) == 1 || all(is.na(unique(data$positions_df.Nanopolish_Score)))) {
- data$positions_df.Nanopolish_Score <- 0
+ data$positions_df.Nanopolish_Score <- 0
}
if (length(unique(data$positions_df.Tombo_Score)) == 1 || all(is.na(unique(data$positions_df.Tombo_Score)))) {
- data$positions_df.Tombo_Score <- 0
- }
-
+ data$positions_df.Tombo_Score <- 0
+ }
+
if (length(unique(data$positions_df.Nanocompore_Score)) == 1 || all(is.na(unique(data$positions_df.Nanocompore_Score)))) {
- data$positions_df.Nanocompore_Score <- 0
+ data$positions_df.Nanocompore_Score <- 0
}
-
+
#Calculate NanoConsensus score:
write(paste("Step 4: Calculating NanoConsensus scores with model: ", model_score, sep = ""), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
positions_df$Merged_Score <- calcNanoConsensusScore(data, model_score)
-
+
threshold <- Consensus_score*median(positions_df$Merged_Score, na.rm = TRUE)
print(threshold)
positions_NanoConsensus <- subset(positions_df, Merged_Score >= threshold)
@@ -715,22 +761,22 @@ bedgraph_tracks <- function (data, output_name, color, methods) {
if (!dir.exists("./Bedgraph_tracks")){
dir.create("Bedgraph_tracks", showWarnings = FALSE)
}
-
+
#Sliced dataset:
subset_data <- data.frame(data[,c(1,2,3,i)])
-
+
#From kmers to individual positions:
subset_data$Start <- subset_data$Start+1
subset_data$End <- subset_data$End-2
-
+
#Removing NAs - otherwise, track wont be loaded into IGV:
subset_data[is.na(subset_data)] <- 0
-
+
#Prepare the header:
header_track <- paste(" \'1i track type=bedGraph name=", methods[i-3]," autoScale=on visibility=full color=",color[i-3]," altColor=",color[i-3]," priority=20 graphType=bar\'", sep="")
-
+
#Generate bedgraph tracks:
- write.table(subset_data, file = paste("Bedgraph_tracks/", methods[i-3], "-", str_split_fixed(output_name,"_Raw_kmers.txt",2)[1],'.bedgraph', sep=''),
+ write.table(subset_data, file = paste("Bedgraph_tracks/", methods[i-3], "-", str_split_fixed(output_name,"_Raw_kmers.txt",2)[1],'.bedgraph', sep=''),
sep = '\t', row.names = FALSE, col.names = FALSE, quote = FALSE)
#Include the header to be able to load the track into IGV:
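A rough Python equivalent of the bedGraph writing done in this function (the track-line values mirror the header built in the R code; the data frame and file name are hypothetical), writing the IGV header up front instead of injecting it afterwards with sed:

import pandas as pd

def write_bedgraph(df, path, name, color="0,166,81"):
    """Write a bedGraph track with an IGV track line; NAs become 0 so IGV will load the file."""
    header = (f"track type=bedGraph name={name} autoScale=on visibility=full "
              f"color={color} altColor={color} priority=20 graphType=bar")
    with open(path, "w") as fh:
        fh.write(header + "\n")
        df.fillna(0).to_csv(fh, sep="\t", header=False, index=False)

track = pd.DataFrame({"Chr": ["chrI"] * 3, "Start": [10, 11, 12],
                      "End":  [11, 12, 13], "Score": [0.2, None, 0.9]})
write_bedgraph(track, "Epinano-sample.bedgraph", "Epinano")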
@@ -744,7 +790,7 @@ bed_tracks <- function (data, output_name, color, methods) {
if (!dir.exists("./Kmer_tracks")){
dir.create("Kmer_tracks", showWarnings = FALSE)
}
-
+
for (i in 1:length(data)){
bed_data <- data.frame(seqnames(data[[i]]),
start(data[[i]]),
@@ -752,88 +798,146 @@ bed_tracks <- function (data, output_name, color, methods) {
c(rep(".", length(data[[i]]))),
c(rep(0, length(data[[i]]))),
strand(data[[i]]), start(data[[i]]),end(data[[i]]), c(rep(color[i], length(data[[i]]))))
-
-
+
+
#Prepare the header:
header_track <- paste(" \'1i track name=", methods[i],"_kmers visibility=2 itemRgb=\"On\"\'", sep="")
-
+
#Generate bed tracks:
- write.table(bed_data, file = paste("Kmer_tracks/", methods[i], "-", output_name, "-kmers.bed", sep=''),
+ write.table(bed_data, file = paste("Kmer_tracks/", methods[i], "-", output_name, "-kmers.bed", sep=''),
sep = '\t', row.names = FALSE, col.names = FALSE, quote = FALSE)
-
+
#Include the header to be able to load the track into IGV:
command=paste("sed -i", header_track, paste(" ./Kmer_tracks/", methods[i], "-", output_name, "-kmers.bed", sep=''), sep="")
try(system(command))
-
+
}
}
-kmer_analysis <- function (all_ranges, fasta_file, output_name, tracks) {
+nearest_distance_mod <- function(all_ranges, annotation) {
+ distance <- c()
+ mods <- c()
+
+ #Loop through all the supported kmers:
+ for (i in 1:nrow(all_ranges)){
+
+ #Define variables to determine distance to nearest modified site:
+ initial <- all_ranges[i,2]
+ final <- all_ranges[i,3]
+ single_distance <- c()
+ single_mods <- c()
+ within <- FALSE
+
+ #Loop through all the annotated positions:
+ for (j in 1:nrow(annotation)){
+ annotated_position <- annotation[j,3]
+
+ #Annotated position within the modified kmer:
+ if (annotated_position<=final && annotated_position>=initial && within==TRUE){
+ single_distance <- c(single_distance, 0)
+ single_mods <- c(single_mods, paste(annotation[j,4],annotation[j,3], sep="-"))
+
+ } else if (annotated_position<=final && annotated_position>=initial) {
+ single_distance <- 0
+ single_mods <- paste(annotation[j,4],annotation[j,3], sep="-")
+ within <- TRUE
+
+ } else {
+ #Annotated position outside the modified kmer:
+ d <- min(abs(annotated_position-initial), abs(annotated_position-final))
+ if (d0) {
for (i in 1:nrow(sliced)){
features <- strsplit(as.character(sliced[[1]][i]), "[_]")
-
+
#Create GR objects:
if (length(features[[1]])==2){
chr <- features[[1]][1]
} else {
- elements <- c()
+ elements <- c()
for (i in 1:length(features[[1]])-1){
elements <- c(elements, features[[1]][i])
}
chr <- paste(elements, collapse="_")
-
+
}
grNew <- GRanges(seqnames=chr,ranges=IRanges(as.integer(features[[1]][length(features[[1]])])-2, end = as.integer(features[[1]][length(features[[1]])])+2))
-
+
if(is.null(grList)==TRUE){
grList <- grNew
} else {
grList <- reduce(c(grList,grNew))
}
-
+
}
-
+
assign(paste('gr',methods_name[j],sep=""), reduce(unique(grList)))
-
+
} else {
assign(paste('gr',methods_name[j],sep=""), unique(grList))
}
-
+
}
-
+
##Perform intersections:
#Check how many elements are in each GRange object and if it is null, create an empty one:
if (is.null(grEpinano)==TRUE){
@@ -842,32 +946,32 @@ analysis_significant_positions <- function (list_significant, list_plotting, fas
} else {
n1 <- length(grEpinano)
}
-
+
if (is.null(grNanopolish)==TRUE){
grNanopolish <- GRanges()
n2 <- 0
} else {
n2 <- length(grNanopolish)
}
-
+
if (is.null(grTombo)==TRUE){
grTombo <- GRanges()
n3 <- 0
} else {
n3 <- length(grTombo)
}
-
+
if (is.null(grNanocompore)==TRUE){
grNanocompore <- GRanges()
n4 <- 0
} else {
n4 <- length(grNanocompore)
}
-
+
##Generate bed files:
color_beds <- c("0,166,81", "102,45,145", "0,174,239","242,101,34")
bed_tracks(list(grEpinano, grNanopolish, grTombo, grNanocompore), output_name, color_beds, c('Epinano', 'Nanopolish', 'Tombo', 'Nanocompore'))
-
+
##Update log file:
write(paste('-Positions identified by Epinano:', n1, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
write(paste('-Positions identified by Nanopolish:', n2, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
@@ -879,54 +983,54 @@ analysis_significant_positions <- function (list_significant, list_plotting, fas
#Overlappings: checking which software has identified less significant positions and then it uses it as query
intersect_12 <- overlapping_GRobjects(grEpinano, grNanopolish, n1, n2)
length_intersect_12 <- extract_length_from_GRobjects(intersect_12)
-
+
intersect_13 <- overlapping_GRobjects(grEpinano, grTombo, n1, n3)
length_intersect_13 <- extract_length_from_GRobjects(intersect_13)
-
+
intersect_23 <- overlapping_GRobjects(grNanopolish, grTombo, n2, n3)
length_intersect_23 <- extract_length_from_GRobjects(intersect_23)
intersect_123 <- overlapping_GRobjects(intersect_12, grTombo, length_intersect_12, n3)
length_intersect_123 <- extract_length_from_GRobjects(intersect_123)
- #Venn Diagram:
+ #Venn Diagram:
write(paste('-Positions identified by Epinano-Nanopolish:', length_intersect_12, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
write(paste('-Positions identified by Epinano-Tombo:', length_intersect_13, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
write(paste('-Positions identified by Nanopolish-Tombo:', length_intersect_23, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
write(paste('-Positions identified by Epinano-Nanopolish-Tombo:', length_intersect_123, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
-
+
methods_name <- c('Epinano', 'Nanopolish', 'Tombo')
#draw_triple_venn_diagram(n1, n2, n3, length_intersect_12, length_intersect_13, length_intersect_23, length_intersect_123, methods_name, output_name)
-
- #Extract kmers supported by two or more softwares:
+
+ #Extract kmers supported by two or more softwares:
supported_kmers <- reduce(c(intersect_12,intersect_13,intersect_23,intersect_123))
-
+
} else if (n1 != 0 & n2 == 0 & n3 != 0 & n4 != 0 ) {
#Overlappings: checking which software has identified less significant positions and then it uses it as query
intersect_13 <- overlapping_GRobjects(grEpinano, grTombo, n1, n3)
length_intersect_13 <- extract_length_from_GRobjects(intersect_13)
-
+
intersect_14 <- overlapping_GRobjects(grEpinano, grNanocompore, n1, n4)
- length_intersect_14 <- extract_length_from_GRobjects(intersect_14)
-
+ length_intersect_14 <- extract_length_from_GRobjects(intersect_14)
+
intersect_34 <- overlapping_GRobjects(grTombo, grNanocompore, n3, n4)
- length_intersect_34 <- extract_length_from_GRobjects(intersect_34)
-
+ length_intersect_34 <- extract_length_from_GRobjects(intersect_34)
+
intersect_134 <- overlapping_GRobjects(intersect_13, grNanocompore, length_intersect_13, n4)
length_intersect_134 <- extract_length_from_GRobjects(intersect_134)
-
- #Venn Diagram:
+
+ #Venn Diagram:
write(paste('-Positions identified by Epinano-Tombo:', length_intersect_13, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
write(paste('-Positions identified by Epinano-Nanocompore:', length_intersect_14, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
write(paste('-Positions identified by Tombo-Nanocompore:', length_intersect_34, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
write(paste('-Positions identified by Epinano-Tombo-Nanocompore:', length_intersect_134, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
-
+
methods_name <- c('Epinano', 'Tombo', 'Nanocompore')
#draw_triple_venn_diagram(n1, n3, n4, length_intersect_13, length_intersect_14, length_intersect_34, length_intersect_134, methods_name, output_name)
-
- #Extract kmers supported by two or more softwares:
+
+ #Extract kmers supported by two or more softwares:
supported_kmers <- reduce(c(intersect_13,intersect_14,intersect_34,intersect_134))
-
+
} else if (n1 == 0 & n2 != 0 & n3 != 0 & n4 != 0 ) {
#Overlappings: checking which software has identified less significant positions and then it uses it as query
intersect_23 <- overlapping_GRobjects(grNanopolish, grTombo, n2, n3)
@@ -934,30 +1038,30 @@ analysis_significant_positions <- function (list_significant, list_plotting, fas
intersect_24 <- overlapping_GRobjects(grNanopolish, grNanocompore, n2, n4)
length_intersect_24 <- extract_length_from_GRobjects(intersect_24)
-
+
intersect_34 <- overlapping_GRobjects(grTombo, grNanocompore, n3, n4)
length_intersect_34 <- extract_length_from_GRobjects(intersect_34)
-
+
intersect_234 <- overlapping_GRobjects(intersect_23, grNanocompore, length_intersect_23, n4)
length_intersect_234 <- extract_length_from_GRobjects(intersect_234)
-
- #Venn Diagram:
+
+ #Venn Diagram:
write(paste('-Positions identified by Nanopolish-Tombo:', length_intersect_23, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
write(paste('-Positions identified by Nanopolish-Nanocompore:', length_intersect_24, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
write(paste('-Positions identified by Tombo-Nanocompore:', length_intersect_34, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
write(paste('-Positions identified by Nanopolish-Tombo-Nanocompore:', length_intersect_234, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
-
+
methods_name <- c('Nanopolish', 'Tombo', 'Nanocompore')
#draw_triple_venn_diagram(n2, n3, n4, length_intersect_23, length_intersect_24, length_intersect_34, length_intersect_234, methods_name, output_name)
-
- #Extract kmers supported by two or more softwares:
+
+ #Extract kmers supported by two or more softwares:
supported_kmers <- reduce(c(intersect_23,intersect_24,intersect_34,intersect_234))
-
+
} else if (n1 != 0 & n2 != 0 & n3 == 0 & n4 != 0 ) {
#Overlappings: checking which software has identified less significant positions and then it uses it as query
intersect_12 <- overlapping_GRobjects(grEpinano, grNanopolish, n1, n2)
length_intersect_12 <- extract_length_from_GRobjects(intersect_12)
-
+
intersect_14 <- overlapping_GRobjects(grEpinano, grNanocompore, n1, n4)
length_intersect_14 <- extract_length_from_GRobjects(intersect_14)
@@ -966,17 +1070,17 @@ analysis_significant_positions <- function (list_significant, list_plotting, fas
intersect_124 <- overlapping_GRobjects(intersect_12, grNanocompore, length_intersect_12, n4)
length_intersect_124 <- extract_length_from_GRobjects(intersect_124)
-
- #Venn Diagram:
+
+ #Venn Diagram:
write(paste('-Positions identified by Epinano-Nanopolish:', length_intersect_12, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
write(paste('-Positions identified by Epinano-Nanocompore:', length_intersect_14, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
write(paste('-Positions identified by Nanopolish-Nanocompore:', length_intersect_24, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
write(paste('-Positions identified by Epinano-Nanopolish-Nanocompore:', length_intersect_124, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
-
+
methods_name <- c('Epinano', 'Nanopolish', 'Nanocompore')
#draw_triple_venn_diagram(n1, n2, n4, length_intersect_12, length_intersect_14, length_intersect_24, length_intersect_124, methods_name, output_name)
-
- #Extract kmers supported by two or more softwares:
+
+ #Extract kmers supported by two or more softwares:
supported_kmers <- reduce(c(intersect_12,intersect_14,intersect_24,intersect_124))
} else if (n1 == 0 & n2 == 0 & n3 == 0 & n4 == 0 ) {
@@ -989,81 +1093,81 @@ analysis_significant_positions <- function (list_significant, list_plotting, fas
#Overlappings: checking which software has identified less significant positions and then it uses it as query
intersect_34 <- overlapping_GRobjects(grTombo, grNanocompore, n3, n4)
length_intersect_34 <- extract_length_from_GRobjects(intersect_34)
-
- #Venn Diagram:
+
+ #Venn Diagram:
write(paste('-Positions identified by Tombo-Nanocompore:', length_intersect_34, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
-
+
methods_name <- c('Tombo', 'Nanocompore')
#draw_pairwise_venn_diagram(n3, n4, length_intersect_34, methods_name, output_name)
-
- #Extract kmers supported by two or more softwares:
+
+ #Extract kmers supported by two or more softwares:
supported_kmers <- reduce(intersect_34)
-
+
} else if (n3 == 0 & n4 == 0) {
#Overlappings: checking which software has identified less significant positions and then it uses it as query
intersect_12 <- overlapping_GRobjects(grEpinano, grNanopolish, n1, n2)
length_intersect_12 <- extract_length_from_GRobjects(intersect_12)
-
- #Venn Diagram:
+
+ #Venn Diagram:
write(paste('-Positions identified by Epinano-Nanopolish:', length_intersect_12, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
-
+
methods_name <- c('Epinano', 'Nanopolish')
#draw_pairwise_venn_diagram(n1, n2, length_intersect_12, methods_name, output_name)
-
- #Extract kmers supported by two or more softwares:
+
+ #Extract kmers supported by two or more softwares:
supported_kmers <- reduce(intersect_12)
-
- } else if (n2 == 0 & n3 == 0) {
+
+ } else if (n2 == 0 & n3 == 0) {
#Overlappings: checking which software has identified less significant positions and then it uses it as query
intersect_14 <- overlapping_GRobjects(grEpinano, grNanocompore, n1, n4)
length_intersect_14 <- extract_length_from_GRobjects(intersect_14)
-
- #Venn Diagram:
+
+ #Venn Diagram:
write(paste('-Positions identified by Epinano-Nanocompore:', length_intersect_14, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
-
+
methods_name <- c('Epinano', 'Nanocompore')
#draw_pairwise_venn_diagram(n1, n4, length_intersect_14, methods_name, output_name)
-
- #Extract kmers supported by two or more softwares:
+
+ #Extract kmers supported by two or more softwares:
supported_kmers <- reduce(intersect_14)
-
- } else if (n1 == 0 & n3 == 0) {
+
+ } else if (n1 == 0 & n3 == 0) {
#Overlappings: checking which software has identified less significant positions and then it uses it as query
intersect_24 <- overlapping_GRobjects(grNanopolish, grNanocompore, n2, n4)
length_intersect_24 <- extract_length_from_GRobjects(intersect_24)
-
- #Venn Diagram:
+
+ #Venn Diagram:
write(paste('-Positions identified by Nanopolish-Nanocompore:', length_intersect_24, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
-
+
methods_name <- c('Nanopolish', 'Nanocompore')
#draw_pairwise_venn_diagram(n2, n4, length_intersect_24, methods_name, output_name)
-
- #Extract kmers supported by two or more softwares:
+
+ #Extract kmers supported by two or more softwares:
supported_kmers <- reduce(intersect_24)
-
+
} else {
#Overlappings: checking which software has identified less significant positions and then it uses it as query
intersect_12 <- overlapping_GRobjects(grEpinano, grNanopolish, n1, n2)
length_intersect_12 <- extract_length_from_GRobjects(intersect_12)
-
+
intersect_13 <- overlapping_GRobjects(grEpinano, grTombo, n1, n3)
length_intersect_13 <- extract_length_from_GRobjects(intersect_13)
-
+
intersect_14 <- overlapping_GRobjects(grEpinano, grNanocompore, n1, n4)
length_intersect_14 <- extract_length_from_GRobjects(intersect_14)
-
+
intersect_23 <- overlapping_GRobjects(grNanopolish, grTombo, n2, n3)
length_intersect_23 <- extract_length_from_GRobjects(intersect_23)
-
+
intersect_24 <- overlapping_GRobjects(grNanopolish, grNanocompore, n2, n4)
length_intersect_24 <- extract_length_from_GRobjects(intersect_24)
-
+
intersect_34 <- overlapping_GRobjects(grTombo, grNanocompore, n3, n4)
length_intersect_34 <- extract_length_from_GRobjects(intersect_34)
-
+
intersect_123 <- overlapping_GRobjects(intersect_12, grTombo, length_intersect_12, n3)
length_intersect_123 <- extract_length_from_GRobjects(intersect_123)
-
+
intersect_124 <- overlapping_GRobjects(intersect_12, grNanocompore, length_intersect_12, n4)
length_intersect_124 <- extract_length_from_GRobjects(intersect_124)
@@ -1072,11 +1176,11 @@ analysis_significant_positions <- function (list_significant, list_plotting, fas
intersect_234 <- overlapping_GRobjects(intersect_23, grNanocompore, length_intersect_23, n4)
length_intersect_234 <- extract_length_from_GRobjects(intersect_234)
-
+
intersect_1234 <- overlapping_GRobjects(intersect_12, intersect_34, length_intersect_12, length_intersect_34)
length_intersect_1234 <- extract_length_from_GRobjects(intersect_1234)
- #Venn Diagram:
+ #Venn Diagram:
write(paste('-Positions identified by Epinano-Nanopolish:', length_intersect_12, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
write(paste('-Positions identified by Epinano-Tombo:', length_intersect_13, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
write(paste('-Positions identified by Epinano-Nanocompore:', length_intersect_14, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
@@ -1088,52 +1192,52 @@ analysis_significant_positions <- function (list_significant, list_plotting, fas
write(paste('-Positions identified by Epinano-Tombo-Nanocompore:', length_intersect_134, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
write(paste('-Positions identified by Nanopolish-Tombo-Nanocompore:', length_intersect_123, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
write(paste('-Positions identified by Epinano-Nanopolish-Tombo-Nanocompore:', length_intersect_1234, sep = " "), file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
-
-
+
+
#draw_venn_diagram(n1, n2, n3, n4, length_intersect_12, length_intersect_13, length_intersect_14, length_intersect_23, length_intersect_24,
# length_intersect_34, length_intersect_123, length_intersect_124, length_intersect_134, length_intersect_234, length_intersect_1234, methods_name, output_name)
-
+
#Extract kmers supported by two or more softwares:
supported_kmers <- reduce(c(intersect_12,intersect_13,intersect_14,intersect_23,intersect_24,
intersect_34,intersect_123,intersect_124,intersect_134,intersect_234,intersect_1234))
-
+
}
-
- ##Kmer analysis:
+
+ ##Kmer analysis:
#Analysis of all kmers across the chromosome:
all_kmers_raw <- GRanges(seqnames = chr, ranges = IRanges(initial_position:(final_position-4), end = (initial_position+4):final_position))
all_kmers <- extracting_modified_ZScores(all_kmers_raw, list_plotting, MZS_thr, FALSE, Consensus_score, model_score)
- kmer_analysis(all_kmers[[1]], fasta_file, paste(output_name,'Raw_kmers.txt', sep='_'), TRUE)
-
+ kmer_analysis(all_kmers[[1]], fasta_file, paste(output_name,'Raw_kmers.txt', sep='_'), TRUE, annotation, FALSE)
+
#Analyse the supported kmers - only if they are present:
if (is.null(supported_kmers)==FALSE) {
filtered_supported_kmers <- overlapping_GRobjects(reduce(supported_kmers), GRanges(seqnames=all_kmers[[2]][,c('Chr')],ranges=IRanges(all_kmers[[2]][,c('Start')], end = all_kmers[[2]][,c('End')])),1,2)
-
+
if(extract_length_from_GRobjects(filtered_supported_kmers)!=0){
all_ranges <- extracting_modified_ZScores(filtered_supported_kmers, list_plotting, MZS_thr, TRUE, Consensus_score, model_score)
- kmer_analysis(all_ranges[[1]], fasta_file, paste(output_name,'Supported_kmers.txt', sep='_'), FALSE)
-
+ kmer_analysis(all_ranges[[1]], fasta_file, paste(output_name,'Supported_kmers.txt', sep='_'), FALSE, annotation, TRUE)
+
#Plot NanoConsensus score across transcripts:
- Nanoconsensus_plotting(all_kmers[[1]], all_ranges[[1]], output_name, barplot_4soft)
+ Nanoconsensus_plotting(all_kmers[[1]], all_ranges[[1]], output_name, barplot_4soft, initial_position, final_position, annotation, ablines)
write("Step 5: Plotting NanoConsensus scores across the transcript", file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
-
+
} else {
all_ranges <- data.frame()
#Plot NanoConsensus score across transcripts:
- Nanoconsensus_plotting(all_kmers[[1]], all_ranges, output_name, barplot_4soft)
-
+ Nanoconsensus_plotting(all_kmers[[1]], all_ranges, output_name, barplot_4soft, initial_position, final_position, annotation, ablines)
+
write("Step 5: Plotting NanoConsensus scores across the transcript", file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
-
+
}
-
+
} else {
-
+
all_ranges <- data.frame()
#Plot NanoConsensus score across transcripts:
- Nanoconsensus_plotting(all_kmers[[1]], all_ranges, output_name, barplot_4soft)
+ Nanoconsensus_plotting(all_kmers[[1]], all_ranges, output_name, barplot_4soft, initial_position, final_position, annotation, ablines)
write("Step 5: Plotting NanoConsensus scores across the transcript", file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
-
+
}
write("ANALYSIS COMPLETED SUCCESSFULLY", file = paste("NanoConsensus_", args$Output_name,".log", sep=""), append = T)
-
+
}
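The intersection logic in the script above uses GenomicRanges to find kmers called by at least two of the four tools and reduce() to collapse them. A simplified, library-free Python sketch of that consensus step, using made-up 1-based inclusive intervals:

from itertools import combinations

def overlaps(a, b):
    """True if two closed intervals (start, end) overlap."""
    return a[0] <= b[1] and b[0] <= a[1]

def reduce_intervals(intervals):
    """Collapse overlapping or adjacent (start, end) intervals, GenomicRanges reduce()-style."""
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1] + 1:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

# made-up significant 5-mers per tool
tools = {"Epinano": [(100, 104)], "Nanopolish": [(102, 106)],
         "Tombo": [(300, 304)], "Nanocompore": [(101, 105), (300, 304)]}

supported = []                                   # kmers called by two or more tools
for t1, t2 in combinations(tools, 2):
    supported += [(min(a[0], b[0]), max(a[1], b[1]))
                  for a in tools[t1] for b in tools[t2] if overlaps(a, b)]
print(reduce_intervals(supported))               # [(100, 106), (300, 304)]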
diff --git a/mop_consensus/comparison.tsv b/mop_consensus/comparison.tsv
index a66dcff..573bb64 100644
--- a/mop_consensus/comparison.tsv
+++ b/mop_consensus/comparison.tsv
@@ -1 +1,6 @@
-mod wt
+pU---bc_4 pU---bc_1
+pU---bc_4 pU---bc_2
+pU---bc_4 pU---bc_3
+Nm---bc_1 Nm---bc_2
+Nm---bc_1 Nm---bc_3
+Nm---bc_1 Nm---bc_4
diff --git a/mop_consensus/mop_consensus.nf b/mop_consensus/mop_consensus.nf
index 563e4e2..9a5cd33 100755
--- a/mop_consensus/mop_consensus.nf
+++ b/mop_consensus/mop_consensus.nf
@@ -2,7 +2,7 @@
nextflow.enable.dsl=2
-/*
+/*
* Define the pipeline parameters
*
*/
@@ -18,7 +18,7 @@ log.info """
╔╦╗╔═╗╔═╗ ╔═╗╔═╗╔╗╔╔═╗╔═╗╔╗╔╔═╗╦ ╦╔═╗
║║║║ ║╠═╝ ║ ║ ║║║║╚═╗║╣ ║║║╚═╗║ ║╚═╗
╩ ╩╚═╝╩ ╚═╝╚═╝╝╚╝╚═╝╚═╝╝╚╝╚═╝╚═╝╚═╝
-
+
====================================================
BIOCORE@CRG Master of Pores 2. Get consensus modifications - N F ~ version ${version}
====================================================
@@ -28,6 +28,7 @@ input_path : ${params.input_path}
output : ${params.output}
comparison : ${params.comparison}
padsize : ${params.padsize}
+extraparams : ${params.extraparams}
******* reference has to be the genome **********
reference : ${params.reference}
@@ -51,7 +52,7 @@ reference = file(params.reference)
if( !reference.exists() ) exit 1, "Missing reference file: ${reference}!"
-include { checkRef; mapIDPairs; indexFasta } from "${local_modules}"
+include { checkRef; mapIDPairs; indexFasta } from "${local_modules}"
include { nanoConsensus } from "${local_modules}" addParams(OUTPUT: params.output)
@@ -76,7 +77,7 @@ Channel
def padsize = (params.padsize == 0 ? 1 : params.padsize)
-workflow {
+workflow {
comparisons.flatten().unique().set{unique_samples}
unique_samples.map {
@@ -86,7 +87,7 @@ workflow {
unique_samples.map {
[it, file("${params.input_path}/nanopolish-compore_flow/${it}.csv.gz")]
}.transpose().set{nanopolish}
-
+
comparisons.map{
["${it[0]}---${it[1]}", file("${params.input_path}/tombo_flow/${it[0]}---${it[1]}_lsc.plus_Tombo_Output.tsv.gz")]
}.transpose().set{tombo}
@@ -113,10 +114,10 @@ workflow {
[ vals[0], chrStart, chrEnd]
}
}.set{transcript_coords}
-
+
data_to_process = epinano_combs.join(nanopolish_combs).join(tombo).join(nanocomp)
//data_to_process.view()
- nanoConsensus(nanoConScript, nanoScript, ref_file, data_to_process.combine(transcript_coords))
+ nanoConsensus(nanoConScript, nanoScript, ref_file, "${params.extraparams}", data_to_process.combine(transcript_coords))
}
@@ -125,7 +126,7 @@ workflow {
*/
workflow.onComplete {
println "Pipeline BIOCORE@CRG Master of Pore completed!"
- println "Started at $workflow.start"
+ println "Started at $workflow.start"
println "Finished at $workflow.complete"
println "Time elapsed: $workflow.duration"
println "Execution status: ${ workflow.success ? 'OK' : 'failed' }"
@@ -135,7 +136,7 @@ workflow.onComplete {
* Mail notification
*/
-if (params.email == "yourmail@yourdomain" || params.email == "") {
+if (params.email == "yourmail@yourdomain" || params.email == "") {
log.info 'Skipping the email\n'
}
else {
diff --git a/mop_consensus/nextflow.config b/mop_consensus/nextflow.config
index 421a6c0..44178f7 100644
--- a/mop_consensus/nextflow.config
+++ b/mop_consensus/nextflow.config
@@ -1,5 +1,2 @@
-includeConfig "$baseDir/params.config"
includeConfig "../nextflow.global.config"
singularity.cacheDir = "$baseDir/../singularity"
-
-
diff --git a/mop_consensus/params.config b/mop_consensus/params.config
deleted file mode 100755
index d481810..0000000
--- a/mop_consensus/params.config
+++ /dev/null
@@ -1,11 +0,0 @@
-params {
-
- input_path = "$baseDir/../mop_mod/output_mod"
- reference = "$baseDir/../anno/yeast_rRNA_ref.fa.gz"
-
- comparison = "$baseDir/comparison.tsv"
- padsize = 50
- output = "$baseDir/output"
-
- email = "yourname@yourdomain"
-}
diff --git a/mop_consensus/params.yaml b/mop_consensus/params.yaml
new file mode 100755
index 0000000..583f555
--- /dev/null
+++ b/mop_consensus/params.yaml
@@ -0,0 +1,9 @@
+input_path: "${projectDir}/../mop_mod/output_mod"
+reference: "/path/to/reference.fa"
+
+comparison: "comparison.tsv"
+padsize: 50
+output: "${projectDir}/output"
+
+extraparams: ""
+email: ""
diff --git a/mop_mod/bin/Merge_Tombo_v2.py b/mop_mod/bin/Merge_Tombo_v2.py
index 25e235e..af5194d 100755
--- a/mop_mod/bin/Merge_Tombo_v2.py
+++ b/mop_mod/bin/Merge_Tombo_v2.py
@@ -4,12 +4,11 @@
#Import libraries:
import sys
-
import pyBigWig
#Import input files:
statistic = pyBigWig.open(sys.argv[1])
-cov_ivt = pyBigWig.open(sys.argv[2])
+cov_ivt = pyBigWig.open(sys.argv[2])
cov_sample = pyBigWig.open(sys.argv[3])
output_name = sys.argv[4]
@@ -26,7 +25,7 @@
stat = statistic.values(transcript, position-1, position)[0]
ivt = cov_ivt.values(transcript, position-1, position)[0]
samples = cov_sample.values(transcript, position-1, position)[0]
-
+
#Calculate the kmer score:
try:
kmer_score = sum(statistic.values(transcript, position-3, position+2))
@@ -37,4 +36,4 @@
row = [ref_pos, transcript, position, stat, samples, ivt, kmer_score]
print('\t'.join([str(x) for x in row]), file=f)
-f.close()
\ No newline at end of file
+f.close()
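The script above pulls a per-base statistic and coverages out of bigWig files and scores each site by summing the statistic over the surrounding 5-mer. A short pyBigWig sketch of that windowed lookup (file name and coordinates are made up; the try/except mirrors the script's handling of windows that run off the contig):

import pyBigWig

bw = pyBigWig.open("tombo_statistic.bw")     # made-up per-base statistic track
transcript, position = "chrI", 50            # 1-based position of interest

# single-base value (pyBigWig intervals are 0-based, half-open)
stat = bw.values(transcript, position - 1, position)[0]

# 5-mer score: sum of the statistic over the two bases either side of the site
try:
    kmer_score = sum(bw.values(transcript, position - 3, position + 2))
except RuntimeError:                         # window falls outside the contig bounds
    kmer_score = float("nan")

bw.close()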
diff --git a/mop_mod/bin/Merge_Tombo_v3.py b/mop_mod/bin/Merge_Tombo_v3.py
index 240335b..ef4cb6c 100755
--- a/mop_mod/bin/Merge_Tombo_v3.py
+++ b/mop_mod/bin/Merge_Tombo_v3.py
@@ -4,12 +4,11 @@
#Import libraries:
import sys
-
import pyBigWig
#Import input files:
statistic = pyBigWig.open(sys.argv[1])
-cov_ivt = pyBigWig.open(sys.argv[2])
+cov_ivt = pyBigWig.open(sys.argv[2])
cov_sample = pyBigWig.open(sys.argv[3])
output_name = sys.argv[4]
@@ -26,7 +25,7 @@
stat = statistic.values(transcript, position-1, position)[0]
ivt = cov_ivt.values(transcript, position-1, position)[0]
samples = cov_sample.values(transcript, position-1, position)[0]
-
+
#Calculate the kmer score:
try:
kmer_score = sum(statistic.values(transcript, position-3, position+2))
diff --git a/mop_mod/bin/Merge_Tombo_wigs_MoP_v2.R b/mop_mod/bin/Merge_Tombo_wigs_MoP_v2.R
index 92b1156..3a4a65d 100755
--- a/mop_mod/bin/Merge_Tombo_wigs_MoP_v2.R
+++ b/mop_mod/bin/Merge_Tombo_wigs_MoP_v2.R
@@ -1,4 +1,4 @@
-###Script to process Tombo data:
+###Script to process Tombo data:
##Import required libraries:
library('plyr')
@@ -17,137 +17,136 @@ parser$add_argument("-output", "--Output_name", type="character", help="Output(s
#Get command line options, if help option encountered - print help and exit:
args <- parser$parse_args()
-##Import data:
+##Import data:
statistic_wig <- read.delim(args$Statistic_wig, sep=" ", stringsAsFactors = FALSE)
coverage_WT <- read.delim(args$Coverage_WT_bedgraph, sep=" ", stringsAsFactors = FALSE)
coverage_control <- read.delim(args$Coverage_control_bedgraph, sep=" ", stringsAsFactors = FALSE)
output_name <- args$Output_name
if (nrow(statistic_wig)>1 && nrow(coverage_control)>1 && nrow(coverage_WT)>1){
-
+
##Parse coverage data:
coverage_samples <- list(coverage_WT, coverage_control)
coverage_samples_name <- c('Sample', 'IVT')
-
+
for (j in 1:length(coverage_samples)){
#Changing colnames:
colnames(coverage_samples[[j]]) <- c(paste('Coverage', coverage_samples_name[j], sep='_'), 'Chr', 'Position', 'Include')
-
- #Adding chromosome information per position:
+
+ #Adding chromosome information per position:
for (row in 1:nrow(coverage_samples[[j]])){
element <- coverage_samples[[j]][[row,1]]
-
- #If it is the start of a new chromosome, we need to capture its name and starting position:
+
+ #If it is the start of a new chromosome, we need to capture its name and starting position:
if (element == 'fixedStep'){
chrom <- sub(".*=", "", coverage_samples[[j]][row,2])
start_position <- sub(".*=", "", coverage_samples[[j]][row,3])
coverage_samples[[j]][[row,4]] <- FALSE
initial_position <- TRUE
-
+
} else {
if (initial_position==TRUE){
coverage_samples[[j]][[row,2]] <- chrom
coverage_samples[[j]][[row,3]] <- start_position
coverage_samples[[j]][[row,4]] <- TRUE
-
+
initial_position <- FALSE
i <- as.numeric(start_position)
-
+
} else {
i <- i + 1
-
+
#Update data:
coverage_samples[[j]][[row,2]] <- chrom
coverage_samples[[j]][[row,3]] <- i
coverage_samples[[j]][[row,4]] <- TRUE
}
-
+
}
}
-
+
data_filtered <- subset(coverage_samples[[j]][-c(5)], Include==TRUE)
data_filtered$Ref_Position <- paste(data_filtered$Chr, data_filtered$Position, sep = "_")
assign(paste(coverage_samples_name[j],'filtered', sep='_'), data_filtered[c(5,1)])
-
+
}
-
- #Merge both coverage tables:
+
+ #Merge both coverage tables:
final_coverage <- join(Sample_filtered, IVT_filtered, by='Ref_Position')[-c(4)]
-
-
+
+
  ##Parse statistic data:
#Changing colnames:
colnames(statistic_wig) <- c('Position', 'statistic', 'Chr', 'Include')
-
- #Adding chromosome information per position:
+
+ #Adding chromosome information per position:
for (row in 1:nrow(statistic_wig)){
element <- statistic_wig[[row,1]]
-
- #If it is the start of a new chromosome, we need to capture its name and starting position:
+
+ #If it is the start of a new chromosome, we need to capture its name and starting position:
if (element == 'fixedStep' || element == 'variableStep'){
chrom <- sub(".*=", "", statistic_wig[row,2])
statistic_wig[[row,4]] <- FALSE
-
+
} else {
-
+
#Update data:
statistic_wig[[row,3]] <- chrom
statistic_wig[[row,4]] <- TRUE
-
+
}
}
-
+
stat_filtered <- subset(statistic_wig, Include==TRUE)
stat_filtered$Ref_Position <- paste(stat_filtered$Chr, stat_filtered$Position, sep = "_")
statistic_filtered <- stat_filtered[c(5,3,1,2)]
-
-
- ##Merging p-value and coverage data:
+
+
+ ##Merging p-value and coverage data:
tombo_output <- join(statistic_filtered, final_coverage, by='Ref_Position')
-
+
##Mean p-value per sliding window (5-mer):
#Extract all chromosomes:
unique_chr <- unique(tombo_output$Chr)
-
+
#Loop over all positions for every chromosome:
sum_pvalue_kmer <- c()
for (single_chr in unique_chr){
data_analysis <- subset(tombo_output[c(2,3,4)], Chr==single_chr)
-
+
#Looping over the rows while checking that positions are consecutive:
for (row in 1:nrow(data_analysis)){
if (row<3){
sum_pvalue_kmer <- c(sum_pvalue_kmer, NA)
-
+
} else {
- #Extract 5kmer positions:
+ #Extract 5kmer positions:
kmer <- as.integer(c(data_analysis[(row-2),2], data_analysis[(row-1),2], data_analysis[row,2],
data_analysis[(row+1),2], data_analysis[(row+2),2]))
-
+
test_consecutive <- rle(diff(kmer))
are_consecutive <- all(test_consecutive$values==1)
-
+
#If consecutive, calculate mean:
if (test_consecutive$lengths==4 && are_consecutive==TRUE){
pvalues <- as.numeric(c(data_analysis[(row-2),3], data_analysis[(row-1),3], data_analysis[row,3],
data_analysis[(row+1),3], data_analysis[(row+2),3]))
-
+
sum_pvalue_kmer <- c(sum_pvalue_kmer, sum(pvalues))
} else {
sum_pvalue_kmer <- c(sum_pvalue_kmer, NA)
}
-
+
}
-
+
}
-
+
}
tombo_output$statistic_kmer <- sum_pvalue_kmer
tombo_output_filtered <- subset(tombo_output, statistic>=0)
colnames(tombo_output_filtered) <- c('Ref_Position', 'Chr', 'Position', 'Tombo_SiteScore', 'Coverage_Sample', 'Coverage_IVT',
'Tombo_KmerScore')
-
+
#Output results table:
write.table(tombo_output_filtered, file = paste(output_name, "_Tombo_Output.tsv", sep = ""), sep = "\t", row.names=FALSE)
}
-
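The R script above sums the Tombo statistic over centred 5-mers, but only when the five positions are truly consecutive; otherwise the kmer score is left as NA. A small Python sketch of that sliding-window rule with made-up positions (position 15 is absent, so windows spanning the gap stay NaN):

import math

def kmer_sums(positions, values):
    """Sum values over centred 5-mers; NaN when the window is incomplete or not consecutive."""
    out = []
    for i in range(len(positions)):
        if i < 2 or i > len(positions) - 3:
            out.append(math.nan)
            continue
        window = positions[i - 2:i + 3]
        if all(b - a == 1 for a, b in zip(window, window[1:])):
            out.append(sum(values[i - 2:i + 3]))
        else:
            out.append(math.nan)
    return out

pos = [10, 11, 12, 13, 14, 16, 17]            # position 15 is missing
val = [0.1, 0.2, 0.3, 0.1, 0.4, 0.2, 0.1]
print(kmer_sums(pos, val))                     # only the window centred on 12 gets a score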
diff --git a/mop_mod/bin/Merging_processed_nanopolish_data_v1.py b/mop_mod/bin/Merging_processed_nanopolish_data_v1.py
index 108ba3d..80a633f 100755
--- a/mop_mod/bin/Merging_processed_nanopolish_data_v1.py
+++ b/mop_mod/bin/Merging_processed_nanopolish_data_v1.py
@@ -4,14 +4,14 @@
# Import required libraries:
import argparse
import csv
-import gzip
import statistics
+import gzip
def process_dicts(files):
intensity = dict()
coverage = dict()
-
+
for file in files:
#Open and read input file:
with gzip.open(file, mode='rt') as csv_file:
@@ -39,23 +39,23 @@ def process_dicts(files):
#Close input file:
#csv_file.close()
-
+
#Data processing of data stored in both dictionaries:
for key in intensity:
intensity[key] = statistics.median(intensity[key])
coverage[key] = sum(coverage[key])
-
+
return intensity,coverage
def generate_output(output_file, intensity, coverage):
#Create output file:
out_file = output_file+'.tsv'
f = open(out_file, 'w')
-
+
#Print header:
header=['contig','position','reference_kmer','read_name','median','coverage']
print('\t'.join(header),file=f)
-
+
#Print data stored in both dictionaries:
for key in intensity:
splitted_key = key.split(",")
@@ -73,9 +73,9 @@ def main():
#Read, parse and merge data from individual eventalign files:
intensity, coverage = process_dicts(a.input)
-
+
#Generate output file with processed data:
generate_output(a.output, intensity, coverage)
-if __name__=='__main__':
+if __name__=='__main__':
main()
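The merging script above accumulates per-(contig, position, kmer, read) signal values across eventalign batches and then collapses them to a median intensity and a summed coverage. A compact sketch of that accumulate-then-reduce pattern on made-up rows:

import statistics
from collections import defaultdict

# made-up eventalign-derived rows: (contig, position, kmer, read, mean, coverage)
rows = [
    ("chrI", 100, "ACGTA", "read1", 83.2, 1),
    ("chrI", 100, "ACGTA", "read1", 85.0, 1),   # same read seen in another batch
    ("chrI", 100, "ACGTA", "read2", 79.9, 1),
]

intensity = defaultdict(list)
coverage = defaultdict(list)
for contig, pos, kmer, read, mean, cov in rows:
    key = (contig, pos, kmer, read)
    intensity[key].append(mean)
    coverage[key].append(cov)

# collapse: median intensity and summed coverage per key, as in process_dicts()
summary = {k: (statistics.median(v), sum(coverage[k])) for k, v in intensity.items()}
print(summary)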
diff --git a/mop_mod/bin/Merging_processed_nanopolish_data_v2.py b/mop_mod/bin/Merging_processed_nanopolish_data_v2.py
index f7d190d..1cf52d9 100755
--- a/mop_mod/bin/Merging_processed_nanopolish_data_v2.py
+++ b/mop_mod/bin/Merging_processed_nanopolish_data_v2.py
@@ -5,27 +5,25 @@
import argparse
import csv
import statistics
-
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow import csv
-
def process_dicts(files):
intensity = dict()
coverage = dict()
-
+
for file in files:
#Open and read input file:
test = pq.read_table(file).to_pandas()
#test = pq.write_table(csv_reader, '2hpf-Rep1-Batch1.parquete')
-
+
#Update both dictionaries:
for i in zip(test['contig'],test['position'],test['reference_kmer'],test['read_name'],test['median'], test['coverage']):
-
+
key = ','.join(map(str, i[0:4]))
-
+
#Update intensity dict:
if key in intensity:
intensity[key].append(float(i[4]))
@@ -40,30 +38,30 @@ def process_dicts(files):
#Close input file:
#csv_file.close()
-
+
#Data processing of data stored in both dictionaries:
for key in intensity:
intensity[key] = statistics.median(intensity[key])
coverage[key] = sum(coverage[key])
-
+
return intensity,coverage
def generate_output(output_file, intensity, coverage):
#Create output file:
out_file = output_file+'.tsv'
f = open(out_file, 'w')
-
+
#Print header:
header=['contig','position','reference_kmer','read_name','median','coverage']
print('\t'.join(header),file=f)
-
+
#Print data stored in both dictionaries:
for key in intensity:
splitted_key = key.split(",")
print('\t'.join(splitted_key), "{:.3f}".format(intensity.get(key)), coverage.get(key), file=f, sep='\t')
-
+
f.close()
-
+
def main():
parser = argparse.ArgumentParser(description=desc)
@@ -71,12 +69,12 @@ def main():
parser.add_argument('-o', '--output', help='Output filename')
a = parser.parse_args()
-
+
#Read, parse and merge data from individual eventalign files:
intensity, coverage = process_dicts(a.input)
-
+
#Generate output file with processed data:
generate_output(a.output, intensity, coverage)
-if __name__=='__main__':
- main()
\ No newline at end of file
+if __name__=='__main__':
+ main()
diff --git a/mop_mod/bin/Merging_processed_nanopolish_data_v4.py b/mop_mod/bin/Merging_processed_nanopolish_data_v4.py
index d91d39b..d5a1eeb 100755
--- a/mop_mod/bin/Merging_processed_nanopolish_data_v4.py
+++ b/mop_mod/bin/Merging_processed_nanopolish_data_v4.py
@@ -4,36 +4,34 @@
# Import required libraries:
import argparse
import statistics
-
import pandas as pd
import pyarrow as pa
+import pyarrow.parquet as pq
import pyarrow.compute as pc
import pyarrow.dataset as ds
-import pyarrow.parquet as pq
from pyarrow import fs
-
def generate_output(output_file, data, initial):
#Create output file:
out_file = output_file+'.tsv.gz'
- if initial:
+ if initial:
data.to_csv(out_file, sep = '\t', index = False, compression = "gzip")
-
+
else:
data.to_csv(out_file, sep = '\t', mode='a', index = False, header = False, compression = "gzip")
def process_dicts(files, output_file):
list_tables = list()
-
+
    #Concatenate parquet files:
for file in files:
#Open and read input file:
new_table = pq.read_table(file, columns=['contig','position', 'reference_kmer', 'median', 'coverage'])
list_tables.append(new_table)
- concat_table = pa.concat_tables(list_tables)
-
+ concat_table = pa.concat_tables(list_tables)
+
#Create partitioned data:
pq.write_to_dataset(concat_table, root_path='./dataset_name', partition_cols=['contig'])
@@ -56,10 +54,10 @@ def process_dicts(files, output_file):
#print(table)
#table = dataset.to_table(filter=ds.field("contig") == single_transcript).to_pandas().groupby(['contig', 'position','reference_kmer'], observed=True).agg({'median':'median', 'coverage':'sum'})
#mask = pc.equal(concat_table['contig'], single_transcript)
- #table = concat_table.filter(mask).to_pandas().groupby(['contig', 'position','reference_kmer'], observed=True).agg({'median':'median', 'coverage':'sum'})
+ #table = concat_table.filter(mask).to_pandas().groupby(['contig', 'position','reference_kmer'], observed=True).agg({'median':'median', 'coverage':'sum'})
table.columns = ['median', 'coverage']
table = table.reset_index()
-
+
#Generate output:
generate_output(output_file, table, initial)
initial = False
@@ -71,10 +69,10 @@ def main():
parser.add_argument('-o', '--output', help='Output filename')
a = parser.parse_args()
-
+
#Read, parse and merge data from individual eventalign files:
process_dicts(a.input, a.output)
os.system("rm -fr dataset_name")
-
-if __name__=='__main__':
+
+if __name__=='__main__':
main()
diff --git a/mop_mod/bin/Merging_processed_nanopolish_data_v5.py b/mop_mod/bin/Merging_processed_nanopolish_data_v5.py
index cad8e96..0320435 100755
--- a/mop_mod/bin/Merging_processed_nanopolish_data_v5.py
+++ b/mop_mod/bin/Merging_processed_nanopolish_data_v5.py
@@ -3,38 +3,36 @@
# Import required libraries:
import argparse
-import os
import statistics
-
+import os
import pandas as pd
import pyarrow as pa
+import pyarrow.parquet as pq
import pyarrow.compute as pc
import pyarrow.dataset as ds
-import pyarrow.parquet as pq
from pyarrow import fs
-
def generate_output(output_file, data, initial):
#Create output file:
out_file = output_file+'.tsv.gz'
- if initial:
+ if initial:
data.to_csv(out_file, sep = '\t', index = False, compression = "gzip")
-
+
else:
data.to_csv(out_file, sep = '\t', mode='a', index = False, header = False, compression = "gzip")
def process_dicts(files, output_file):
list_tables = list()
-
+
    #Concatenate parquet files:
for file in files:
#Open and read input file:
new_table = pq.read_table(file, columns=['contig','position', 'reference_kmer', 'median', 'coverage'])
#list_tables.append(new_table)
- #concat_table = pa.concat_tables(list_tables)
-
+ #concat_table = pa.concat_tables(list_tables)
+
#Create partitioned data:
pq.write_to_dataset(new_table, root_path='dataset_name', partition_cols=['contig'])
@@ -48,10 +46,10 @@ def process_dicts(files, output_file):
for single_transcript in transcripts:
table = dataset.scanner(filter=ds.field("contig") == single_transcript).to_table().to_pandas(use_threads=True).groupby(['contig', 'position','reference_kmer'], observed=True).agg({'median':'median', 'coverage':'sum'})
- #table = concat_table.filter(mask).to_pandas().groupby(['contig', 'position','reference_kmer'], observed=True).agg({'median':'median', 'coverage':'sum'})
+ #table = concat_table.filter(mask).to_pandas().groupby(['contig', 'position','reference_kmer'], observed=True).agg({'median':'median', 'coverage':'sum'})
table.columns = ['median', 'coverage']
table = table.reset_index()
-
+
#Generate output:
generate_output(output_file, table, initial)
initial = False
@@ -63,9 +61,9 @@ def main():
parser.add_argument('-o', '--output', help='Output filename')
a = parser.parse_args()
-
+
#Read, parse and merge data from individual eventalign files:
process_dicts(a.input, a.output)
-
-if __name__=='__main__':
+
+if __name__=='__main__':
main()
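The v4/v5 scripts avoid holding the whole eventalign table in memory by writing the per-batch parquet files into a dataset partitioned by contig and then aggregating one contig at a time. A sketch of that partition-then-aggregate pattern with pyarrow (table contents, paths and the contig list are made up):

import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

table = pa.table({                             # made-up per-read rows
    "contig": ["chrI", "chrI", "chrII"],
    "position": [100, 100, 7],
    "reference_kmer": ["ACGTA", "ACGTA", "TTTTA"],
    "median": [83.2, 80.0, 91.5],
    "coverage": [1, 1, 1],
})

# 1) spill the rows to disk, partitioned by contig, so each contig can be read back alone
pq.write_to_dataset(table, root_path="dataset_name", partition_cols=["contig"])

# 2) aggregate one contig at a time to keep memory bounded
dataset = ds.dataset("dataset_name", format="parquet", partitioning="hive")
for i, contig in enumerate(("chrI", "chrII")):   # in the script this list comes from the data
    per_site = (dataset.scanner(filter=ds.field("contig") == contig)
                .to_table().to_pandas()
                .groupby(["contig", "position", "reference_kmer"], observed=True)
                .agg({"median": "median", "coverage": "sum"})
                .reset_index())
    per_site.to_csv("merged.tsv.gz", sep="\t", mode="a", index=False,
                    header=(i == 0), compression="gzip")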
diff --git a/mop_mod/bin/bedgraph2wig.pl b/mop_mod/bin/bedgraph2wig.pl
index b8dc555..dd4935d 100755
--- a/mop_mod/bin/bedgraph2wig.pl
+++ b/mop_mod/bin/bedgraph2wig.pl
@@ -2,7 +2,7 @@
# Description: This script converts bedGraph to fixedStep wig format with defined step size. Input file may be compressed as .gz.
# Coordinates in bedGraph input are assumed to be 0-based (http://genome.ucsc.edu/goldenPath/help/bedgraph.html).
-# Coordinates in wig output are 1-based (http://genome.ucsc.edu/goldenPath/help/wiggle.html).
+# Coordinates in wig output are 1-based (http://genome.ucsc.edu/goldenPath/help/wiggle.html).
# Usage: bedgraph_to_wig.pl --bedgraph input.bedgraph --wig output.wig --step step_size [--compact]
# --bedgraph : specify input file in bedGraph format.
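As the header above notes, bedGraph coordinates are 0-based and half-open while wig coordinates are 1-based, so the conversion shifts the start by one. A tiny worked example in Python using the script's own sample record, with step size 1 for simplicity:

# bedGraph record "chr1 3000400 3000500 2" (0-based, half-open) covers the
# 1-based positions 3000401..3000500, so the fixedStep block starts at 3000401
chrom, start0, end0, value = "chr1", 3000400, 3000500, 2
print(f"fixedStep chrom={chrom} start={start0 + 1} step=1")
for _ in range(end0 - start0):    # one value per covered base
    print(value)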
@@ -69,10 +69,10 @@
next if (/^track/);
next if (/^#/);
- # Parse relevant information in current line
+ # Parse relevant information in current line
# e.g: chr1 3000400 3000500 2
my ($chr, $start, $end, $val) = split(/\t/);
-
+
# Print header for new chromosome and initialize variables.
if ($chr ne $cur_chr) {
$cur_chr = $chr;
@@ -84,12 +84,12 @@
# +1 was added to convert from 0-based to 1-based coordinates.
}
}
-
+
# Print values when gap in bedGraph file is greater than step.
while ($start >= $next_pos) {
print_wig_line($cur_chr, \$cur_pos, \$next_pos, \$exp_pos, \$cur_val, $chr, $start, $end, $val, $step);
}
-
+
# Print values when step overlaps with bedGraph interval and bedGraph interval is longer than step.
while ($end >= $next_pos) {
$cur_val += $val * ($next_pos - max($cur_pos, $start));
diff --git a/mop_mod/bin/epinano_paired.py b/mop_mod/bin/epinano_paired.py
index fec9407..b8c5cb0 100755
--- a/mop_mod/bin/epinano_paired.py
+++ b/mop_mod/bin/epinano_paired.py
@@ -1,13 +1,11 @@
#!/usr/bin/env python
# -- coding: utf-8 -
-import argparse
-import gzip
-import re
+import argparse, re
+import numpy as np
from collections import Counter as cnt
from collections import defaultdict
-
-import numpy as np
+import gzip
parser = argparse.ArgumentParser()
parser.add_argument ('-k','--knockout', required=True, dest='kos', action='append', help='knockout sample epinano prediction results')
diff --git a/mop_mod/bin/epinano_scatterplot.R b/mop_mod/bin/epinano_scatterplot.R
index 48001a3..56c9a0f 100755
--- a/mop_mod/bin/epinano_scatterplot.R
+++ b/mop_mod/bin/epinano_scatterplot.R
@@ -1,9 +1,9 @@
-#Scatter plots
+#Scatter plots
#Rscript epinano_scatterplot.R input1 label1 input2 label2 feature
#Libraries needed
library(plyr)
library(ggplot2)
-library(ggrepel)
+library(ggrepel)
library(MASS)
library(reshape2)
@@ -23,7 +23,7 @@ label2 <- gsub("-", "_", paste0("X", as.character(args[4]))) #2nd label
feature <- as.character(args[5]) #Feature
-#Cleanup
+#Cleanup
cleanup <- function(input, label) {
#Filter low coverage reads
input$cov <- as.numeric(input$cov)
@@ -39,10 +39,10 @@ cleanup <- function(input, label) {
input$del <- as.numeric(input$del)
#Add summed errors column
- input$sum <- input$mis + input$del + input$ins
- #Add a column with position
+ input$sum <- input$mis + input$del + input$ins
+ #Add a column with position
input$position<- paste(input$X.Ref,input$pos)
- #Change column names
+ #Change column names
input <- input[, c("X.Ref","pos","position", "base", feature)]
colnames(input)<- c("Chr","Position","chr_pos","base",feature )
data_melted<- melt(data = input, id.vars = c("Chr", "Position", "chr_pos", "base"))
@@ -60,14 +60,14 @@ merged$Chr <- NULL
merged$Position <- NULL
merged$base <- NULL
merged$variable <- NULL
-
+
plot<- function(data)
for (chr in unique(data$Chr)) {
subs <- subset(data, Chr==chr)
if(nrow(subs)>0){
- res<- rlm(subs[,c(paste(label1, "value", sep="_"))] ~ subs[,c(paste(label2, "value", sep="_"))]) #linear model
- res_vec <- res$residuals#this contains residuals
+ res<- rlm(subs[,c(paste(label1, "value", sep="_"))] ~ subs[,c(paste(label2, "value", sep="_"))]) #linear model
+ res_vec <- res$residuals#this contains residuals
threshold <- 5 * sd(res_vec) #The threshold
subs$score<- abs(subs[,c(paste(label1, "value", sep="_"))] - subs[,c(paste(label2, "value", sep="_"))])
pdf(file=paste(chr,feature, plotlab1, plotlab2, "scatter.pdf", sep="_"),height=5,width=5,onefile=FALSE)
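The plotting function above fits a robust linear model between the two samples' error values and flags positions whose residual exceeds five standard deviations. A rough Python analogue using statsmodels' RLM on simulated data (the R script uses MASS::rlm and also reports the absolute difference between samples as a score):

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
x = rng.uniform(0, 0.2, 200)                 # simulated per-position error in sample 2
y = x + rng.normal(0, 0.005, 200)            # sample 1 tracks sample 2 ...
y[10] += 0.3                                 # ... except one spiked candidate site

fit = sm.RLM(y, sm.add_constant(x)).fit()    # robust linear fit, analogous to rlm()
residuals = fit.resid
threshold = 5 * residuals.std()              # same 5-sigma residual cutoff
outliers = np.where(np.abs(residuals) > threshold)[0]
print(outliers)                              # expected to contain index 10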
diff --git a/mop_mod/bin/join.r b/mop_mod/bin/join.r
index 2a8643b..85fac59 100755
--- a/mop_mod/bin/join.r
+++ b/mop_mod/bin/join.r
@@ -30,12 +30,12 @@ venn.diagram(
filename = 'venn_diagram.png',
imagetype = "png",
output=TRUE,
- height = 1024,
- width = 1024 ,
+ height = 1024,
+ width = 1024 ,
resolution = 300,
compression = "lzw",
main.pos = c(0.5,0.7),
-
+
# Circles
lwd = 2,
lty = 'blank',
@@ -43,12 +43,12 @@ venn.diagram(
margin = 0.6,
main = prefix,
main.fontfamily = "sans",
-
+
# Numbers
cex = .6,
fontface = "bold",
fontfamily = "sans",
-
+
# Set names
cat.cex = 0.6,
cat.fontface = "bold",
@@ -56,5 +56,3 @@ venn.diagram(
cat.pos = c(-135, 135),
cat.fontfamily = "sans"
)
-
-
diff --git a/mop_mod/bin/mean_per_pos_v1.py b/mop_mod/bin/mean_per_pos_v1.py
index 79b5c1f..243207d 100755
--- a/mop_mod/bin/mean_per_pos_v1.py
+++ b/mop_mod/bin/mean_per_pos_v1.py
@@ -3,20 +3,18 @@
# Import required libraries:
import argparse
-
import pandas as pd
-
def parse_input(file, size_chunks):
-
+
df_chunk = pd.read_csv(file, sep='\t', chunksize=size_chunks, compression='gzip', error_bad_lines=False)
#df_chunk = pd.read_csv(file, sep='\t', chunksize=size_chunks, error_bad_lines=False)
chunk_list = list()
# Process each portion of input file:
- for chunk in df_chunk:
-
- # Perform data filtering:
+ for chunk in df_chunk:
+
+ # Perform data filtering:
chunk_filter = chunk.iloc[:,[0,1,2,3,6]]
#chunk_filter = chunk
chunk_filter.columns = ['contig', 'position','reference_kmer', 'read_name','event_level_mean']
@@ -25,7 +23,7 @@ def parse_input(file, size_chunks):
chunk_filter = chunk_filter.groupby(['contig', 'position','reference_kmer', 'read_name']).agg({'event_level_mean':'mean'})
chunk_filter.columns = ['event_level_mean']
chunk_filter = chunk_filter.reset_index()
-
+
# Once the data filtering is done, append to list
chunk_list.append(chunk_filter)
print('Partition {}: Processed'.format(len(chunk_list)))
@@ -41,7 +39,7 @@ def parse_input(file, size_chunks):
def mean_perpos (sliced_data, output):
-
+
#Calculate mean per positions:
print('Analysing data - position level - mean')
sliced_data['read_name'] = 1
@@ -54,7 +52,7 @@ def mean_perpos (sliced_data, output):
mean_perpos.to_csv('{}_processed_perpos_mean.tsv'.format(output), sep='\t', index = False)
def median_perpos (sliced_data, output):
-
+
#Calculate mean per positions:
print('Analysing data - position level - median')
#sliced_data['read_name'] = 1
@@ -120,8 +118,8 @@ def main():
if a.mean:
mean_perpos(raw_import, a.output)
else:
- median_perpos(raw_import, a.output)
+ median_perpos(raw_import, a.output)
-if __name__=='__main__':
+if __name__=='__main__':
main()
diff --git a/mop_mod/bin/mean_per_pos_v2.py b/mop_mod/bin/mean_per_pos_v2.py
index 902638b..db6a524 100755
--- a/mop_mod/bin/mean_per_pos_v2.py
+++ b/mop_mod/bin/mean_per_pos_v2.py
@@ -3,26 +3,24 @@
# Import required libraries:
import argparse
-
import pandas as pd
-import pyarrow as pa
import pyarrow.parquet as pq
-
+import pyarrow as pa
def parse_input(file, size_chunks):
-
+
df_chunk = pd.read_csv(file, sep='\t', chunksize=size_chunks, compression='gzip', error_bad_lines=False)
chunk_list = list()
# Process each portion of input file:
- for chunk in df_chunk:
-
+ for chunk in df_chunk:
+
chunk_filter = chunk.iloc[:,[0,1,2,3,6]]
chunk_filter.columns = ['contig', 'position','reference_kmer', 'read_name','event_level_mean']
chunk_filter = chunk_filter.groupby(['contig', 'position','reference_kmer', 'read_name']).agg({'event_level_mean':'mean'})
chunk_filter.columns = ['event_level_mean']
chunk_filter = chunk_filter.reset_index()
-
+
# Once the data filtering is done, append to list
chunk_list.append(chunk_filter)
print('Partition {}: Processed'.format(len(chunk_list)))
@@ -36,7 +34,7 @@ def parse_input(file, size_chunks):
def mean_perpos (sliced_data, output):
-
+
#Calculate mean per positions:
print('Analysing data - position level - mean')
sliced_data['read_name'] = 1
@@ -49,7 +47,7 @@ def mean_perpos (sliced_data, output):
pq.write_table(pa.Table.from_pandas(mean_perpos), '{}_processed_perpos_mean.parquete'.format(output))
def median_perpos (sliced_data, output):
-
+
#Calculate mean per positions:
print('Analysing data - position level - median')
#sliced_data['read_name'] = 1
@@ -115,8 +113,8 @@ def main():
if a.mean:
mean_perpos(raw_import, a.output)
else:
- median_perpos(raw_import, a.output)
+ median_perpos(raw_import, a.output)
-if __name__=='__main__':
+if __name__=='__main__':
main()
diff --git a/mop_mod/bin/mean_per_pos_v3.py b/mop_mod/bin/mean_per_pos_v3.py
index cbc4efe..3b3d2b2 100755
--- a/mop_mod/bin/mean_per_pos_v3.py
+++ b/mop_mod/bin/mean_per_pos_v3.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
"""
-speeding up the
+speeding up the
/usr/bin/time ./mean_per_pos_v3.py -i mod_batch_0.fast5_event_align.tsv.gz -o mod_batch_0.fast5_event_align.polars --mean
Analysing data - position level - mean
Saving results to: mod_batch_0.fast5_event_align.polars_processed_perpos_mean.parquet
@@ -17,40 +17,43 @@
import argparse
+
import pandas as pd
-import polars as pl
import pyarrow as pa
import pyarrow.parquet as pq
+
+import polars as pl
+
def parse_input(input_fn):
-
- raw_data = pl.read_csv(input_fn, sep="\t",
- columns = ["contig", "position", "reference_kmer", "read_name", "event_level_mean"],
- dtype={"contig": pl.Utf8,
- "position": pl.Int32,
- "reference_kmer": pl.Utf8,
- "read_name": pl.Utf8,
+
+ raw_data = pl.read_csv(input_fn, sep="\t",
+ columns = ["contig", "position", "reference_kmer", "read_name", "event_level_mean"],
+ dtype={"contig": pl.Utf8,
+ "position": pl.Int32,
+ "reference_kmer": pl.Utf8,
+ "read_name": pl.Utf8,
"event_level_mean": pl.Float32 })
-
+
return raw_data
def mean_perpos (raw_data_polars, output_prefix):
-
+
output_fn = f"{output_prefix}_processed_perpos_mean.parquet"
-
+
#Calculate mean per positions:
print('Analysing data - position level - mean')
q = (raw_data_polars.lazy()
.groupby( ["contig", "position", "reference_kmer"])
.agg([
- (pl.col("event_level_mean").mean().alias("mean")),
+ (pl.col("event_level_mean").mean().alias("mean")),
(pl.col("read_name").n_unique().alias("coverage") )
])
.sort(["contig", "position","reference_kmer" ])
)
-
+
result_df = q.collect()
result_df["read_name"] = pd.Series([1 for x in range(result_df.height)])
result_df = result_df["contig", "position", "reference_kmer","read_name","mean", "coverage"]
@@ -59,9 +62,9 @@ def mean_perpos (raw_data_polars, output_prefix):
#Output parquet file:
print(f"Saving results to: {output_fn}")
-
+
def median_perpos (raw_data_polars, output_prefix):
-
+
output_fn = f"{output_prefix}_processed_perpos_median.parquet"
#Calculate mean per positions:
@@ -70,20 +73,20 @@ def median_perpos (raw_data_polars, output_prefix):
q = (raw_data_polars.lazy()
.groupby( ["contig", "position", "reference_kmer"])
.agg([
- (pl.col("event_level_mean").median().alias("median")),
+ (pl.col("event_level_mean").median().alias("median")),
(pl.col("read_name").n_unique().alias("coverage") )])
.sort(["contig", "position","reference_kmer" ])
)
-
+
result_df = q.collect()
result_df["read_name"] = pd.Series([1 for x in range(result_df.height)])
result_df = result_df["contig", "position", "reference_kmer","read_name","median", "coverage"]
result_df.to_parquet(output_fn)
#q.collect().to_parquet(output_fn, compression='zstd')
-
+
#Output .csv files:
print(f"Saving median_perpos results to: {output_fn}")
-
+
def mean_perpos_perread (raw_data, output):
@@ -115,7 +118,7 @@ def main():
parser.add_argument('-i', '--input', help='Input file to process.')
parser.add_argument('-o', '--output', help='Output filename')
-
+
#parser.add_argument("-s", "--chunk_size", default=100000, type=int, help='Size for input subsetting [%(default)s]')
parser.add_argument("--read_level", action='store_true', help='Analysis at per read level')
parser.add_argument("--mean", action='store_true', help='Analysis using the mean instead of the median.')
@@ -138,8 +141,8 @@ def main():
if a.mean:
mean_perpos(raw_import, a.output)
else:
- median_perpos(raw_import, a.output)
+ median_perpos(raw_import, a.output)
-if __name__=='__main__':
+if __name__=='__main__':
main()
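mean_per_pos_v3.py replaces the chunked pandas pipeline with a single polars lazy query that groups eventalign rows by site and reports the mean signal plus per-read coverage. A minimal sketch of that query on made-up rows, written against the same pre-1.0 polars API the script uses (newer releases rename groupby to group_by):

import polars as pl

raw = pl.DataFrame({                          # made-up eventalign rows
    "contig": ["chrI", "chrI", "chrI"],
    "position": [100, 100, 100],
    "reference_kmer": ["ACGTA", "ACGTA", "ACGTA"],
    "read_name": ["r1", "r1", "r2"],
    "event_level_mean": [83.2, 85.0, 79.9],
})

q = (raw.lazy()
        .groupby(["contig", "position", "reference_kmer"])
        .agg([
            pl.col("event_level_mean").mean().alias("mean"),
            pl.col("read_name").n_unique().alias("coverage"),
        ])
        .sort(["contig", "position", "reference_kmer"]))

print(q.collect())                            # one row per site: mean 82.7, coverage 2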
diff --git a/mop_mod/bin/tombo_filter.py b/mop_mod/bin/tombo_filter.py
index d6a3d60..f0f08d1 100755
--- a/mop_mod/bin/tombo_filter.py
+++ b/mop_mod/bin/tombo_filter.py
@@ -1,11 +1,9 @@
#!/usr/bin/env python
-import argparse
-import re
+import argparse, re
+import numpy as np
from collections import Counter as cnt
from collections import defaultdict
-import numpy as np
-
parser = argparse.ArgumentParser()
parser.add_argument ('-t','--tomboouputs', required=True, dest='tbs', action='append', help='knockout sample tombo prediction results')
parser.add_argument ('-p','--percentage', nargs = '?', const=0.5, default=0.5,type = float, help='threshold to filter for positives; (0,1]; default is 0.5')
diff --git a/mop_mod/config.yaml b/mop_mod/config.yaml
index faa50a8..2a39fc4 100644
--- a/mop_mod/config.yaml
+++ b/mop_mod/config.yaml
@@ -9,7 +9,7 @@ custom_logo: 'logo_small.png'
custom_logo_url: 'https://github.com/biocorecrg/nanopore_analysis'
custom_logo_title: 'Master of Pores'
-extra_fn_clean_trim:
+extra_fn_clean_trim:
- '_QC'
table_columns_visible:
@@ -23,4 +23,4 @@ module_order:
- alnQC:
name: "alnQC"
- info: 'This section of the report shows alnQC results on aligned reads'
+ info: 'This section of the report shows alnQC results on aligned reads'
diff --git a/mop_mod/mop_mod.nf b/mop_mod/mop_mod.nf
index e63275e..316ac95 100755
--- a/mop_mod/mop_mod.nf
+++ b/mop_mod/mop_mod.nf
@@ -2,13 +2,13 @@
nextflow.enable.dsl=2
-/*
+/*
* Define the pipeline parameters
*
*/
// Pipeline version
-version = '2.0'
+version = '3.0'
params.help = false
params.resume = false
@@ -18,9 +18,9 @@ log.info """
╔╦╗╔═╗╔═╗ ╔╦╗┌─┐┌┬┐
║║║║ ║╠═╝ ║║║│ │ ││
╩ ╩╚═╝╩ ╩ ╩└─┘─┴┘
-
+
====================================================
-BIOCORE@CRG Master of Pores 2. Detection of RNA modification - N F ~ version ${version}
+BIOCORE@CRG Master of Pores 3. Detection of RNA modification - N F ~ version ${version}
====================================================
***************** Input files *******************
@@ -30,11 +30,11 @@ comparison : ${params.comparison}
********** reference has to be the genome *************
reference : ${params.reference}
output : ${params.output}
-
-pars_tools : ${params.pars_tools}
+pars_tools : ${params.pars_tools}
************************* Flows *******************************
-epinano : ${params.epinano}
+epinano : ${params.epinano}
+modphred : ${params.modphred}
nanocompore : ${params.nanocompore}
tombo_lsc : ${params.tombo_lsc}
tombo_msc : ${params.tombo_msc}
@@ -67,22 +67,26 @@ flows["nanocompore"] = params.nanocompore
flows["tombo_lsc"] = params.tombo_msc
flows["tombo_msc"] = params.tombo_lsc
-include { getParameters; mapIDPairs } from "${local_modules}"
+include { getParameters; mapIDPairs } from "${local_modules}"
// Create a channel for tool options
progPars = getParameters(params.pars_tools)
-include { calcVarFrequencies as EPINANO_CALC_VAR_FREQUENCIES } from "${subworkflowsDir}/chem_modification/epinano_1.2.nf" addParams(LABEL: 'big_mem_cpus', EXTRAPARS: progPars["epinano--epinano"])
+include { calcVarFrequencies as EPINANO_CALC_VAR_FREQUENCIES } from "${subworkflowsDir}/chem_modification/epinano_1.2.4.nf" addParams(LABEL: 'big_mem_cpus', EXTRAPARS: progPars["epinano--epinano"])
include { joinEpinanoRes } from "${local_modules}" addParams(OUTPUT: outputEpinanoFlow)
+
+include { RUNBYCHROM as MODPHRED_CHR } from "${subworkflowsDir}/chem_modification/modphred.nf" addParams(LABEL: 'big_mem_cpus', EXTRAPARS: progPars["modphred--modphred"], OUTPUT: outputModPhredFlow)
+
+
include { EVENTALIGN as NANOPOLISH_EVENTALIGN } from "${subworkflowsDir}/chem_modification/nanopolish" addParams(LABEL: 'big_mem_cpus', LABELBIG: 'big_time_cpus', OUTPUT: outputNanoPolComFlow, EXTRAPARS: progPars["nanocompore--nanopolish"])
include { SAMPLE_COMPARE as NANOCOMPORE_SAMPLE_COMPARE } from "${subworkflowsDir}/chem_modification/nanocompore" addParams(LABEL: 'big_time_cpus', OUTPUT: outputNanoPolComFlow, EXTRAPARS: progPars["nanocompore--nanocompore"])
include { RESQUIGGLE_RNA as TOMBO_RESQUIGGLE_RNA } from "${subworkflowsDir}/chem_modification/tombo.nf" addParams(LABEL: 'big_cpus', EXTRAPARS: progPars["tombo_resquiggling--tombo"])
include { GET_MODIFICATION_MSC as TOMBO_GET_MODIFICATION_MSC } from "${subworkflowsDir}/chem_modification/tombo.nf" addParams(LABEL: 'big_mem_cpus', EXTRAPARS: progPars["tombo_msc--tombo"], OUTPUT: outputTomboFlow)
include { GET_MODIFICATION_LSC as TOMBO_GET_MODIFICATION_LSC } from "${subworkflowsDir}/chem_modification/tombo.nf" addParams(LABEL: 'big_mem_cpus', EXTRAPARS: progPars["tombo_lsc--tombo"], OUTPUT: outputTomboFlow)
-include { GET_VERSION as EPINANO_VER } from "${subworkflowsDir}/chem_modification/epinano_1.2.nf"
-include { GET_VERSION as NANOPOLISH_VER } from "${subworkflowsDir}/chem_modification/nanopolish"
-include { GET_VERSION as NANOCOMPORE_VER } from "${subworkflowsDir}/chem_modification/nanocompore"
+include { GET_VERSION as EPINANO_VER } from "${subworkflowsDir}/chem_modification/epinano_1.2.4.nf"
+include { GET_VERSION as NANOPOLISH_VER } from "${subworkflowsDir}/chem_modification/nanopolish"
+include { GET_VERSION as NANOCOMPORE_VER } from "${subworkflowsDir}/chem_modification/nanocompore"
include { GET_VERSION as TOMBO_VER } from "${subworkflowsDir}/chem_modification/tombo.nf"
include { wigToBigWig; getChromInfo; splitReference; splitBams; indexReference; callVariants; checkRef; bedGraphToWig as bedGraphToWig_msc; bedGraphToWig as bedGraphToWig_lsc } from "${local_modules}"
@@ -92,202 +96,248 @@ include { makeEpinanoPlots as makeEpinanoPlots_mis; makeEpinanoPlots as makeEpin
include { multiToSingleFast5 } addParams(LABEL: 'big_cpus') from "${local_modules}"
include { mean_per_pos } addParams(LABEL: 'big_mem_cpus') from "${local_modules}"
-include { concat_mean_per_pos } addParams(LABEL: 'big_mem_cpus') from "${local_modules}"
-include { concat_csv_files } addParams(OUTPUT: outputNanoPolComFlow, LABEL: 'big_mem_cpus') from "${local_modules}"
+include { concat_mean_per_pos } addParams(LABEL: 'big_mem_cpus') from "${local_modules}"
+include { concat_csv_files } addParams(OUTPUT: outputNanoPolComFlow, LABEL: 'big_mem_cpus') from "${local_modules}"
/*
* Creates the channels with comparisons
*/
-compfile = file(params.comparison)
-if( !compfile.exists() ) exit 1, "Missing comparison file: ${compfile}. Specify path with --comparisons"
+comparisons = channel.empty()
+
+if (params.comparison != "") {
+ compfile = file(params.comparison)
+ if( !compfile.exists() ) exit 1, "Missing comparison file: ${compfile}. Specify path with --comparisons"
- Channel
+ Channel
.from(compfile.readLines())
.map { line ->
list = line.split("\t")
+ if (list.length <2) {
+ error "ERROR!!! Comparison file has to be tab separated\n"
+ }
if (list[0]!= "") {
def sampleID = list[0]
def ctrlID = list[1]
[ sampleID, ctrlID ]
}
}.set {comparisons}
+}
+
+
+workflow {
+ // Get Sample Names from comparisons
+ comparisons.flatten().unique().set{unique_samples}
+ // get BAM FILEs
+ unique_samples.map {
+ [it, file("${params.input_path}/alignment/${it}_s.bam")]
+ }.transpose().set{bams}
+ // get BAI FILEs
+ unique_samples.map {
+ [it, file("${params.input_path}/alignment/${it}_s.bam.bai")]
+ }.transpose().set{bais}
+ // get FASTQ FILEs
+ unique_samples.map {
+ [it, file("${params.input_path}/fastq_files/${it}.fq.gz")]
+ }.transpose().set{fastqs}
+
+ unique_samples.map {
+ [it, file("${params.input_path}/QC_files/${it}_final_summary.stats")]
+ }.transpose().set{summaries}
+
+ unique_samples.map {
+ [it, file("${params.input_path}/fast5_files/${it}/", type: 'dir')]
+ }.transpose().set{fast5_folders}
+
+ unique_samples.map {
+ [it, file("${params.input_path}/fast5_files/${it}/*.fast5")]
+ }.transpose().set{fast5_files}
+
+ all_fast5 = channel.fromPath("${params.input_path}/fast5_files/*/*.fast5").map{
+ [it.getParent().getName(), it]
+ }
+
+ ref_file = checkRef(reference)
+
+ // Check chr sizes if modphred or tombo
+ if (params.modphred == "YES" || params.tombo_lsc == "YES" || params.tombo_msc == "YES") {
+ outchr = getChromInfo(ref_file)
+ chromSizes = outchr.sizes
+ chroms = outchr.chromosomes.splitText() { it.trim() }
+ }
+
+ if (params.modphred == "YES") {
+ //chroms.subscribe{ println "Got: ***${it}***" }
+ modphred_flow(all_fast5, ref_file, chroms)
+ }
+
+ if (params.epinano == "YES") {
+ epinano_flow(bams, ref_file, comparisons)
+ }
+ if (params.nanocompore == "YES") {
+ compore_polish_flow(comparisons, fast5_folders, bams, bais, fastqs, summaries, ref_file)
+ }
+
+ if (params.tombo_lsc == "YES" || params.tombo_msc == "YES") {
+ tombo_data = tombo_common_flow(fast5_files, ref_file, comparisons)
+ //chromSizes = getChromInfo(ref_file).size
+
+ wiggle_msc = Channel.empty()
+ stat_msc = Channel.empty()
+ stat_lsc = Channel.empty()
+ wiggle_lsc = Channel.empty()
+
+ if (params.tombo_msc == "YES") {
+ tombo_msc_flow(tombo_data, ref_file)
+
+ wiggle_msc = bedGraphToWig_msc(chromSizes, tombo_msc_flow.out.bed_graphs.transpose()).map{
+ ["${it[0]}_msc", it[1] ]
+ }
+ stat_msc = tombo_msc_flow.out.dampened_wiggles.transpose().map{
+ ["${it[0]}_msc", it[1] ]
+ }
+ }
+ if (params.tombo_lsc == "YES") {
+ tombo_lsc_flow(tombo_data, ref_file)
+ wiggle_lsc = bedGraphToWig_lsc(chromSizes, tombo_lsc_flow.out.bed_graphs.transpose()).map{
+ ["${it[0]}_lsc", it[1] ]
+ }
+ stat_lsc = tombo_lsc_flow.out.dampened_wiggles.transpose().map{
+ ["${it[0]}_lsc", it[1] ]
+ }
+ }
+ wiggle_msc.mix(wiggle_lsc).branch {
+ sampleplus: it[1] =~ /\.sample\.plus\./
+ sampleminus: it[1] =~ /\.sample\.minus\./
+ controlplus: it[1] =~ /\.control\.plus\./
+ controlminus: it[1] =~ /\.control\.minus\./
+ }.set{combo_tombo}
-workflow {
- comparisons.flatten().unique().set{unique_samples}
-
- unique_samples.map {
- [it, file("${params.input_path}/alignment/${it}_s.bam")]
- }.transpose().set{bams}
- unique_samples.map {
- [it, file("${params.input_path}/alignment/${it}_s.bam.bai")]
- }.transpose().set{bais}
- unique_samples.map {
- [it, file("${params.input_path}/fastq_files/${it}.fq.gz")]
- }.transpose().set{fastqs}
- unique_samples.map {
- [it, file("${params.input_path}/QC_files/${it}_final_summary.stats")]
- }.transpose().set{summaries}
- unique_samples.map {
- [it, file("${params.input_path}/fast5_files/${it}/", type: 'dir')]
- }.transpose().set{fast5_folders}
-
- unique_samples.map {
- [it, file("${params.input_path}/fast5_files/${it}/*.fast5")]
- }.transpose().set{fast5_files}
-
- ref_file = checkRef(reference)
-
- if (params.epinano == "YES") {
- epinano_flow(bams, ref_file, comparisons)
- }
- if (params.nanocompore == "YES") {
- compore_polish_flow(comparisons, fast5_folders, bams, bais, fastqs, summaries, ref_file)
- }
-
- if (params.tombo_lsc == "YES" || params.tombo_msc == "YES") {
- tombo_data = tombo_common_flow(fast5_files, ref_file, comparisons)
- chromSizes = getChromInfo(ref_file)
-
- if (params.tombo_msc == "YES") {
- tombo_msc_flow(tombo_data, ref_file)
-
- wiggle_msc = bedGraphToWig_msc(chromSizes, tombo_msc_flow.out.bed_graphs.transpose()).map{
- ["${it[0]}_msc", it[1] ]
- }
- stat_msc = tombo_msc_flow.out.dampened_wiggles.transpose().map{
- ["${it[0]}_msc", it[1] ]
- }
- }
- if (params.tombo_lsc == "YES") {
- tombo_lsc_flow(tombo_data, ref_file)
- wiggle_lsc = bedGraphToWig_lsc(chromSizes, tombo_lsc_flow.out.bed_graphs.transpose()).map{
- ["${it[0]}_lsc", it[1] ]
- }
- stat_lsc = tombo_lsc_flow.out.dampened_wiggles.transpose().map{
- ["${it[0]}_lsc", it[1] ]
- }
- }
-
- wiggle_msc.mix(wiggle_lsc).branch {
- sampleplus: it[1] =~ /\.sample\.plus\./
- sampleminus: it[1] =~ /\.sample\.minus\./
- controlplus: it[1] =~ /\.control\.plus\./
- controlminus: it[1] =~ /\.control\.minus\./
- }.set{combo_tombo}
- stat_bw = wigToBigWig(chromSizes, stat_lsc.mix(stat_msc))
-
- stat_bw.branch {
- plus: it[1] =~ /\.plus\./
- minus: it[1] =~ /\.minus\./
- }.set{combo_stats}
-
- //combo_stats.plus.view()
-
- mergeTomboWigsPlus("plus", combo_tombo.sampleplus.join(combo_tombo.controlplus).join(combo_stats.plus))
- mergeTomboWigsMinus("minus", combo_tombo.sampleminus.join(combo_tombo.controlminus).join(combo_stats.minus))
- }
-
- all_ver = EPINANO_VER().mix(NANOPOLISH_VER())
- .mix(NANOCOMPORE_VER()).mix(TOMBO_VER())
- .collectFile(name: 'tool_version.txt', newLine: false, storeDir:params.output)
+ stat_bw = wigToBigWig(chromSizes, stat_lsc.mix(stat_msc))
+
+ stat_bw.branch {
+ plus: it[1] =~ /\.plus\./
+ minus: it[1] =~ /\.minus\./
+ }.set{combo_stats}
+
+ mergeTomboWigsPlus("plus", combo_tombo.sampleplus.join(combo_tombo.controlplus).join(combo_stats.plus))
+ mergeTomboWigsMinus("minus", combo_tombo.sampleminus.join(combo_tombo.controlminus).join(combo_stats.minus))
+ }
+
+ //all_ver = EPINANO_VER().mix(NANOPOLISH_VER())
+ //.mix(NANOCOMPORE_VER()).mix(TOMBO_VER())
+ //.collectFile(name: 'tool_version.txt', newLine: false, storeDir:params.output)
+
+}
+
+workflow modphred_flow {
+
+ take:
+ fast5_files
+ ref_file
+ chroms
+
+ main:
+ fast5_per_sample = fast5_files.groupTuple()
+ MODPHRED_CHR(fast5_per_sample, ref_file, chroms)
}
+
workflow tombo_common_flow {
take:
- fast5_files
- ref_file
- comparisons
-
- main:
- fast5_files.map{
- ["${it[0]}___${it[1].simpleName}", it[1]]
- }.set{fast5_reshaped}
-
- single_fast5_folders = multiToSingleFast5(fast5_reshaped)
- resquiggle = TOMBO_RESQUIGGLE_RNA(single_fast5_folders, ref_file)
-
- resquiggle.join(single_fast5_folders).map{
- def ids = it[0].split("___")
- ["${ids[0]}", it[1], it[2]]
- }.groupTuple().map{
- [it[0], [it[1], it[2]]]
- }.set{reshape_resquiggle}
-
- data_for_tombo = mapIDPairs(comparisons, reshape_resquiggle).map{
- [it[0], it[1], it[2][0], it[2][1], it[3][0], it[3][1]]
- }
-
- emit:
- data_for_tombo
+ fast5_files
+ ref_file
+ comparisons
+
+ main:
+ fast5_files.map{
+ ["${it[0]}___${it[1].simpleName}", it[1]]
+ }.set{fast5_reshaped}
+
+ single_fast5_folders = multiToSingleFast5(fast5_reshaped)
+ resquiggle = TOMBO_RESQUIGGLE_RNA(single_fast5_folders, ref_file)
+
+ resquiggle.join(single_fast5_folders).map{
+ def ids = it[0].split("___")
+ ["${ids[0]}", it[1], it[2]]
+ }.groupTuple().map{
+ [it[0], [it[1], it[2]]]
+ }.set{reshape_resquiggle}
+
+ data_for_tombo = mapIDPairs(comparisons, reshape_resquiggle).map{
+ [it[0], it[1], it[2][0], it[2][1], it[3][0], it[3][1]]
+ }
+
+ emit:
+ data_for_tombo
}
workflow tombo_msc_flow {
take:
- data_for_tombo
- reference
-
- main:
- TOMBO_GET_MODIFICATION_MSC(data_for_tombo, reference)
- bed_graphs = TOMBO_GET_MODIFICATION_MSC.out.bedgraphs
- dampened_wiggles = TOMBO_GET_MODIFICATION_MSC.out.dampened_wiggles
-
- emit:
- bed_graphs
- dampened_wiggles
-
+ data_for_tombo
+ reference
+
+ main:
+ TOMBO_GET_MODIFICATION_MSC(data_for_tombo, reference)
+ bed_graphs = TOMBO_GET_MODIFICATION_MSC.out.bedgraphs
+ dampened_wiggles = TOMBO_GET_MODIFICATION_MSC.out.dampened_wiggles
+
+ emit:
+ bed_graphs
+ dampened_wiggles
+
}
workflow tombo_lsc_flow {
take:
- data_for_tombo
- reference
-
- main:
- TOMBO_GET_MODIFICATION_LSC(data_for_tombo, reference)
- bed_graphs = TOMBO_GET_MODIFICATION_LSC.out.bedgraphs
- dampened_wiggles = TOMBO_GET_MODIFICATION_LSC.out.dampened_wiggles
-
- emit:
- bed_graphs
- dampened_wiggles
+ data_for_tombo
+ reference
+
+ main:
+ TOMBO_GET_MODIFICATION_LSC(data_for_tombo, reference)
+ bed_graphs = TOMBO_GET_MODIFICATION_LSC.out.bedgraphs
+ dampened_wiggles = TOMBO_GET_MODIFICATION_LSC.out.dampened_wiggles
+
+ emit:
+ bed_graphs
+ dampened_wiggles
}
workflow compore_polish_flow {
take:
- comparisons
- fast5_folders
- bams
- bais
- fastqs
- summaries
- ref_file
-
- main:
- chromSizes = getChromInfo(ref_file)
- chromSizes.splitText(file: true, by: 500).set{chromFiles}
- outnp = NANOPOLISH_EVENTALIGN(fast5_folders, bams, bais, fastqs, summaries, ref_file)
- mean_pps = mean_per_pos(outnp.aligned_events)
- concat_chunks = concat_mean_per_pos(mean_pps.groupTuple().combine(chromFiles))
- concat_csv_files(concat_chunks.groupTuple())
-
- combs_events = mapIDPairs(comparisons, outnp.collapsed_aligned_events)
- NANOCOMPORE_SAMPLE_COMPARE(combs_events, ref_file)
-
+ comparisons
+ fast5_folders
+ bams
+ bais
+ fastqs
+ summaries
+ ref_file
+
+ main:
+ chromSizes = getChromInfo(ref_file)
+ chromSizes.sizes.splitText(file: true, by: 500).set{chromFiles}
+ outnp = NANOPOLISH_EVENTALIGN(fast5_folders, bams, bais, fastqs, summaries, ref_file)
+ mean_pps = mean_per_pos(outnp.aligned_events)
+ concat_chunks = concat_mean_per_pos(mean_pps.groupTuple().combine(chromFiles))
+ concat_csv_files(concat_chunks.groupTuple())
+
+ combs_events = mapIDPairs(comparisons, outnp.collapsed_aligned_events)
+ NANOCOMPORE_SAMPLE_COMPARE(combs_events, ref_file)
+
}
workflow epinano_flow {
take:
- bams
- reference
- comparisons
-
- main:
- splittedRefs = splitReference(reference).flatten()
+ bams
+ reference
+ comparisons
+
+ main:
+ splittedRefs = splitReference(reference).flatten()
splittedRefs.combine(bams).map{
def seqname = it[0].baseName
["${it[1]}___${seqname}", it[2], it[0]]
@@ -295,28 +345,28 @@ workflow epinano_flow {
splittedBams = splitBams(data2SplitBam)
splittedBams.map{
- def ids = it[0].split("___")
- [ids[1], ids[0], it[1], it[2]]
- }.set{reshaped_split_bams}
-
+ def ids = it[0].split("___")
+ [ids[1], ids[0], it[1], it[2]]
+ }.set{reshaped_split_bams}
+
split_indexes = indexReference(splittedRefs)
-
- reshaped_split_bams.combine(split_indexes, by:0).map{
- [it[1], it[2], it[3], it[4], it[5], it[6]]
- }.set{data_for_epinano}
-
+
+ reshaped_split_bams.combine(split_indexes, by:0).map{
+ [it[1], it[2], it[3], it[4], it[5], it[6]]
+ }.set{data_for_epinano}
+
per_site_vars = EPINANO_CALC_VAR_FREQUENCIES(data_for_epinano)
- epi_joined_res = joinEpinanoRes(per_site_vars.groupTuple()).plusepi
-
+ epi_joined_res = joinEpinanoRes(per_site_vars.transpose().groupTuple()).plusepi
+
if (params.epinano_plots == "YES") {
- epi_joined_res.combine(epi_joined_res).map {
- [ it[0], it[2], it[1], it[3] ]
- }.join(comparisons, by:[0,1]).set{per_site_for_plots}
-
- makeEpinanoPlots_ins(rscript, per_site_for_plots, "ins")
- makeEpinanoPlots_mis(rscript, per_site_for_plots, "mis")
- makeEpinanoPlots_del(rscript, per_site_for_plots, "del")
- }
+ epi_joined_res.combine(epi_joined_res).map {
+ [ it[0], it[2], it[1], it[3] ]
+ }.join(comparisons, by:[0,1]).set{per_site_for_plots}
+
+ makeEpinanoPlots_ins(rscript, per_site_for_plots, "ins")
+ makeEpinanoPlots_mis(rscript, per_site_for_plots, "mis")
+ makeEpinanoPlots_del(rscript, per_site_for_plots, "del")
+ }
}
@@ -326,7 +376,7 @@ workflow epinano_flow {
*/
workflow.onComplete {
println "Pipeline BIOCORE@CRG Master of Pore completed!"
- println "Started at $workflow.start"
+ println "Started at $workflow.start"
println "Finished at $workflow.complete"
println "Time elapsed: $workflow.duration"
println "Execution status: ${ workflow.success ? 'OK' : 'failed' }"
@@ -336,7 +386,7 @@ workflow.onComplete {
* Mail notification
*/
-if (params.email == "yourmail@yourdomain" || params.email == "") {
+if (params.email == "yourmail@yourdomain" || params.email == "") {
log.info 'Skipping the email\n'
}
else {
@@ -345,7 +395,7 @@ else {
workflow.onComplete {
def msg = """\
- Pipeline BIOCORE@CRG Master of Pore 2 modification module's execution summary
+ Pipeline BIOCORE@CRG Master of Pores 3 modification module's execution summary
---------------------------
Completed at: ${workflow.complete}
Duration : ${workflow.duration}
diff --git a/mop_mod/nextflow.config b/mop_mod/nextflow.config
index 421a6c0..bcff46c 100644
--- a/mop_mod/nextflow.config
+++ b/mop_mod/nextflow.config
@@ -1,5 +1,2 @@
-includeConfig "$baseDir/params.config"
includeConfig "../nextflow.global.config"
-singularity.cacheDir = "$baseDir/../singularity"
-
-
+//singularity.cacheDir = "$baseDir/../singularity"
diff --git a/mop_mod/output.size b/mop_mod/output.size
deleted file mode 100644
index 90594b5..0000000
--- a/mop_mod/output.size
+++ /dev/null
@@ -1 +0,0 @@
-2.9M output_mod/
diff --git a/mop_mod/params.config b/mop_mod/params.config
deleted file mode 120000
index 30bd91d..0000000
--- a/mop_mod/params.config
+++ /dev/null
@@ -1 +0,0 @@
-params.config.test
\ No newline at end of file
diff --git a/mop_mod/params.config.test b/mop_mod/params.config.test
deleted file mode 100755
index e2e07e7..0000000
--- a/mop_mod/params.config.test
+++ /dev/null
@@ -1,22 +0,0 @@
-params {
- input_path = "$baseDir/../mop_preprocess/output/"
- comparison = "$baseDir/comparison.tsv"
-
- reference = "$baseDir/../anno/yeast_rRNA_ref.fa.gz"
-
- output = "$baseDir/output_mod"
-
- pars_tools = "$baseDir/tools_opt.tsv"
-
- // flows
- epinano = "YES"
- nanocompore = "NO"
- tombo_lsc = "YES"
- tombo_msc = "YES"
-
- // epinano plots
- epinano_plots = "YES"
-
- email = ""
-}
-
diff --git a/mop_mod/params.yaml b/mop_mod/params.yaml
new file mode 100755
index 0000000..8b45a2e
--- /dev/null
+++ b/mop_mod/params.yaml
@@ -0,0 +1,17 @@
+input_path: "${projectDir}/../mop_preprocess/outfolder/"
+comparison: "${projectDir}/comparison.tsv"
+
+reference: "${projectDir}/../anno/yeast_rRNA_ref.fa.gz"
+output: "${projectDir}/output_mod"
+pars_tools: "${projectDir}/tools_opt.tsv"
+
+# flows
+epinano: "YES"
+nanocompore: "NO"
+tombo_lsc: "YES"
+tombo_msc: "YES"
+modphred: "NO"
+
+# epinano plots
+epinano_plots: "YES"
+email: ""
diff --git a/mop_mod/tools_opt.tsv b/mop_mod/tools_opt.tsv
index d199807..01b4da4 100644
--- a/mop_mod/tools_opt.tsv
+++ b/mop_mod/tools_opt.tsv
@@ -1,7 +1,7 @@
#flows tool extrapars
epinano epinano ""
nanocompore nanopolish ""
-nanocompore nanocompore "--sequence_context 2 --downsample_high_coverage 10000"
+nanocompore nanocompore ""
tombo_resquiggling tombo ""
tombo_msc tombo ""
tombo_lsc tombo ""
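The third column appears to be handed to the corresponding tool as extra command-line options (EXTRAPARS). As a sketch (GNU sed assumed), the nanocompore options removed above could be restored by editing the tab-separated row:

```bash
# tools_opt.tsv columns are tab-separated: flow, tool, extrapars
sed -i 's|^nanocompore\tnanocompore\t""|nanocompore\tnanocompore\t"--sequence_context 2 --downsample_high_coverage 10000"|' \
    mop_mod/tools_opt.tsv
```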
diff --git a/mop_preprocess/bin/RNA_to_DNA_fq.py b/mop_preprocess/bin/RNA_to_DNA_fq.py
index 8eb485e..831abcf 100755
--- a/mop_preprocess/bin/RNA_to_DNA_fq.py
+++ b/mop_preprocess/bin/RNA_to_DNA_fq.py
@@ -1,14 +1,13 @@
-#!/usr/bin/env python
+#!/usr/bin/env python
__author__ = 'luca.cozzuto@crg.eu'
# -*- coding utf-8 -*-
-import gzip
-import optparse
-import os
-import re
#MODULES
import sys
-
+import re
+import optparse
+import gzip
+import os
#BODY FUNCTIONS
def options_arg():
@@ -30,7 +29,7 @@ def parsefile(file, ofile):
fwrite = open(ofile, 'a+')
if (file.endswith('.gz')):
infile = gzip.open(file, 'rt')
-
+
for line in infile:
count = count + 1
if (count%4==2):
diff --git a/mop_preprocess/bin/bam2stats.py b/mop_preprocess/bin/bam2stats.py
index 7fb4091..2f471d8 100755
--- a/mop_preprocess/bin/bam2stats.py
+++ b/mop_preprocess/bin/bam2stats.py
@@ -1,23 +1,20 @@
#!/usr/bin/env python3
-# Report stats (mapped reads and identity to reference) from samtools stats
+# Report stats (mapped reads and identity to reference) from samtools stats
# for bam file(s) ignoring secondary, supplementary and qc failed alignments
#
# USAGE: bam2stats.py bam1 bam2 ... bamN
-import os
-import subprocess
-import sys
-
+import os, subprocess, sys
def bam2stats(fn, flag=3840):
"""Get stats from samtools stats"""
args = ["samtools", "stats", "-F%s"%flag, fn]
proc = subprocess.Popen(args, stdout=subprocess.PIPE)
k2v = {}
- for l in proc.stdout:
+ for l in proc.stdout:
l = l.decode("utf-8")
- if l.startswith('SN'):
- ldata = l[:-1].split()#; print(ldata)
+ if l.startswith('SN'):
+ ldata = l[:-1].split()#; print(ldata)
kv = [[]]
for e in ldata[1:]:
kv[-1].append(e)
@@ -37,12 +34,12 @@ def bam2stats(fn, flag=3840):
text.append("{:,}\t{:,}".format(k2v['average length'], k2v['maximum length']))
text.append("{:.2f}%".format(100-100*k2v['mismatches']/k2v['bases mapped (cigar)'], )) #"identity: %.2f%"%(100-k2v['mismatches']/k2v['bases mapped (cigar)'], ))
return "\t".join(text)
-
+
for fn in sys.argv[1:]:
if os.path.isfile(fn):
sys.stdout.write("#File name\tMapped reads\tMap %\tBases\tBases %\tAvg read length\tMax read length\tidentity\n")
sys.stdout.write("%s\t%s\n"%(fn, bam2stats(fn)))
-
+
'''
CHK 4691e107 9942d94c cd9ffd51
# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.
@@ -79,4 +76,3 @@ def bam2stats(fn, flag=3840):
SN pairs on different chromosomes: 0
'''
-
diff --git a/mop_preprocess/bin/fast5_to_fastq.py b/mop_preprocess/bin/fast5_to_fastq.py
index a60f052..a80ce10 100755
--- a/mop_preprocess/bin/fast5_to_fastq.py
+++ b/mop_preprocess/bin/fast5_to_fastq.py
@@ -1,13 +1,9 @@
#!/usr/bin/env python
-import gzip
-import os
-import re
-import sys
-import warnings
-
-import h5py
+import sys, warnings,os
import numpy as np
-import ont_fast5_api
+import re
+import gzip
+import h5py, ont_fast5_api
from ont_fast5_api.fast5_interface import get_fast5_file
usage = '''
diff --git a/mop_preprocess/bin/fast5_type.py b/mop_preprocess/bin/fast5_type.py
index a3dcac6..626c1ed 100755
--- a/mop_preprocess/bin/fast5_type.py
+++ b/mop_preprocess/bin/fast5_type.py
@@ -1,10 +1,6 @@
#!/usr/bin/env python
-import os
-import sys
-import warnings
-
-import h5py
-import ont_fast5_api
+import sys, warnings,os
+import h5py, ont_fast5_api
from ont_fast5_api.fast5_interface import get_fast5_file
__author__ = 'Huanle.Liu@crg.eu'
diff --git a/mop_preprocess/config.yaml b/mop_preprocess/config.yaml
index 02e40f8..9d64d7d 100644
--- a/mop_preprocess/config.yaml
+++ b/mop_preprocess/config.yaml
@@ -1,15 +1,15 @@
-title: "MoP2 - pipeline"
+title: "MoP3 - pipeline"
subtitle: ""
intro_text: False
report_header_info:
- - Please cite: "Cozzuto L, Delgado-Tejedor A, Hermoso Pulido T, Novoa EM, Ponomarenko J. Nanopore Direct RNA Sequencing Data Processing and Analysis Using MasterOfPores. Methods Mol Biol. 2023;2624:185-205. doi: 10.1007/978-1-0716-2962-8_13."
+ - Please cite: "Cozzuto L, Liu H, Pryszcz LP, Pulido TH, Delgado-Tejedor A, Ponomarenko J, Novoa EM. MasterOfPores: A Workflow for the Analysis of Oxford Nanopore Direct RNA Sequencing Datasets. Front Genet. 2020 Mar 17;11:211. doi: 10.3389/fgene.2020.00211"
custom_logo: 'logo_small.png'
-custom_logo_url: 'https://github.com/biocorecrg/MoP2'
-custom_logo_title: 'Master of Pores 2'
+custom_logo_url: 'https://github.com/biocorecrg/master_of_pores'
+custom_logo_title: 'Master of Pores 3'
-extra_fn_clean_trim:
+extra_fn_clean_trim:
- '_QC'
- '.count'
@@ -20,7 +20,7 @@ read_count_desc: 'thousands'
table_columns_visible:
FastQC:
percent_duplicates: True
-
+
top_modules:
- fastqc:
name: 'FastQC'
@@ -30,4 +30,4 @@ top_modules:
info: 'This section of the report shows alnQC results on aligned reads'
- minionqc
- RNA201120181_REP2_bc_1_stats:
- info: 'Nanoplot stats'
+ info: 'Nanoplot stats'
diff --git a/mop_preprocess/deeplexicon_models/pAmps-final-actrun_newdata_nanopore_UResNet20v2_model.030.h5 b/mop_preprocess/deeplexicon_models/pAmps-final-actrun_newdata_nanopore_UResNet20v2_model.030.h5
new file mode 100644
index 0000000..924fdc4
Binary files /dev/null and b/mop_preprocess/deeplexicon_models/pAmps-final-actrun_newdata_nanopore_UResNet20v2_model.030.h5 differ
diff --git a/mop_preprocess/deeplexicon_models/pAmps-rep2-4-train1_newdata_nanopore_UResNet20v2_model.039.h5 b/mop_preprocess/deeplexicon_models/pAmps-rep2-4-train1_newdata_nanopore_UResNet20v2_model.039.h5
new file mode 100644
index 0000000..b87fdf5
Binary files /dev/null and b/mop_preprocess/deeplexicon_models/pAmps-rep2-4-train1_newdata_nanopore_UResNet20v2_model.039.h5 differ
diff --git a/mop_preprocess/deeplexicon/resnet20-final.h5 b/mop_preprocess/deeplexicon_models/resnet20-final.h5
similarity index 100%
rename from mop_preprocess/deeplexicon/resnet20-final.h5
rename to mop_preprocess/deeplexicon_models/resnet20-final.h5
diff --git a/mop_preprocess/dorado_models/README.txt b/mop_preprocess/dorado_models/README.txt
new file mode 100644
index 0000000..5034ffc
--- /dev/null
+++ b/mop_preprocess/dorado_models/README.txt
@@ -0,0 +1,5 @@
+Place the dorado models in this folder.
+Download them using:
+```
+dorado download --model MODELNAME
+```
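For example, the model bundled in this folder could be (re)fetched as follows; this assumes the dorado binary is on the PATH and that dorado download writes the model directory into the current working directory:

```bash
cd mop_preprocess/dorado_models
dorado download --model rna002_70bps_hac@v3
```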
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/0.conv.bias.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/0.conv.bias.tensor
new file mode 100644
index 0000000..ca184ed
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/0.conv.bias.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/0.conv.weight.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/0.conv.weight.tensor
new file mode 100644
index 0000000..0c5740f
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/0.conv.weight.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/1.conv.bias.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/1.conv.bias.tensor
new file mode 100644
index 0000000..9739067
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/1.conv.bias.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/1.conv.weight.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/1.conv.weight.tensor
new file mode 100644
index 0000000..46b05c5
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/1.conv.weight.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/2.conv.bias.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/2.conv.bias.tensor
new file mode 100644
index 0000000..65db185
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/2.conv.bias.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/2.conv.weight.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/2.conv.weight.tensor
new file mode 100644
index 0000000..cd544e4
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/2.conv.weight.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/4.rnn.bias_hh_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/4.rnn.bias_hh_l0.tensor
new file mode 100644
index 0000000..0052cba
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/4.rnn.bias_hh_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/4.rnn.bias_ih_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/4.rnn.bias_ih_l0.tensor
new file mode 100644
index 0000000..198df21
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/4.rnn.bias_ih_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/4.rnn.weight_hh_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/4.rnn.weight_hh_l0.tensor
new file mode 100644
index 0000000..24633f1
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/4.rnn.weight_hh_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/4.rnn.weight_ih_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/4.rnn.weight_ih_l0.tensor
new file mode 100644
index 0000000..a07b50e
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/4.rnn.weight_ih_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/5.rnn.bias_hh_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/5.rnn.bias_hh_l0.tensor
new file mode 100644
index 0000000..412657c
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/5.rnn.bias_hh_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/5.rnn.bias_ih_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/5.rnn.bias_ih_l0.tensor
new file mode 100644
index 0000000..62a7625
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/5.rnn.bias_ih_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/5.rnn.weight_hh_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/5.rnn.weight_hh_l0.tensor
new file mode 100644
index 0000000..e77ba82
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/5.rnn.weight_hh_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/5.rnn.weight_ih_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/5.rnn.weight_ih_l0.tensor
new file mode 100644
index 0000000..2b16e00
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/5.rnn.weight_ih_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/6.rnn.bias_hh_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/6.rnn.bias_hh_l0.tensor
new file mode 100644
index 0000000..80af182
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/6.rnn.bias_hh_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/6.rnn.bias_ih_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/6.rnn.bias_ih_l0.tensor
new file mode 100644
index 0000000..d0eb200
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/6.rnn.bias_ih_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/6.rnn.weight_hh_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/6.rnn.weight_hh_l0.tensor
new file mode 100644
index 0000000..71c7e1d
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/6.rnn.weight_hh_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/6.rnn.weight_ih_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/6.rnn.weight_ih_l0.tensor
new file mode 100644
index 0000000..8d214a0
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/6.rnn.weight_ih_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/7.rnn.bias_hh_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/7.rnn.bias_hh_l0.tensor
new file mode 100644
index 0000000..1e8bbac
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/7.rnn.bias_hh_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/7.rnn.bias_ih_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/7.rnn.bias_ih_l0.tensor
new file mode 100644
index 0000000..61e8277
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/7.rnn.bias_ih_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/7.rnn.weight_hh_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/7.rnn.weight_hh_l0.tensor
new file mode 100644
index 0000000..a314d31
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/7.rnn.weight_hh_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/7.rnn.weight_ih_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/7.rnn.weight_ih_l0.tensor
new file mode 100644
index 0000000..186482f
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/7.rnn.weight_ih_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/8.rnn.bias_hh_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/8.rnn.bias_hh_l0.tensor
new file mode 100644
index 0000000..ed85351
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/8.rnn.bias_hh_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/8.rnn.bias_ih_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/8.rnn.bias_ih_l0.tensor
new file mode 100644
index 0000000..e0cbdc4
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/8.rnn.bias_ih_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/8.rnn.weight_hh_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/8.rnn.weight_hh_l0.tensor
new file mode 100644
index 0000000..29cd1c3
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/8.rnn.weight_hh_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/8.rnn.weight_ih_l0.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/8.rnn.weight_ih_l0.tensor
new file mode 100644
index 0000000..09de06d
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/8.rnn.weight_ih_l0.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/9.linear.bias.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/9.linear.bias.tensor
new file mode 100644
index 0000000..379eaf0
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/9.linear.bias.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/9.linear.weight.tensor b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/9.linear.weight.tensor
new file mode 100644
index 0000000..8bb1798
Binary files /dev/null and b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/9.linear.weight.tensor differ
diff --git a/mop_preprocess/dorado_models/rna002_70bps_hac@v3/config.toml b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/config.toml
new file mode 100644
index 0000000..aa77276
--- /dev/null
+++ b/mop_preprocess/dorado_models/rna002_70bps_hac@v3/config.toml
@@ -0,0 +1,33 @@
+[model]
+package = "bonito.crf"
+
+[labels]
+labels = [ "N", "A", "C", "G", "T",]
+
+[input]
+features = 1
+
+[run_info]
+sample_rate = 3000
+
+[global_norm]
+state_len = 4
+
+[encoder]
+activation = "swish"
+stride = 5
+rnn_type = "lstm"
+features = 384
+scale = 5.0
+winlen = 19
+blank_score = 2.0
+
+[normalisation]
+quantile_a = 0.2
+quantile_b = 0.8
+shift_multiplier = 0.48
+scale_multiplier = 0.59
+
+[qscore]
+scale = 1.8
+bias = -4.6
diff --git a/mop_preprocess/final_summary_FAT12104_2836aa20.txt b/mop_preprocess/final_summary_FAT12104_2836aa20.txt
new file mode 100755
index 0000000..d5d0539
--- /dev/null
+++ b/mop_preprocess/final_summary_FAT12104_2836aa20.txt
@@ -0,0 +1,17 @@
+instrument=MN40194
+position=
+flow_cell_id=FAT12104
+sample_id=cDNA12104
+protocol_group_id=Nano3Pseq-Bacteria
+protocol=sequencing/sequencing_MIN106_DNA:FLO-MIN106:SQK-DCS109
+protocol_run_id=ed70dea9-87ca-472a-8e4d-9f971b5eeccc
+acquisition_run_id=2836aa2020fcc8bc703144ad7edf048695fae26e
+started=2022-07-15T18:10:08.258110+02:00
+acquisition_stopped=2022-07-18T09:45:10.309970+02:00
+processing_stopped=2022-07-18T09:45:11.327982+02:00
+basecalling_enabled=0
+sequencing_summary_file=sequencing_summary_FAT12104_2836aa20.txt
+fast5_files_in_final_dest=34
+fast5_files_in_fallback=0
+fastq_files_in_final_dest=0
+fastq_files_in_fallback=0
diff --git a/mop_preprocess/guppy_models/README.md b/mop_preprocess/guppy_models/README.md
new file mode 100644
index 0000000..43718dc
--- /dev/null
+++ b/mop_preprocess/guppy_models/README.md
@@ -0,0 +1 @@
+Here you can place custom models for Guppy.
diff --git a/mop_preprocess/keep_barcodes.txt b/mop_preprocess/keep_barcodes.txt
new file mode 100644
index 0000000..b5b3b8e
--- /dev/null
+++ b/mop_preprocess/keep_barcodes.txt
@@ -0,0 +1,2 @@
+anna---bc_104
+anna---bc_102
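Each line follows a sample---barcode pattern and is presumably read through params.barcodes in mop_preprocess. A sketch of pointing a preprocessing run at it; the params.yaml file name and the rest of the launch command are assumptions based on standard Nextflow usage:

```bash
cd mop_preprocess
nextflow run mop_preprocess.nf -params-file params.yaml \
    --barcodes keep_barcodes.txt -with-singularity -bg > log.txt
```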
diff --git a/mop_preprocess/mop_preprocess.nf b/mop_preprocess/mop_preprocess.nf
old mode 100755
new mode 100644
index 33e2b20..87c951c
--- a/mop_preprocess/mop_preprocess.nf
+++ b/mop_preprocess/mop_preprocess.nf
@@ -2,13 +2,13 @@
nextflow.enable.dsl=2
-/*
+/*
* Define the pipeline parameters
*
*/
// Pipeline version
-version = '2.0'
+version = '3.0'
params.help = false
params.resume = false
@@ -18,12 +18,12 @@ log.info """
╔╦╗╔═╗╔═╗ ╔═╗┬─┐┌─┐┌─┐┬─┐┌─┐┌─┐┌─┐┌─┐┌─┐
║║║║ ║╠═╝ ╠═╝├┬┘├┤ ├─┘├┬┘│ ││ ├┤ └─┐└─┐
╩ ╩╚═╝╩ ╩ ┴└─└─┘┴ ┴└─└─┘└─┘└─┘└─┘└─┘
-
+
====================================================
-BIOCORE@CRG Master of Pores 2. Preprocessing - N F ~ version ${version}
+BIOCORE@CRG Master of Pores 3. Preprocessing - N F ~ version ${version}
====================================================
-conffile. : ${params.conffile}
+conffile : ${params.conffile}
fast5 : ${params.fast5}
fastq : ${params.fastq}
@@ -31,17 +31,18 @@ fastq : ${params.fastq}
reference : ${params.reference}
annotation : ${params.annotation}
-granularity. : ${params.granularity}
+granularity : ${params.granularity}
ref_type : ${params.ref_type}
pars_tools : ${params.pars_tools}
+barcodes : ${params.barcodes}
output : ${params.output}
GPU : ${params.GPU}
-basecalling : ${params.basecalling}
-demultiplexing : ${params.demultiplexing}
+basecalling : ${params.basecalling}
+demultiplexing : ${params.demultiplexing}
demulti_fast5 : ${params.demulti_fast5}
filtering : ${params.filtering}
@@ -50,11 +51,9 @@ mapping : ${params.mapping}
counting : ${params.counting}
discovery : ${params.discovery}
-cram_conv : ${params.cram_conv}
+cram_conv : ${params.cram_conv}
subsampling_cram : ${params.subsampling_cram}
-
-saveSpace : ${params.saveSpace}
email : ${params.email}
"""
@@ -62,575 +61,494 @@ email : ${params.email}
if (params.help) exit 1
if (params.resume) exit 1, "Are you making the classical --resume typo? Be careful!!!! ;)"
-// check multi5 and GPU usage. GPU maybe can be removed as param if there is a way to detect it
-if (params.GPU != "ON" && params.GPU != "OFF") exit 1, "Please specify ON or OFF in GPU processors are available"
-
// include functions, outdirs from other files
evaluate(new File("../outdirs.nf"))
-def local_modules = file("$baseDir/../local_modules.nf")
-def subworkflowsDir = "${baseDir}/../BioNextflow/subworkflows"
-joinScript = file("$baseDir/bin/join.r")
-
-// check input files
-reference = file(params.reference)
-if( !reference.exists() ) exit 1, "Missing reference file: ${reference}!"
-config_report = file("$baseDir/config.yaml")
-if( !config_report.exists() ) exit 1, "Missing config.yaml file!"
-logo = file("$baseDir/../img/logo_small.png")
-Channel.fromPath( "$baseDir/deeplexicon/*.h5").set{deepmodels}
-
-
+//def local_modulesDir = "${projectDir}/../local"
+def local_modules = file("${projectDir}/../local_modules.nf")
+def subworkflowsDir = "${projectDir}/../BioNextflow/subworkflows"
+def workflowsDir = "${projectDir}/../BioNextflow/workflows"
+joinScript = file("${projectDir}/bin/join.r")
+
+// get and check input files
+if (params.mapping != "NO") {
+ reference = file(params.reference)
+ if( !reference.exists() ) exit 1, "Missing reference file: ${reference}!"
+} else {
+ reference = ""
+}
-Channel
- .from( config_report, logo )
- .collect().set{multiqc_info}
+// INITIALIZE MULTIQC REPORT
+config_report = file("${projectDir}/config.yaml")
+if( !config_report.exists() ) exit 1, "Missing config.yaml file!"
+logo = file("${projectDir}/../img/logo_small.png")
+Channel.from( config_report, logo ).set{multiqc_data}
-def gpu = params.GPU
-def tools = [:]
-tools["basecalling"] = params.basecalling
-tools["demultiplexing"] = params.demultiplexing
-tools["mapping"] = params.mapping
-tools["filtering"] = params.filtering
-tools["counting"] = params.counting
-tools["discovery"] = params.discovery
-//tools["variantcall"] = params.variantcall
-
-// Output files
outputReport = file("${outputMultiQC}/multiqc_report.html")
-/*
-* move old multiQCreport
-*/
if( outputReport.exists() ) {
log.info "Moving old report to multiqc_report.html multiqc_report.html.old"
outputReport.moveTo("${outputMultiQC}/multiqc_report.html.old")
}
-/*
-* This is default value in case guppy will be used for RNA demultiplexing
-*/
-params.barcodekit = ""
+// Get models
-if (params.ref_type == "genome") {
- if (params.annotation != "") {
- annotation = file(params.annotation)
- if( !annotation.exists() ) exit 1, "Missing annotation file: ${params.annotation}!"
- }
-}
-def demulti_fast5_opt = "OFF"
+demux_models = ""
-if (params.demultiplexing == "NO") {
- demulti_fast5_opt = "OFF"
+switch(params.demultiplexing) {
+ case "deeplexicon":
+ demux_models = "${projectDir}/deeplexicon_models/"
+ break;
}
-if (params.demulti_fast5 == "ON" || params.demulti_fast5 == "YES" ) {
- demulti_fast5_opt = "ON"
+dorado_models = "${projectDir}/dorado_models/"
+
+// check GPU usage.
+if (params.GPU != "cuda11" && params.GPU != "cuda10" && params.GPU != "OFF" && params.GPU != "ON") exit 1, "Please specify cuda11, cuda10, ON or OFF if GPU processors are available. ON is legacy for cuda10"
+def gpu = (params.GPU != 'OFF' ? 'ON' : 'OFF')
+def cuda_cont = (params.GPU == 'cuda11' ? 'biocorecrg/mopbasecallc11:0.3' : 'biocorecrg/mopbasecall:0.3')
+
+
+// CHECK INCOMPATIBILITIES AMONG PARAMETERS
+
+if (params.ref_type == "genome") {
+ if (params.annotation != "") {
+ annotation = file(params.annotation)
+ if( !annotation.exists() ) exit 1, "Missing annotation file: ${params.annotation}!"
+ }
}
-def guppy_basecall_label = (params.GPU == 'ON' ? 'basecall_gpus' : 'big_cpus')
-def deeplexi_basecall_label = (params.GPU == 'ON' ? 'demulti_gpus' : '')
-def output_bc = (demulti_fast5_opt == 'ON' ? '' : outputFast5)
-def outputMinionQC = (demulti_fast5_opt == 'ON' ? '': outputQual)
+outmode = "copy"
-if (params.saveSpace == "YES") outmode = "move"
-else outmode = "copy"
+include { final_message; notify_slack } from "${subworkflowsDir}/global_functions.nf"
+include { checkInput; filterPerBarcodes; get_barcode_list; RNA2DNA; parseFinalSummary; checkTools; reshapeSamples; reshapeDemuxSamples; checkRef; getParameters; homogenizeVals } from "${local_modules}"
-include { RNA2DNA; preparing_demultiplexing_fast5_deeplexicon; extracting_demultiplexed_fastq; parseFinalSummary; checkTools; reshapeSamples; reshapeDemuxSamples; checkRef; getParameters } from "${local_modules}"
-include { extracting_demultiplexed_fast5_deeplexicon } from "${local_modules}" addParams(OUTPUTF5: outputFast5, OUTPUTST: outputQual, LABEL: 'big_cpus')
-include { extracting_demultiplexed_fast5_guppy } from "${local_modules}" addParams(OUTPUT: outputFast5, LABEL: 'big_cpus')
+def demulti_fast5_opt = homogenizeVals(params.demulti_fast5)
+def basecall_label = (params.GPU != 'OFF' ? 'basecall_gpus' : 'big_cpus')
+def deeplexi_basecall_label = (params.GPU != 'OFF' ? 'demulti_gpus' : '')
+
+def output_bc = (demulti_fast5_opt == 'ON' ? '' : outputFast5)
+//def outputMinionQC = (demulti_fast5_opt == 'ON' ? '': outputQual)
-def guppypars = parseFinalSummary(params.conffile)
-// Create a channel for tool options
-if (workflow.profile == "awsbatch") guppypars = guppypars + " --data_path /nextflow-bin/ont-guppy/data"
+def guppypars = ""
+// GET PROGRAM PARS AND VERIFY
+def tools = [:]
+tools["basecalling"] = homogenizeVals(params.basecalling)
+tools["demultiplexing"] = homogenizeVals(params.demultiplexing)
+tools["mapping"] = homogenizeVals(params.mapping)
+tools["filtering"] = homogenizeVals(params.filtering)
+tools["counting"] = homogenizeVals(params.counting)
+tools["discovery"] = homogenizeVals(params.discovery)
+
+// Remove basecalling and demultiplexing in case of fastq input
+if(params.fast5 == "" && params.fastq != "") {
+ tools["basecalling"] = "NO"
+ tools["demultiplexing"] = "NO"
+} else {
+ guppypars = parseFinalSummary(params.conffile)
+ // Create a channel for tool options
+ if (workflow.profile == "awsbatch") guppypars = guppypars + " --data_path /nextflow-bin/ont-guppy/data"
+}
progPars = getParameters(params.pars_tools)
+checkTools(tools, progPars)
+
+// Create a channel for excluded ids
+barcodes_to_include = get_barcode_list(params.barcodes)
+
def guppy_basecall_pars = guppypars + " " + progPars["basecalling--guppy"]
-include { GET_WORKFLOWS; BASECALL as GUPPY_BASECALL; BASECALL_DEMULTI as GUPPY_BASECALL_DEMULTI } from "${subworkflowsDir}/basecalling/guppy" addParams(EXTRAPARS_BC: guppy_basecall_pars, EXTRAPARS_DEM: progPars["demultiplexing--guppy"], LABEL: guppy_basecall_label, GPU_OPTION: gpu, MOP: "YES", OUTPUT: output_bc, OUTPUTMODE: outmode)
-include { GET_VERSION as DEMULTIPLEX_VER; DEMULTIPLEX as DEMULTIPLEX_DEEPLEXICON } from "${subworkflowsDir}/demultiplexing/deeplexicon" addParams(EXTRAPARS: progPars["demultiplexing--deeplexicon"], LABEL:deeplexi_basecall_label, GPU_OPTION: gpu)
+def basecaller_pars = ["guppy" : guppy_basecall_pars, "dorado" : progPars["basecalling--dorado"] ]
+def demux_pars = ["guppy" : progPars["demultiplexing--guppy"] + " " + guppy_basecall_pars, "deeplexicon": progPars["demultiplexing--deeplexicon"] ]
+
+
+// INCLUDE WORKFLOWS
+include { BASECALL } from "${workflowsDir}/basecaller" addParams(gpu: gpu, output: output_bc, label: basecall_label, type:params.basecalling , extrapars: basecaller_pars[params.basecalling], models: dorado_models )
+include { DEMULTIPLEX } from "${workflowsDir}/demultiplexer.nf" addParams(gpu: gpu, output: output_bc, label: basecall_label, type:params.demultiplexing , extrapars: demux_pars[params.demultiplexing], models: demux_models )
+include { BASECALL_DEMULTIPLEX } from "${workflowsDir}/basecaller_demultiplexer.nf" addParams(gpu: gpu, output: output_bc, label: basecall_label, type:params.demultiplexing , extrapars: demux_pars[params.demultiplexing] )
+include { DEMULTI_FAST5; DEMULTI_FAST5_FILTER } from "${subworkflowsDir}/misc/demulti_fast5" addParams(OUTPUT: outputFast5, OUTPUTST: outputQual, LABEL: 'big_cpus', TYPE: params.demultiplexing)
+
+
+// INCLUDE MODULES
include { GET_VERSION as NANOFILT_VER; FILTER as NANOFILT_FILTER} from "${subworkflowsDir}/trimming/nanofilt" addParams(EXTRAPARS: progPars["filtering--nanofilt"])
include { GET_VERSION as NANOQ_VER; FILTER as NANOQ_FILTER} from "${subworkflowsDir}/trimming/nanoq" addParams(EXTRAPARS: progPars["filtering--nanoq"])
include { MAP as GRAPHMAP} from "${subworkflowsDir}/alignment/graphmap" addParams(EXTRAPARS: progPars["mapping--graphmap"], LABEL:'big_mem_cpus')
include { MAP as GRAPHMAP2} from "${subworkflowsDir}/alignment/graphmap2" addParams(EXTRAPARS: progPars["mapping--graphmap2"], LABEL:'big_mem_cpus')
include { MAP as MINIMAP2} from "${subworkflowsDir}/alignment/minimap2" addParams(EXTRAPARS: progPars["mapping--minimap2"], LABEL:'big_mem_cpus')
include { ALL as BWA} from "${subworkflowsDir}/alignment/bwa" addParams(EXTRAPARS: progPars["mapping--bwa"], LABEL:'big_mem_cpus')
-include { GET_VERSION as BWA_VER} from "${subworkflowsDir}/alignment/bwa"
-include { GET_VERSION as GRAPHMAP_VER} from "${subworkflowsDir}/alignment/graphmap"
-include { GET_VERSION as GRAPHMAP2_VER} from "${subworkflowsDir}/alignment/graphmap2"
-include { GET_VERSION as MINIMAP2_VER} from "${subworkflowsDir}/alignment/minimap2"
+include { GET_VERSION as BWA_VER} from "${subworkflowsDir}/alignment/bwa"
+include { GET_VERSION as GRAPHMAP_VER} from "${subworkflowsDir}/alignment/graphmap"
+include { GET_VERSION as GRAPHMAP2_VER} from "${subworkflowsDir}/alignment/graphmap2"
+include { GET_VERSION as MINIMAP2_VER} from "${subworkflowsDir}/alignment/minimap2"
include { FASTQCP as FASTQC} from "${subworkflowsDir}/qc/fastqc" addParams(LABEL: 'big_cpus')
include { GET_VERSION as FASTQC_VER} from "${subworkflowsDir}/qc/fastqc"
include { SORT as SAMTOOLS_SORT } from "${subworkflowsDir}/misc/samtools" addParams(LABEL: 'big_cpus', OUTPUT:outputMapping)
include { INDEX as SAMTOOLS_INDEX } from "${subworkflowsDir}/misc/samtools" addParams(OUTPUT:outputMapping)
include { GET_VERSION as SAMTOOLS_VERSION; CAT as SAMTOOLS_CAT } from "${subworkflowsDir}/misc/samtools"
include { MOP_QC as NANOPLOT_QC } from "${subworkflowsDir}/qc/nanoplot" addParams(LABEL: 'big_cpus_ignore')
-include { GET_VERSION as NANOPLOT_VER } from "${subworkflowsDir}/qc/nanoplot"
+include { GET_VERSION as NANOPLOT_VER } from "${subworkflowsDir}/qc/nanoplot"
include { GET_VERSION as NANOCOUNT_VER } from "${subworkflowsDir}/read_count/nanocount"
include { COUNT as NANOCOUNT } from "${subworkflowsDir}/read_count/nanocount" addParams(LABEL: 'big_mem', EXTRAPARS: progPars["counting--nanocount"], OUTPUT:outputCounts)
include { COUNT_AND_ANNO as HTSEQ_COUNT } from "${subworkflowsDir}/read_count/htseq" addParams(CONTAINER:"biocorecrg/htseq:30e9e9c", EXTRAPARS: progPars["counting--htseq"], OUTPUT:outputCounts, LABEL:'big_cpus')
include { GET_VERSION as HTSEQ_VER } from "${subworkflowsDir}/read_count/htseq" addParams(CONTAINER:"biocorecrg/htseq:30e9e9c")
-include { GET_VERSION as BAMBU_VER } from "${subworkflowsDir}/assembly/bambu"
+include { GET_VERSION as BAMBU_VER } from "${subworkflowsDir}/assembly/bambu"
include { ASSEMBLE as BAMBU_ASSEMBLE } from "${subworkflowsDir}/assembly/bambu" addParams(EXTRAPARS: progPars["discovery--bambu"], OUTPUT:outputAssembly, LABEL:'big_mem_cpus')
-include { GET_VERSION as ISOQUANT_VER } from "${subworkflowsDir}/assembly/isoquant"
-include { ASSEMBLE as ISOQUANT_ASSEMBLE } from "${subworkflowsDir}/assembly/isoquant" addParams(EXTRAPARS: progPars["discovery--isoquant"], OUTPUT:outputAssembly, LABEL:'big_mem_cpus')
+include { GET_VERSION as ISOQUANT_VER } from "${subworkflowsDir}/assembly/isoquant"
+include { ASSEMBLE as ISOQUANT_ASSEMBLE } from "${subworkflowsDir}/assembly/isoquant" addParams(EXTRAPARS: progPars["discovery--isoquant"], OUTPUT:outputAssembly, LABEL:'big_mem_cpus', CONTAINER:'quay.io/biocontainers/isoquant:3.2.0--hdfd78af_0')
include { REPORT as MULTIQC; GET_VERSION as MULTIQC_VER } from "${subworkflowsDir}/reporting/multiqc" addParams(EXTRAPARS: "-c ${config_report.getName()}", OUTPUT:outputMultiQC)
include { concatenateFastQFiles} from "${local_modules}" addParams(OUTPUT:outputFastq)
-include { MinIONQC} from "${local_modules}" addParams(OUTPUT:outputMinionQC, LABEL: 'big_mem_cpus')
-include { bam2stats; countStats; joinCountStats; joinAlnStats} from "${local_modules}"
+include { MinIONQC} from "${local_modules}" addParams(OUTPUT:outputQual, LABEL: 'big_mem_cpus')
+include { bam2stats; countStats; joinCountStats; joinAlnStats} from "${local_modules}"
include { cleanFile as fastqCleanFile; cleanFile as bamCleanFile; cleanFile as fast5CleanFile} from "${local_modules}"
include { AssignReads} from "${local_modules}" addParams(OUTPUT:outputAssigned)
include { bam2Cram } from "${local_modules}" addParams(OUTPUT:outputCRAM, LABEL: 'big_cpus_ignore')
+include { getFast5 } from "${local_modules}"
+
/*
-* Simple flow of basecalling
+* Wrapper for FILTERING
*/
-workflow flow1 {
- take:
- fast5_4_analysis
+workflow SEQFILTER {
+ take:
+ raw_bc_fastq
+
main:
- outbc = GUPPY_BASECALL (fast5_4_analysis)
- basecalled_fastq = outbc.basecalled_fastq
-
- // Optional fastq filtering
- if (params.filtering == "nanofilt") {
- basecalled_fastq = NANOFILT_FILTER(outbc.basecalled_fastq)
- //basecalled_fastq = reshapeSamples(nanofilt.out)
- } else if (params.filtering == "nanoq") {
- basecalled_fastq = NANOQ_FILTER(outbc.basecalled_fastq)
- //basecalled_fastq = reshapeSamples(nanofilt.out)
- }
-
- //bc_fastq = reshapeSamples(basecalled_fastq)
- bc_fast5 = reshapeSamples(outbc.basecalled_fast5)
- bc_stats = reshapeSamples(outbc.basecalling_stats)
-
- emit:
- basecalled_fast5 = bc_fast5
- basecalled_fastq = basecalled_fastq
- basecalled_stats = bc_stats
+ // Optional fastq filtering
+ switch(params.filtering) {
+ case "nanofilt":
+ bc_fastq = NANOFILT_FILTER(raw_bc_fastq)
+ break;
+ case "nanoq":
+ bc_fastq = NANOQ_FILTER(raw_bc_fastq)
+ break;
+ default:
+ bc_fastq = raw_bc_fastq
+ break;
+ }
+
+ emit:
+ out = bc_fastq
}
/*
-* Basecalling and Demultiplexing
+* Wrapper for MAPPING
*/
+workflow MAPPING {
+
+ take:
+ bc_fastq
-workflow flow2 {
- take:
- fast5_4_analysis
main:
- // IF DEMULTIPLEXING IS DEEPLEXICON
- if(params.demultiplexing == "deeplexicon") {
- outbc = GUPPY_BASECALL(fast5_4_analysis)
-
- demux = DEMULTIPLEX_DEEPLEXICON(deepmodels, fast5_4_analysis)
- fast5_res = outbc.basecalled_fast5
-
- // Optional demultiplex fast5
- if (demulti_fast5_opt == "ON") {
- basecalledbc = reshapeSamples(outbc.basecalled_fast5)
- alldemux = reshapeSamples(demux)
-
- //data_for_demux = alldemux.groupTuple().join(basecalledbc.transpose().groupTuple())
- prep_demux = preparing_demultiplexing_fast5_deeplexicon(alldemux.groupTuple()).transpose()
- data_for_demux = prep_demux.combine(basecalledbc.transpose().groupTuple(), by: 0)
-
- extracting_demultiplexed_fast5_deeplexicon(data_for_demux)
-
- // OPTIONAL CLEANING FASTQ5 FILES
- if (params.saveSpace == "YES") {
- fast5CleanFile(basecalledbc.transpose().groupTuple(), fast5_res.map{it[1]}.collect(), ".fast5")
- }
- }
- // Demultiplex fastq
- demufq = extracting_demultiplexed_fastq(demux.join(outbc.basecalled_fastq))
-
- } else if (params.demultiplexing == "guppy") {
- // IF DEMULTIPLEXING IS GUPPY
- outbc = GUPPY_BASECALL_DEMULTI (fast5_4_analysis)
- demufq = outbc.basecalled_fastq
- fast5_res = outbc.basecalled_fast5
-
- // Optional demultiplex fast5
- if (demulti_fast5_opt == "ON" ) {
- basecalledbc = reshapeSamples(outbc.basecalled_fast5)
- alldemux = reshapeSamples(outbc.basecalling_stats)
- fast5_res = extracting_demultiplexed_fast5_guppy(alldemux.groupTuple().join(basecalledbc.transpose().groupTuple()))
- // OPTIONAL CLEANING FASTQ5 FILES
- fast5CleanFile(basecalledbc.transpose().groupTuple(), fast5_res.map{it[1]}.collect(), ".fast5")
- }
- }
- reshapedDemufq = demufq.transpose().map{
- [it[1].name.replace(".fastq.gz", ""), it[1] ]
- }
- // Optional fastq filtering
- if (params.filtering == "nanofilt") {
- nanofilt = NANOFILT_FILTER(reshapedDemufq)
- reshapedDemufq = nanofilt
- } else if (params.filtering == "nanoq") {
- nanofilt = NANOQ_FILTER(outbc.basecalled_fastq)
- basecalled_fastq = reshapeSamples(nanofilt.out)
- }
- emit:
- basecalled_fast5 = fast5_res
- //basecalled_fastq = basecalled_fastq_res
- basecalled_fastq = reshapedDemufq
- basecalled_stats = reshapeSamples(outbc.basecalling_stats)
-
+
+ // Perform mapping on fastq files
+ if (params.mapping == "NO") {
+ stats_aln = Channel.empty()
+ stats_counts = Channel.empty()
+ sorted_alns = Channel.empty()
+ nanoplot_qcs = Channel.empty()
+ aln_indexes = Channel.empty()
+ aln_reads = Channel.empty()
+ }
+ else {
+ switch(params.mapping) {
+ case "graphmap":
+ // GRAPHMAP cannot align RNA, so the reads are converted to DNA first
+ dna_bc_fastq = RNA2DNA(bc_fastq)
+ aln_reads = GRAPHMAP(dna_bc_fastq, reference)
+ break
+ case "graphmap2":
+ aln_reads = GRAPHMAP2(bc_fastq, reference)
+ break
+ case "minimap2":
+ aln_reads = MINIMAP2(bc_fastq, reference)
+ break
+ case "bwa":
+ aln_reads = BWA(reference, bc_fastq)
+ break
+ default:
+ break
+
+ }
+ }
+
+ emit:
+ out = aln_reads
}
+/*
+* Wrapper for COUNTING
+*/
+workflow COUNTING {
-workflow preprocess_flow {
take:
- bc_fast5
- bc_fastq
- basecalled_stats
-
- main:
- // Perform MinIONQC on basecalling stats
- basecall_qc = MinIONQC(basecalled_stats.groupTuple())
- multiqc_data = basecall_qc.QC_folder.map{it[1]}.mix(multiqc_info)
-
- // Perform mapping on fastq files
- if (params.mapping == "NO") {
- stats_aln = Channel.value()
- sorted_alns = Channel.value()
- nanoplot_qcs = Channel.value()
- }
- else {
- switch(params.mapping) {
- case "graphmap":
- //GRAPHMAP cannot align RNA
- dna_bc_fastq = RNA2DNA(bc_fastq)
- aln_reads = GRAPHMAP(dna_bc_fastq, reference)
- break
- case "graphmap2":
- aln_reads = GRAPHMAP2(bc_fastq, reference)
- break
- case "minimap2":
- aln_reads = MINIMAP2(bc_fastq, reference)
- break
- case "bwa":
- aln_reads = BWA(reference, bc_fastq)
- break
- default:
- println "ERROR ################################################################"
- println "${params.mapping} is not a supported alignment"
- println "ERROR ################################################################"
- println "Exiting ..."
- System.exit(0)
- break
-
- }
-
- // Concatenate bamfiles
- if (params.demultiplexing == "NO" ) reshaped_aln_reads = reshapeSamples(aln_reads)
- else reshaped_aln_reads = reshapeDemuxSamples(aln_reads)
-
- jaln_reads = SAMTOOLS_CAT(reshaped_aln_reads.groupTuple())
-
- // Perform SORTING and INDEXING on bam files
- sorted_alns = SAMTOOLS_SORT(jaln_reads)
- aln_indexes = SAMTOOLS_INDEX(sorted_alns)
-
- // Converting BAM to CRAM and
- if (params.cram_conv == "YES") {
- good_ref = checkRef(reference)
- bam2Cram(good_ref, params.subsampling_cram, sorted_alns.join(aln_indexes))
- }
- // OPTIONAL CLEANING BAM FILES
- if (params.saveSpace == "YES") {
- bamCleanFile(reshaped_aln_reads.groupTuple(), jaln_reads.map{it[1]}.collect(), ".bam")
- }
- // Perform bam2stats on sorted bams
- aln_stats = bam2stats(sorted_alns)
- stats_aln = joinAlnStats(aln_stats.map{ it[1]}.collect())
-
- // Perform NanoPlot on sorted bams
- nanoplot_qcs = NANOPLOT_QC(sorted_alns)
- multiqc_data = multiqc_data.mix(stats_aln)
- }
+ sorted_alns
+ aln_indexes
- // Concatenate fastq files
- if (params.demultiplexing == "NO" ) reshaped_bc_fastq = reshapeSamples(bc_fastq)
- else reshaped_bc_fastq = reshapeDemuxSamples(bc_fastq)
+ main:
- fastq_files = concatenateFastQFiles(reshaped_bc_fastq.groupTuple())
+ // OPTIONAL Perform COUNTING / ASSIGNMENT
+ if (params.counting == "nanocount" && params.ref_type == "transcriptome") {
+ read_counts = NANOCOUNT(sorted_alns.join(aln_indexes))
+ assignments = AssignReads(sorted_alns, "nanocount")
+ stat_counts = countStats(assignments)
+ stats_counts = joinCountStats(stat_counts.map{ it[1]}.collect())
+ }
+ else if (params.counting == "htseq" && params.ref_type == "genome") {
+ htseq_out = HTSEQ_COUNT(annotation, sorted_alns.join(aln_indexes))
+ read_counts = htseq_out.counts
+ assignments = AssignReads(htseq_out.bam, "htseq")
+ stat_counts = countStats(assignments)
+ stats_counts = joinCountStats(stat_counts.map{ it[1]}.collect())
+ } else if (params.counting == "NO") {
+ stats_counts = Channel.empty()
+ } else {
+ println "ERROR ################################################################"
+ println "${params.counting} is not compatible with ${params.ref_type}"
+ println "htseq requires a genome as reference and an annotation in GTF"
+ println "nanocount requires a transcriptome as a reference"
+ println "ERROR ################################################################"
+ println "Exiting ..."
+ System.exit(0)
+ }
- // Perform fastqc QC on fastq
- fastqc_files = FASTQC(fastq_files)
- multiqc_data = multiqc_data.mix(fastqc_files.map{it[1]})
- // OPTIONAL CLEANING FASTQC FILES
- if (params.saveSpace == "YES") {
- fastqCleanFile(reshaped_bc_fastq.groupTuple(), fastq_files.map{it[1]}.collect().mix(fastqc_files.map{it[1]}.collect(), jaln_reads.map{it[1]}.collect()).collect(), ".gz")
- }
- // OPTIONAL Perform COUNTING / ASSIGNMENT
- if (params.counting == "nanocount" && params.ref_type == "transcriptome") {
- read_counts = NANOCOUNT(sorted_alns.join(aln_indexes))
- assignments = AssignReads(sorted_alns, "nanocount")
- stat_counts = countStats(assignments)
- stats_counts = joinCountStats(stat_counts.map{ it[1]}.collect())
- multiqc_data = multiqc_data.mix(stats_counts)
- }
- else if (params.counting == "htseq" && params.ref_type == "genome") {
- htseq_out = HTSEQ_COUNT(annotation, sorted_alns.join(aln_indexes))
- read_counts = htseq_out.counts
- assignments = AssignReads(htseq_out.bam, "htseq")
- stat_counts = countStats(assignments)
- stats_counts = joinCountStats(stat_counts.map{ it[1]}.collect())
- multiqc_data = multiqc_data.mix(stats_counts)
- } else if (params.counting == "NO") {
- } else {
- println "ERROR ################################################################"
- println "${params.counting} is not compatible with ${params.ref_type}"
- println "htseq requires a genome as reference and an annotation in GTF"
- println "nanocount requires a transcriptome as a reference"
- println "ERROR ################################################################"
- println "Exiting ..."
- System.exit(0)
- }
- if (params.discovery == "bambu" && params.ref_type == "genome"){
- sorted_alns.map{
- [it[1]]
- }.collect().map{
- ["assembly", it]
- }.set{data_to_bambu}
- bambu_out = BAMBU_ASSEMBLE(reference, annotation, data_to_bambu)
- } else if (params.discovery == "isoquant" && params.ref_type == "genome"){
- aln_indexes.map{
- [it[1]]
- }.collect().map{
- ["assembly", it]
- }.set{ixd_4_bambu}
-
- sorted_alns.map{
- [it[1]]
- }.collect().map{
- ["assembly", it]
- }.join(ixd_4_bambu).set{data_to_isoquant}
- data_to_isoquant.view()
-
- bambu_out = ISOQUANT_ASSEMBLE(reference, annotation, data_to_isoquant)
- } else if (params.discovery == "NO") {
- } else {
- println "ERROR ################################################################"
- println "${params.discovery} is not compatible with ${params.ref_type}"
- println "bambu requires a genome as reference and an annotation in GTF"
- println "ERROR ################################################################"
- println "Exiting ..."
- System.exit(0)
- }
-
- // Perform MULTIQC report
- MULTIQC(multiqc_data.collect())
-
+ emit:
+ stats_counts = stats_counts
+
}
-workflow preprocess_simple {
+/*
+* Wrapper for ASSEMBLY
+*/
+workflow ASSEMBLY {
+
take:
- bc_fastq
-
- main:
-
- // Perform Fastqc QC on fastq
- fastqc_files = FASTQC(bc_fastq)
-
- // Perform mapping on fastq files
- if (params.mapping == "NO") {
- stats_aln = Channel.value()
- sorted_alns = Channel.value()
- nanoplot_qcs = Channel.value()
- }
- else {
- switch(params.mapping) {
- case "graphmap":
- dna_bc_fastq = RNA2DNA(bc_fastq)
- aln_reads = GRAPHMAP(dna_bc_fastq, reference)
- break
- case "graphmap2":
- aln_reads = GRAPHMAP2(bc_fastq, reference)
- break
- case "minimap2":
- aln_reads = MINIMAP2(bc_fastq, reference)
- break
- case "bwa":
- aln_reads = BWA(reference, bc_fastq)
- break
- default:
- println "ERROR ################################################################"
- println "${params.mapping} is not a supported alignment"
- println "ERROR ################################################################"
- println "Exiting ..."
- System.exit(0)
- break
- }
-
- // Perform SORTING and INDEXING on bam files
- sorted_alns = SAMTOOLS_SORT(aln_reads)
- aln_indexes = SAMTOOLS_INDEX(sorted_alns)
-
- // Converting BAM to CRAM and
- if (params.cram_conv == "YES") {
- good_ref = checkRef(reference)
- bam2Cram(good_ref, params.subsampling_cram, sorted_alns.join(aln_indexes))
- }
-
- // Perform bam2stats on sorted bams
- aln_stats = bam2stats(sorted_alns)
- stats_aln = joinAlnStats(aln_stats.map{ it[1]}.collect())
-
- // Perform NanoPlot on sorted bams
- nanoplot_qcs = NANOPLOT_QC(sorted_alns)
- }
+       sorted_alns
+       aln_indexes
+       reference
+       annotation
+ main:
+
+ if (params.discovery == "bambu" && params.ref_type == "genome"){
+ sorted_alns.map{
+ [it[1]]
+ }.collect().map{
+ ["assembly", it]
+ }.set{data_to_bambu}
+ BAMBU_ASSEMBLE(reference, annotation, data_to_bambu)
+ } else if (params.discovery == "isoquant" && params.ref_type == "genome"){
+ aln_indexes.map{
+ [it[1]]
+ }.collect().map{
+ ["assembly", it]
+ }.set{ixd_4_isoquant}
+
+ sorted_alns.map{
+ [it[1]]
+ }.collect().map{
+ ["assembly", it]
+ }.join(ixd_4_isoquant).set{data_to_isoquant}
+
+ ISOQUANT_ASSEMBLE(reference, annotation, data_to_isoquant)
+ } else if (params.discovery == "NO") {
+ } else {
+ println "ERROR ################################################################"
+ println "${params.discovery} is not compatible with ${params.ref_type}"
+ println "bambu requires a genome as reference and an annotation in GTF"
+ println "ERROR ################################################################"
+ println "Exiting ..."
+ System.exit(0)
+ }
+}
- // OPTIONAL Perform COUNTING / ASSIGNMENT
- if (params.counting == "nanocount" && params.ref_type == "transcriptome") {
- read_counts = NANOCOUNT(sorted_alns.join(aln_indexes))
+workflow BASECALL_MOP {
- //read_counts = NANOCOUNT(sorted_alns)
- assignments = AssignReads(sorted_alns, "nanocount")
- stat_counts = countStats(assignments)
- stats_counts = joinCountStats(stat_counts.map{ it[1]}.collect())
- }
- else if (params.counting == "htseq" && params.ref_type == "genome") {
- htseq_out = HTSEQ_COUNT(params.annotation, sorted_alns.join(aln_indexes))
- read_counts = htseq_out.counts
- assignments = AssignReads(htseq_out.bam, "htseq")
- stat_counts = countStats(assignments)
- stats_counts = joinCountStats(stat_counts.map{ it[1]}.collect())
- }
- else if (params.counting == "NO") {
- // Default empty channels for reporting
- stats_counts = Channel.value()
+ take:
+ input_fast5
+
+ main:
+	if (params.basecalling != "NO" ) {
+		outbc = BASECALL(input_fast5)
+		// all emitted channels must be defined in this branch as well
+		basecalled_fastq = outbc.basecalled_fastq
+		basecalling_stats = outbc.basecalling_stats
+		basecalled_fast5 = outbc.basecalled_fast5
} else {
- println "ERROR ################################################################"
- println "${params.counting} is not compatible with ${params.ref_type}"
- println "htseq requires a genome as reference and an annotation in GTF"
- println "nanocount requires a transcriptome as a reference"
- println "ERROR ################################################################"
- println "Exiting ..."
- System.exit(0)
- }
-
+ basecalled_fast5 = input_fast5
+		basecalling_stats = Channel.empty()
+		basecalled_fastq = Channel.empty()
+ }
- // Perform MULTIQC report
- fastqc_files.map{it[1]}.set{qcs}
- all_res = qcs.mix(multiqc_info,stats_counts, stats_aln)
- MULTIQC(all_res.collect())
+ emit:
+ basecalled_fastq
+ basecalling_stats
+ basecalled_fast5
}
- workflow {
- if (params.fast5 != "" && params.fastq == "") {
+workflow {
+
+ analysis_type = checkInput(params.fast5, params.fastq)
+
+ switch(analysis_type) {
+ // INPUT IS RAW NANOPORE DATA
+ case "fast5":
+ fast5_4_analysis = getFast5(params.fast5)
+ // BASECALL ONLY
+ if (params.demultiplexing == "NO" ) {
+ outbc = BASECALL(fast5_4_analysis)
+ basecalled_fastq = outbc.basecalled_fastq
+ bc_stats = reshapeSamples(outbc.basecalling_stats)
+ }
+ else { // BASECALL AND DEMULTIPLEX
+ switch(params.demultiplexing) {
+ case "deeplexicon":
+ outbc = BASECALL(fast5_4_analysis)
+ demux = DEMULTIPLEX(fast5_4_analysis, outbc.basecalled_fastq)
+ demufq = demux.demultiplexed_fastq
+ bc_stats = reshapeSamples(outbc.basecalling_stats)
+ bc_demux_stats = reshapeSamples(demux.demultiplexed_tsv).groupTuple()
+ break;
+
+ case "guppy":
+ case "readucks":
+ outbc = BASECALL_DEMULTIPLEX(fast5_4_analysis)
+ demufq = outbc.demultiplexed_fastqs
+ bc_stats = reshapeSamples(outbc.basecalling_stats)
+ bc_demux_stats = reshapeSamples(outbc.basecalling_stats).groupTuple()
+ outbc.basecalled_fast5.view()
+ break;
+
+				case "dorado":
+					// dorado demultiplexing is not handled here yet
+					break;
+ }
+
+ bc_stats = reshapeSamples(outbc.basecalling_stats)
+
+ reshapedPrefiltDemufq = demufq.transpose().map{
+ [it[1].name.replace(".fastq.gz", "").replace(".fq.gz", ""), it[1] ]
+ }
+
+ // FILTER BARCODES FOR FASTQ
+ if (params.barcodes != "") {
+ log.info "*********************************************************************"
+ log.info "*************** Selecting only the requested barcodes ***************"
+ log.info "*********************************************************************"
+ basecalled_fastq = filterPerBarcodes(barcodes_to_include, reshapedPrefiltDemufq)
+ } else {
+ basecalled_fastq = reshapedPrefiltDemufq
+ }
+
+ basecalled_fastq.ifEmpty{exit 1, "NO COMBINATION SAMPLE---BARCODEID WAS FOUND\nPLEASE CHECK YOUR BARCODE LIST\nENDING NOW, BYE!!!"}
+
+ // DEMULTI FAST5
+ if (demulti_fast5_opt == "ON") {
+ outbc.basecalled_fast5.view()
+ basecalled_fast5 = reshapeSamples(outbc.basecalled_fast5).transpose().groupTuple()
+ if (params.barcodes == "") {
+ DEMULTI_FAST5(bc_demux_stats, basecalled_fast5)
+ } else {
+ // FILTER BARCODES FOR FAST5
+ DEMULTI_FAST5_FILTER(bc_demux_stats, basecalled_fast5, barcodes_to_include)
+ }
+ }
+ }
+
+ // Perform MinIONQC on basecalling stats
+ basecall_qc = MinIONQC(bc_stats.groupTuple())
+ multiqc_data = multiqc_data.mix(basecall_qc.QC_folder.map{it[1]})
+
+ // SEQUENCE FILTERING
+ bc_fastq = SEQFILTER(basecalled_fastq).out
+
+ // SEQUENCE ALIGNMENT
+ alns = MAPPING(bc_fastq).out
+
+	// Concatenate fastq and BAM files differently depending on whether the data were demultiplexed
+ if (params.demultiplexing == "NO" ) {
+ reshaped_bc_fastq = reshapeSamples(bc_fastq)
+ reshaped_aln_reads = reshapeSamples(alns)
+ } else {
+ reshaped_bc_fastq = reshapeDemuxSamples(bc_fastq)
+ reshaped_aln_reads = reshapeDemuxSamples(alns)
+ }
+
+ jaln_reads = SAMTOOLS_CAT(reshaped_aln_reads.groupTuple())
+ fastq_files = concatenateFastQFiles(reshaped_bc_fastq.groupTuple())
+ break
+
+ // INPUT IS BASECALLED SEQUENCES
+ case "fastq":
+ fastq_files = Channel.fromFilePairs( params.fastq , size: 1, checkIfExists: true)
+ jaln_reads = MAPPING(fastq_files).out
+ break
+ }
+
+ // Perform SORTING and INDEXING on bam files
+ sorted_alns = SAMTOOLS_SORT(jaln_reads)
+ aln_indexes = SAMTOOLS_INDEX(sorted_alns)
+
+	// Convert BAM to CRAM (with optional subsampling)
+ if (params.cram_conv == "YES") {
+ good_ref = checkRef(reference)
+ bam2Cram(good_ref, params.subsampling_cram, sorted_alns.join(aln_indexes))
+ }
+
+ // Perform bam2stats on sorted bams
+ aln_stats = bam2stats(sorted_alns)
+ stats_aln = joinAlnStats(aln_stats.map{ it[1]}.collect())
- Channel
- .fromPath( params.fast5)
- .ifEmpty { error "Cannot find any file matching: ${params.fast5}" }
- .set {fast5_files}
+ // Perform NanoPlot on sorted bams
+ nanoplot_qcs = NANOPLOT_QC(sorted_alns)
+
+ // Perform fastqc QC on fastq
+ fastqc_files = FASTQC(fastq_files)
+ multiqc_data = multiqc_data.mix(stats_aln).mix(fastqc_files.map{it[1]})
+
+
+ stats_counts = COUNTING(sorted_alns, aln_indexes).stats_counts
+ multiqc_data = multiqc_data.mix(stats_counts)
+
+	// REVISE THIS: the ASSEMBLY wrapper also needs aln_indexes
+	//ASSEMBLY(sorted_alns, aln_indexes, reference, params.annotation)
+
+ // Perform MULTIQC report
+ MULTIQC(multiqc_data.collect())
+
+ //all_ver = BAMBU_VER().mix(DEMULTIPLEX_VER()).mix(NANOQ_VER()).mix(NANOFILT_VER())
+ //.mix(GRAPHMAP_VER()).mix(GRAPHMAP2_VER())
+ //.mix(MINIMAP2_VER()).mix(BWA_VER()).mix(FASTQC_VER())
+ //.mix(SAMTOOLS_VERSION()).mix(NANOPLOT_VER()).mix(NANOCOUNT_VER()).mix(HTSEQ_VER()).mix(MULTIQC_VER())
+ //.collectFile(name: 'tool_version.txt', newLine: false, storeDir:outputMultiQC)
- fast5_files.map {
- def filepath = file(it)
- def file_parts = "${filepath}".tokenize("/")
- def folder_name = filepath[-2]
- [folder_name, it]
- }.groupTuple().set{ fast5_per_folder}
-
- // Check tools
- checkTools(tools, progPars)
-
- def num = 0
- fast5_per_folder.map{
- def folder_name = it[0]
- def buffer_files = it[1].flatten().collate(params.granularity)
- [folder_name, buffer_files]
- }.transpose().map{
- num++
- [ "${it[0]}---${num}", it[1] ]
- }.set{ fast5_4_analysis }
-
- //GET_WORKFLOWS(params.flowcell, params.kit).view()
- if (params.basecalling == "guppy" && params.demultiplexing == "NO" ) outf = flow1(fast5_4_analysis)
- else outf = flow2(fast5_4_analysis)
- def bc_fast5 = outf.basecalled_fast5
- def bc_fastq = outf.basecalled_fastq
- def basecalled_stats = outf.basecalled_stats
-
- preprocess_flow(bc_fast5, bc_fastq, basecalled_stats)
-
- } else if(params.fast5 == "" && params.fastq != "") {
-
- // Check tools
- tools["basecalling"] = "NO"
- tools["demultiplexing"] = "NO"
- checkTools(tools, progPars)
- Channel.fromFilePairs( params.fastq , size: 1)
- .ifEmpty { error "Cannot find any file matching: ${params.fastq}" }
- .set {fastq_files}
-
- preprocess_simple(fastq_files)
-
-
- } else {
- println "ERROR ################################################################"
- println "Please choose one between fast5 and fastq as input!!!"
- println "ERROR ################################################################"
- println "Exiting ..."
- System.exit(0)
-
- }
-
- //all_ver = BAMBU_VER().mix(DEMULTIPLEX_VER()).mix(NANOQ_VER()).mix(NANOFILT_VER())
- //.mix(GRAPHMAP_VER()).mix(GRAPHMAP2_VER())
- //.mix(MINIMAP2_VER()).mix(BWA_VER()).mix(FASTQC_VER())
- //.mix(SAMTOOLS_VERSION()).mix(NANOPLOT_VER()).mix(NANOCOUNT_VER()).mix(HTSEQ_VER()).mix(MULTIQC_VER())
- //.collectFile(name: 'tool_version.txt', newLine: false, storeDir:outputMultiQC)
-
-
}
+
workflow.onComplete {
- println "Pipeline BIOCORE@CRG Master of Pore - preprocess completed!"
- println "Started at $workflow.start"
- println "Finished at $workflow.complete"
- println "Time elapsed: $workflow.duration"
- println "Execution status: ${ workflow.success ? 'OK' : 'failed' }"
+
+ def text = final_message("MoP3")
+ println text
+ if (params.hook != "") {
+ notify_slack(text, params.hook)
+ }
}
/*
* Mail notification
*/
-if (params.email == "yourmail@yourdomain" || params.email == "") {
+if (params.email == "yourmail@yourdomain" || params.email == "") {
log.info 'Skipping the email\n'
}
else {
log.info "Sending the email to ${params.email}\n"
workflow.onComplete {
-
- def msg = """\
- Pipeline BIOCORE@CRG Master of Pore 2 preprocess execution summary
- ---------------------------
- Completed at: ${workflow.complete}
- Duration : ${workflow.duration}
- Success : ${workflow.success}
- workDir : ${workflow.workDir}
- exit status : ${workflow.exitStatus}
- Error report: ${workflow.errorReport ?: '-'}
- """
- .stripIndent()
-
- sendMail(to: params.email, subject: "Master of Pore 2 execution", body: msg, attach: "${outputMultiQC}/multiqc_report.html")
+ def msg = final_message("MoP3")
+ sendMail(to: params.email, subject: "MoP3 - preprocess execution", body: msg, attach: "${outputMultiQC}/multiqc_report.html")
}
}
-
-
diff --git a/mop_preprocess/nextflow.config b/mop_preprocess/nextflow.config
index cbedbf8..f9a21e8 100644
--- a/mop_preprocess/nextflow.config
+++ b/mop_preprocess/nextflow.config
@@ -1,4 +1,2 @@
-includeConfig "$baseDir/params.config"
-includeConfig "$baseDir/../nextflow.global.config"
-singularity.cacheDir = "$baseDir/../singularity"
-
+includeConfig "${projectDir}/../nextflow.global.config"
+//singularity.cacheDir = "${projectDir}/../singularity"
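
Note: with params.config no longer included here, run-time parameters are supplied through one of the YAML files added below, passed with Nextflow's -params-file option. A minimal launch sketch (assuming the entry script is mop_preprocess/mop_preprocess.nf and the local profile defined in nextflow.global.config):

    cd mop_preprocess
    nextflow run mop_preprocess.nf -params-file params.f5.yaml -profile local
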
diff --git a/mop_preprocess/params.config b/mop_preprocess/params.config
deleted file mode 120000
index 30bd91d..0000000
--- a/mop_preprocess/params.config
+++ /dev/null
@@ -1 +0,0 @@
-params.config.test
\ No newline at end of file
diff --git a/mop_preprocess/params.config.test b/mop_preprocess/params.config.test
deleted file mode 100644
index b310427..0000000
--- a/mop_preprocess/params.config.test
+++ /dev/null
@@ -1,32 +0,0 @@
-params {
- conffile = "final_summary_01.txt"
- fast5 = "$baseDir/../data/**/*.fast5"
- fastq = ""
-
- reference = "$baseDir/../anno/yeast_rRNA_ref.fa.gz"
- annotation = ""
- ref_type = "transcriptome"
-
- pars_tools = "drna_tool_splice_opt.tsv"
- output = "$baseDir/output"
- qualityqc = 5
- granularity = 1
-
- basecalling = "guppy"
- GPU = "OFF"
- demultiplexing = "NO"
- demulti_fast5 = "NO"
-
- filtering = "nanoq"
-
- mapping = "graphmap"
- counting = "nanocount"
- discovery = "NO"
-
- cram_conv = "YES"
- subsampling_cram = 50
-
- saveSpace = "NO"
-
- email = ""
-}
diff --git a/mop_preprocess/params.f5.demdna.yaml b/mop_preprocess/params.f5.demdna.yaml
new file mode 100644
index 0000000..8cfb472
--- /dev/null
+++ b/mop_preprocess/params.f5.demdna.yaml
@@ -0,0 +1,43 @@
+# Parameters
+
+# Needed for fast5 input
+fast5: "${projectDir}/../data/fast5_dna_dem/**/*.fast5"
+## This can be empty, but then you need to specify the kit and flowcell via the command-line options in pars_tools
+conffile: "final_summary_FAT12104_2836aa20.txt"
+## Can be either guppy or dorado
+basecalling: "guppy"
+## Can be OFF / cuda10 / cuda11. Newer versions of Guppy may require cuda11
+GPU: "cuda11"
+demultiplexing: "guppy"
+demulti_fast5: "YES"
+### Number of fast5 files basecalled per parallel job
+granularity: 1
+### File with the list of accepted barcodes. It can be empty
+barcodes: ""
+
+# Needed for fastq input
+fastq: ""
+
+# Common
+reference: "${projectDir}/../anno/yeast_rRNA_ref.fa.gz"
+## Can be transcriptome / genome
+ref_type: "transcriptome"
+annotation: ""
+## Tool-specific command line options (TSV file)
+pars_tools: "${projectDir}/tool_opts/cdna_tool_opt.tsv"
+## Quality cut-off for QC
+qualityqc: 5
+## Can be nanoq / nanofilt
+filtering: "nanoq"
+## Can be graphmap / graphmap2 / minimap2 / bwa
+mapping: "graphmap"
+## Can be nanocount for transcriptome / htseq for genome
+counting: "nanocount"
+## Can be NO / bambu / isoquant
+discovery: "NO"
+## Convert bam to cram
+cram_conv: "YES"
+subsampling_cram: 50
+hook: ""
+email: ""
+output: "${projectDir}/outfolder"
diff --git a/mop_preprocess/params.f5.yaml b/mop_preprocess/params.f5.yaml
new file mode 100644
index 0000000..d7aa88d
--- /dev/null
+++ b/mop_preprocess/params.f5.yaml
@@ -0,0 +1,43 @@
+# Parameters
+
+# Needed for fast5 input
+fast5: "${projectDir}/../data/fast5/**/*.fast5"
+## This can be empty, but then you need to specify the kit and flowcell via the command-line options in pars_tools
+conffile: "final_summary_01.txt"
+## Can be either guppy or dorado
+basecalling: "guppy"
+## Can be OFF / cuda10 / cuda11. Newer versions of Guppy may require cuda11
+GPU: "OFF"
+demultiplexing: "NO"
+demulti_fast5: "NO"
+### Number of fast5 files basecalled per parallel job
+granularity: 1
+### File with the list of accepted barcodes. It can be empty
+barcodes: ""
+
+# Needed for fastq input
+fastq: ""
+
+# Common
+reference: "${projectDir}/../anno/yeast_rRNA_ref.fa.gz"
+## Can be transcriptome / genome
+ref_type: "transcriptome"
+annotation: ""
+## Tool-specific command line options (TSV file)
+pars_tools: "${projectDir}/tool_opts/drna_tool_splice_opt.tsv"
+## Quality cut-off for QC
+qualityqc: 5
+## Can be nanoq / nanofilt
+filtering: "nanoq"
+## Can be graphmap / graphmap2 / minimap2 / bwa
+mapping: "graphmap"
+## Can be nanocount for transcriptome / htseq for genome
+counting: "nanocount"
+## Can be NO / bambu / isoquant
+discovery: "NO"
+## Convert bam to cram
+cram_conv: "YES"
+subsampling_cram: 50
+hook: ""
+email: ""
+output: "${projectDir}/outfolder"
diff --git a/mop_preprocess/params.fq.yaml b/mop_preprocess/params.fq.yaml
new file mode 100644
index 0000000..be855d6
--- /dev/null
+++ b/mop_preprocess/params.fq.yaml
@@ -0,0 +1,43 @@
+# Parameters
+
+# Needed for fast5 input
+fast5: ""
+## This can be empty, but then you need to specify the kit and flowcell via the command-line options in pars_tools
+conffile: ""
+## Can be either guppy or dorado
+basecalling: ""
+## Can be OFF / cuda10 / cuda11. Newer versions of Guppy may require cuda11
+GPU: "OFF"
+demultiplexing: "NO"
+demulti_fast5: "NO"
+### Number of fast5 files basecalled per parallel job
+granularity: 1
+### File with the list of accepted barcodes. It can be empty
+barcodes: ""
+
+# Needed for fastq input
+fastq: "${projectDir}/../data/fastq/*.fq.gz"
+
+# Common
+reference: "${projectDir}/../anno/yeast_rRNA_ref.fa.gz"
+## Can be transcriptome / genome
+ref_type: "transcriptome"
+annotation: ""
+## Tool-specific command line options (TSV file)
+pars_tools: "${projectDir}/tool_opts/drna_tool_splice_opt.tsv"
+## Quality cut-off for QC
+qualityqc: 5
+## Can be nanoq / nanofilt
+filtering: "nanoq"
+## Can be graphmap / graphmap2 / minimap2 / bwa
+mapping: "graphmap"
+## Can be nanocount for transcriptome / htseq for genome
+counting: "nanocount"
+## Can be NO / bambu / isoquant
+discovery: "NO"
+## Convert bam to cram
+cram_conv: "YES"
+subsampling_cram: 50
+hook: ""
+email: ""
+output: "${projectDir}/outfolder_fq"
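
The three templates cover the usual entry points: params.f5.yaml for raw fast5 input, params.f5.demdna.yaml for basecalling plus demultiplexing, and params.fq.yaml for already basecalled reads. Individual values can also be overridden on the command line with double-dash parameters; a sketch with a placeholder read path, again assuming the mop_preprocess.nf entry script:

    nextflow run mop_preprocess.nf -params-file params.fq.yaml \
        --fastq "/path/to/reads/*.fq.gz" --mapping minimap2
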
diff --git a/mop_preprocess/cdna_tool_opt.tsv b/mop_preprocess/tool_opts/cdna_tool_opt.tsv
similarity index 92%
rename from mop_preprocess/cdna_tool_opt.tsv
rename to mop_preprocess/tool_opts/cdna_tool_opt.tsv
index 9e1cd21..91f04c9 100644
--- a/mop_preprocess/cdna_tool_opt.tsv
+++ b/mop_preprocess/tool_opts/cdna_tool_opt.tsv
@@ -1,5 +1,5 @@
#step tool extrapars
-basecalling guppy "--kit"
+basecalling guppy ""
demultiplexing guppy "--barcode_kits EXP-NBD104"
filtering nanofilt ""
filtering nanoq ""
diff --git a/mop_preprocess/tool_opts/cdna_tool_readucks_opt.tsv b/mop_preprocess/tool_opts/cdna_tool_readucks_opt.tsv
new file mode 100644
index 0000000..844e401
--- /dev/null
+++ b/mop_preprocess/tool_opts/cdna_tool_readucks_opt.tsv
@@ -0,0 +1,13 @@
+#step tool extrapars
+basecalling guppy ""
+demultiplexing guppy "--flowcell FLO-MIN106 --kit SQK-DCS109 --barcode_kits EXP-NBD104"
+demultiplexing readucks "--limit_barcodes_to 1 2 3 4 --native_barcodes --threshold 50"
+filtering nanofilt ""
+filtering nanoq ""
+mapping graphmap ""
+mapping graphmap2 "-x rnaseq"
+mapping minimap2 "-ax splice -L --secondary=no"
+mapping bwa ""
+counting htseq "-a 0 --secondary-alignments ignore"
+counting nanocount ""
+discovery bambu ""
diff --git a/mop_preprocess/dna_tool_opt.tsv b/mop_preprocess/tool_opts/dna_tool_opt.tsv
similarity index 100%
rename from mop_preprocess/dna_tool_opt.tsv
rename to mop_preprocess/tool_opts/dna_tool_opt.tsv
diff --git a/mop_preprocess/tool_opts/drna_tool_m6A_splice_opt.tsv b/mop_preprocess/tool_opts/drna_tool_m6A_splice_opt.tsv
new file mode 100644
index 0000000..21e30fb
--- /dev/null
+++ b/mop_preprocess/tool_opts/drna_tool_m6A_splice_opt.tsv
@@ -0,0 +1,14 @@
+#step tool extrapars
+basecalling dorado "rna002_70bps_hac@v3"
+basecalling guppy "-c rna_r9.4.1_70bps_m6A_hac.cfg"
+demultiplexing deeplexicon ""
+demultiplexing guppy ""
+filtering nanofilt ""
+filtering nanoq ""
+mapping graphmap ""
+mapping graphmap2 "-x rnaseq"
+mapping minimap2 "-uf -ax splice -k14"
+mapping bwa ""
+counting htseq "-a 0"
+counting nanocount ""
+discovery bambu ""
diff --git a/mop_preprocess/drna_tool_splice_opt.tsv b/mop_preprocess/tool_opts/drna_tool_splice_opt.tsv
similarity index 79%
rename from mop_preprocess/drna_tool_splice_opt.tsv
rename to mop_preprocess/tool_opts/drna_tool_splice_opt.tsv
index 03ffb45..25f469e 100644
--- a/mop_preprocess/drna_tool_splice_opt.tsv
+++ b/mop_preprocess/tool_opts/drna_tool_splice_opt.tsv
@@ -1,13 +1,14 @@
#step tool extrapars
+basecalling dorado "rna002_70bps_hac@v3"
basecalling guppy ""
demultiplexing deeplexicon "-f multi -m resnet20-final.h5"
demultiplexing guppy ""
filtering nanofilt ""
-filtering nanoq ""
+filtering nanoq ""
mapping graphmap ""
mapping graphmap2 "-x rnaseq"
mapping minimap2 "-uf -ax splice -k14"
-mapping bwa ""
+mapping bwa ""
counting htseq "-a 0"
counting nanocount ""
discovery bambu ""
diff --git a/mop_preprocess/drna_tool_unsplice_guppy6_opt.tsv b/mop_preprocess/tool_opts/drna_tool_splice_opt_ozu.tsv
similarity index 51%
rename from mop_preprocess/drna_tool_unsplice_guppy6_opt.tsv
rename to mop_preprocess/tool_opts/drna_tool_splice_opt_ozu.tsv
index 6bae2df..9cf2dc2 100644
--- a/mop_preprocess/drna_tool_unsplice_guppy6_opt.tsv
+++ b/mop_preprocess/tool_opts/drna_tool_splice_opt_ozu.tsv
@@ -1,12 +1,12 @@
#step tool extrapars
-demultiplexing deeplexicon "-f multi -m resnet20-final.h5"
+basecalling guppy "-c dna_r9.4.1_450bps_hac.cfg --barcode_kits EXP-NBD104 "
+demultiplexing deeplexicon ""
demultiplexing guppy ""
filtering nanofilt ""
-basecalling guppy "--disable_qscore_filtering"
filtering nanoq ""
mapping graphmap ""
-mapping graphmap2 ""
-mapping minimap2 "-ax map-ont -k14"
+mapping graphmap2 "-x rnaseq"
+mapping minimap2 "-uf -ax splice -k14"
mapping bwa ""
counting htseq "-a 0"
counting nanocount ""
diff --git a/mop_preprocess/drna_tool_unsplice_opt.tsv b/mop_preprocess/tool_opts/drna_tool_unsplice_opt.tsv
similarity index 100%
rename from mop_preprocess/drna_tool_unsplice_opt.tsv
rename to mop_preprocess/tool_opts/drna_tool_unsplice_opt.tsv
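
The per-tool extra options now live under mop_preprocess/tool_opts/ and are selected through pars_tools, either in the YAML file or on the command line. A hypothetical run switching to the readucks demultiplexing options:

    nextflow run mop_preprocess.nf -params-file params.f5.demdna.yaml \
        --pars_tools tool_opts/cdna_tool_readucks_opt.tsv --demultiplexing readucks
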
diff --git a/mop_tail/mop_tail.nf b/mop_tail/mop_tail.nf
index 55ca6b2..c3aca63 100755
--- a/mop_tail/mop_tail.nf
+++ b/mop_tail/mop_tail.nf
@@ -2,41 +2,42 @@
nextflow.enable.dsl=2
-/*
+/*
* Define the pipeline parameters
*
*/
// Pipeline version
-version = '2.0'
+version = '3.0'
params.help = false
params.resume = false
log.info """
-╔╦╗╔═╗╔═╗ ╔╦╗╔═╗╦╦
-║║║║ ║╠═╝ ║ ╠═╣║║
+╔╦╗╔═╗╔═╗ ╔╦╗╔═╗╦╦
+║║║║ ║╠═╝ ║ ╠═╣║║
╩ ╩╚═╝╩ ╩ ╩ ╩╩╩═╝
-
+
====================================================
-BIOCORE@CRG Master of Pores 2. Estimating PolyA tail - N F ~ version ${version}
+BIOCORE@CRG Master of Pores 3. Estimating PolyA tail - N F ~ version ${version}
====================================================
***************** Input files *********************
input_path : ${params.input_path}
output : ${params.output}
-pars_tools : ${params.pars_tools}
+pars_tools : ${params.pars_tools}
******* reference has to be the genome **********
reference : ${params.reference}
-email : ${params.email}
+email : ${params.email}
************************* Flows *******************************
tailfindr : ${params.tailfindr}
nanopolish : ${params.nanopolish}
-
+***************************************************************
+tailfindr_mode : ${params.tailfindr_mode}
email : ${params.email}
"""
@@ -58,21 +59,39 @@ def flows = [:]
flows["tailfindr"] = params.tailfindr
flows["nanopolish"] = params.nanopolish
-include { getParameters; checkRef } from "${local_modules}"
+include { getParameters; checkRef } from "${local_modules}"
progPars = getParameters(params.pars_tools)
-include { ESTIMATE_TAIL as TAILFINDR_ESTIMATE_TAIL } from "${subworkflowsDir}/chem_modification/tailfindr" addParams(LABEL: 'big_cpus_retry', EXTRAPARS: progPars["tailfindr--tailfindr"])
-include { GET_VERSION as TAILFINDR_VER } from "${subworkflowsDir}/chem_modification/tailfindr"
+
+switch(params.tailfindr_mode) {
+ case "n3ps_r9":
+ tailfindr_mode = params.tailfindr_mode
+ println "tailfindr is in nano3p mode, R9 chemistry"
+ break
+ case "n3ps_r10":
+ tailfindr_mode = params.tailfindr_mode
+ println "tailfindr is in nano3p mode, R10 chemistry"
+ break
+ default:
+ tailfindr_mode = "default"
+ println "tailfindr is in default mode"
+ break
+}
+
+
+include { GET_VERSION as TAILFINDR_VER; ESTIMATE_TAIL as TAILFINDR_ESTIMATE_TAIL } from "${subworkflowsDir}/chem_modification/tailfindr" addParams(LABEL: 'big_cpus_retry', EXTRAPARS: progPars["tailfindr--tailfindr"], MODE:tailfindr_mode)
+
+
include { GET_VERSION as SAMTOOLS_VER; INDEX as SAMTOOLS_INDEX } from "${subworkflowsDir}/misc/samtools"
include { POLYA_LEN as NANOPOLISH_POLYA_LEN } from "${subworkflowsDir}/chem_modification/nanopolish" addParams(LABEL: 'big_cpus', OUTPUT: outputNanopolish, EXTRAPARS: progPars["nanopolish--nanopolish"])
-include { GET_VERSION as NANOPOLISH_VER } from "${subworkflowsDir}/chem_modification/nanopolish"
+include { GET_VERSION as NANOPOLISH_VER } from "${subworkflowsDir}/chem_modification/nanopolish"
-include { reshapeSamples } from "${local_modules}"
+include { reshapeSamples } from "${local_modules}"
include { collect_tailfindr_results} addParams(OUTPUT: outputTailFindr) from "${local_modules}"
include { join_nanotail_results } addParams(OUTPUT: outputFinalPolyA) from "${local_modules}"
include { filter_bam} addParams(LABEL: 'big_cpus') from "${local_modules}"
-
+
Channel.fromFilePairs("${params.input_path}/alignment/*_s.bam", size: 1).set{bams}
Channel.fromFilePairs("${params.input_path}/alignment/*_s.bam.bai", size: 1).set{bais}
@@ -91,7 +110,7 @@ fast5_files_4_np.map{
}.set{fast5_files_4_tf}
-workflow {
+workflow {
if (params.tailfindr == "YES") {
tail_estim = TAILFINDR_ESTIMATE_TAIL(fast5_files_4_np)
@@ -103,13 +122,12 @@ workflow {
ref_file = checkRef(reference)
filt_bams = filter_bam(ref_file, bams)
filt_bais = SAMTOOLS_INDEX(filt_bams)
- nanores = NANOPOLISH_POLYA_LEN(fast5_files_4_np, bams, bais, fastqs, ref_file)
+ nanores = NANOPOLISH_POLYA_LEN(fast5_files_4_np, bams, bais, fastqs, ref_file)
}
if (params.tailfindr == "YES" && params.nanopolish == "YES") {
- log.info "Joining results"
- //nanores.filtered_est.view()
- join_nanotail_results(nanores.filtered_est.join(tailres.length).join(assigned), joinScript)
-
+ log.info "Joining results from TailfindR and NanoPolish\n\n"
+ //nanores.filtered_est.view()
+ join_nanotail_results(nanores.filtered_est.join(tailres.length).join(assigned), joinScript)
}
all_ver = TAILFINDR_VER().mix(NANOPOLISH_VER())
@@ -124,7 +142,7 @@ workflow {
*/
workflow.onComplete {
println "Pipeline BIOCORE@CRG Master of Pore completed!"
- println "Started at $workflow.start"
+ println "Started at $workflow.start"
println "Finished at $workflow.complete"
println "Time elapsed: $workflow.duration"
println "Execution status: ${ workflow.success ? 'OK' : 'failed' }"
@@ -134,7 +152,7 @@ workflow.onComplete {
* Mail notification
*/
-if (params.email == "yourmail@yourdomain" || params.email == "") {
+if (params.email == "yourmail@yourdomain" || params.email == "") {
log.info 'Skipping the email\n'
}
else {
diff --git a/mop_tail/nextflow.config b/mop_tail/nextflow.config
index 421a6c0..bcff46c 100644
--- a/mop_tail/nextflow.config
+++ b/mop_tail/nextflow.config
@@ -1,5 +1,2 @@
-includeConfig "$baseDir/params.config"
includeConfig "../nextflow.global.config"
-singularity.cacheDir = "$baseDir/../singularity"
-
-
+//singularity.cacheDir = "$baseDir/../singularity"
diff --git a/mop_tail/params.config b/mop_tail/params.config
deleted file mode 120000
index 30bd91d..0000000
--- a/mop_tail/params.config
+++ /dev/null
@@ -1 +0,0 @@
-params.config.test
\ No newline at end of file
diff --git a/mop_tail/params.config.test b/mop_tail/params.config.test
deleted file mode 100755
index 05b212e..0000000
--- a/mop_tail/params.config.test
+++ /dev/null
@@ -1,14 +0,0 @@
-params {
-
- input_path = "$baseDir/../mop_preprocess/output/"
- reference = "$baseDir/../anno/yeast_rRNA_ref.fa.gz"
-
- pars_tools = "$baseDir/tools_opt.tsv"
-
- output = "$baseDir/outputPoly"
-
- tailfindr = "YES"
- nanopolish = "YES"
-
- email = "yourname@yourdomain"
-}
diff --git a/mop_tail/params.yaml b/mop_tail/params.yaml
new file mode 100644
index 0000000..a8d92fb
--- /dev/null
+++ b/mop_tail/params.yaml
@@ -0,0 +1,15 @@
+input_path: "${projectDir}/../mop_preprocess/outfolder/"
+reference: "${projectDir}/../anno/yeast_rRNA_ref.fa.gz"
+
+pars_tools: "${projectDir}/tools_opt.tsv"
+
+output: "${projectDir}/outputPoly"
+
+tailfindr: "YES"
+
+# Different modes: standard, n3ps_r9 or n3ps_r10
+tailfindr_mode: "standard"
+
+nanopolish: "YES"
+
+email: "yourname@yourdomain"
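
mop_tail reads the alignments and fastq files from the mop_preprocess output folder given in input_path. A minimal launch sketch, assuming the local profile:

    cd mop_tail
    nextflow run mop_tail.nf -params-file params.yaml -profile local
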
diff --git a/nextflow.global.config b/nextflow.global.config
index 529e22c..d3d9b38 100644
--- a/nextflow.global.config
+++ b/nextflow.global.config
@@ -1,4 +1,10 @@
-singularity.autoMounts = true
+singularity {
+ autoMounts = true
+ //runOptions = '-B $SINGULARITY_TMPDIR:/tmp -B $SINGULARITY_TMPDIR:/scratch'
+ //Used to allow Singularity to access bashrc variables
+ envWhitelist = ['SINGULARITY_TMPDIR']
+}
+
env {
R_PROFILE_USER = "/.Rprofile"
@@ -6,10 +12,6 @@ env {
PYTHONNOUSERSITE = 1
}
-singularity {
- runOptions = "--no-home"
-}
-
profiles {
m1mac {
includeConfig 'conf/m1_apple.config'
@@ -20,8 +22,14 @@ profiles {
local {
includeConfig 'conf/local.config'
}
+ crg {
+ includeConfig 'conf/crg.config'
+ }
+ newcrg {
+ includeConfig 'conf/newcrg.config'
+ }
cluster {
- includeConfig 'conf/sge.config'
+ includeConfig 'conf/crg.config'
}
sge {
includeConfig 'conf/sge.config'
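
Since SINGULARITY_TMPDIR is now whitelisted, it can be exported before launching so that Singularity uses a writable scratch area; the path below is a placeholder and the profile should match your site:

    export SINGULARITY_TMPDIR=/scratch/$USER/singularity_tmp
    nextflow run mop_preprocess.nf -params-file params.f5.yaml -profile newcrg
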
diff --git a/outdirs.nf b/outdirs.nf
index 0f93650..593872d 100644
--- a/outdirs.nf
+++ b/outdirs.nf
@@ -17,12 +17,9 @@ outputAssigned = "${params.output}/assigned"
outputEpinanoFlow = "${params.output}/epinano_flow"
outputNanoPolComFlow = "${params.output}/nanopolish-compore_flow"
outputTomboFlow = "${params.output}/tombo_flow"
+outputModPhredFlow = "${params.output}/modphred_flow"
// MOP_TAIL
outputTailFindr = "${params.output}/tailfindr_flow"
outputNanopolish = "${params.output}/nanopolish_flow"
outputFinalPolyA = "${params.output}/polya_common"
-
-
-
-
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..483a4e9
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+sphinx_rtd_theme
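
requirements.txt currently pins only the Read the Docs theme used by the documentation. A sketch of building the docs locally, assuming a standard Sphinx layout under docs/:

    pip install sphinx -r requirements.txt
    sphinx-build -b html docs docs/_build/html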