Merge remote-tracking branch 'refs/remotes/origin/master'
lucacozzuto committed Sep 4, 2024
2 parents acf88bb + 6ded4a1 commit f3a8497
Showing 7 changed files with 41 additions and 14 deletions.
2 changes: 1 addition & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[submodule "BioNextflow"]
path = BioNextflow
url = git@github.com:biocorecrg/BioNextflow.git
url = https://github.com/biocorecrg/BioNextflow
13 changes: 13 additions & 0 deletions docs/mop_mod.rst
@@ -112,3 +112,16 @@ Here an example of a plot from Epinano:
gene_A 5 GAAGA 1 104.25 471
4. **Nanocompore** results are stored in **nanopolish-compore_flow** directory. It contains one file per comparison (**wt_1_vs_ko_1_nanocompore_results.tsv**). Default output from Nanocompore (see Nanocompore's repository for a more detailed explanation).

Encoding of modification information from m6A-aware basecalled data using modPhred
=====================================================================================

Once the data has been basecalled with our m6A modification-aware basecalling model, the modification information must be encoded for later downstream analysis. This step is performed by **modPhred**, another tool included in the **mop_mod** module.

To run this tool, specify ``modphred: "YES"`` in the ``params.yaml`` file and run the commands below:

.. code-block:: console

    cd mop_mod
    nextflow run mop_mod.nf -params-file params.yaml -with-singularity -bg > yourlog.txt
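As a sketch, the relevant fragment of ``params.yaml`` would look like this (only the ``modphred`` key is documented on this page; every other entry of your existing params file stays as it is):

```yaml
# Enable the modPhred encoding step of mop_mod, as described above.
# All other parameters of params.yaml are left unchanged.
modphred: "YES"
```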
17 changes: 16 additions & 1 deletion docs/mop_preprocess.rst
@@ -7,7 +7,9 @@ MOP_PREPROCESS
.. autosummary::
:toctree: generated

This pipeline takes as input the raw fast5 reads - single or multi - and it produces several outputs (basecalled fast5, sequences in fastq format, aligned reads in BAM format etc). The pre-processing pipeline can perform base-calling, demultiplexing (optional), filtering, quality control, mapping to a reference (either a genome or a transcriptome), feature counting, discovery of novel transcripts, and it generates a final report with the performance and results of each of the steps performed. It automatically detects the kind of input fast5 file (single or multi-sequence). It can also support the new pod5 format but it won't output basecalled fastq useful for the other pipelines. The basecalling can be performed with guppy or dorado and the demultiplexing with either guppy, deeplexicon or seqtagger. Basecalled fastq and Fast5 files can be demultiplexed as well. You can restrict the number of barcodes by indicating a file with barcode list using the **barcodes** parameter.
This pipeline takes as input raw fast5 reads (single or multi) and produces several outputs (basecalled fast5 files, sequences in fastq format, aligned reads in BAM format, etc.). The pre-processing pipeline can perform basecalling, demultiplexing (optional), filtering, quality control, mapping to a reference (either a genome or a transcriptome), feature counting, and discovery of novel transcripts, and it generates a final report with the performance and results of each step performed.

It automatically detects the kind of input fast5 file (single or multi-sequence). It also supports the new pod5 format, but in that case it will not output the basecalled fastq files needed by the other pipelines. Basecalling can be performed with guppy or dorado, and demultiplexing with guppy, deeplexicon, or seqtagger. Basecalled fastq and fast5 files can be demultiplexed as well. You can restrict the set of barcodes by providing a file with the barcode list via the **barcodes** parameter.


.. image:: ../img/flow_preproc.png
@@ -190,6 +192,18 @@ The sample id is given by either the folder containing the fast5 files or the ba
The naming convention of the different barcodes is decided by each tool, so **seqtagger** will produce **bc_1**, **bc_2**, etc. while guppy will produce **barcode01**, **barcode02**, etc.


Basecalling with the m6A-aware model
=========================================

For m6A basecalling, specify ``basecalling: "guppy"`` and ``pars_tools: "tool_opts/drna_tool_m6A_splice_opt.tsv"`` in your ``params.f5.yaml`` file so that guppy uses the m6A model. Your output folder will contain a ``fast5_files`` directory with the m6A basecalled fast5 files for downstream analysis. Then run:

.. code-block:: console

    cd mop_preprocess
    nextflow run mop_preprocess.nf -params-file params.f5.yaml -with-singularity -bg > yourlog.txt

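As a sketch, the m6A-related fragment of ``params.f5.yaml`` would look like this (both keys are taken from the description above; the rest of your params file stays unchanged):

```yaml
# Use guppy with the m6A-aware model for basecalling, as described above.
# All other parameters of params.f5.yaml are left unchanged.
basecalling: "guppy"
pars_tools: "tool_opts/drna_tool_m6A_splice_opt.tsv"
```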
Results
====================

@@ -208,3 +222,4 @@ Several folders are created by the pipeline within the output directory specifie

.. note::
MOP3 will automatically detect the version of guppy and modify the parameters accordingly. Unlike in MOP2, you don't need to add any extra parameter.

13 changes: 6 additions & 7 deletions local_modules.nf
@@ -350,17 +350,16 @@ process joinEpinanoRes {
tuple val(id), path(epinanores)

output:
tuple val(id), path("*.plus_strand.per.site.csv.gz"), emit: plusepi
tuple val(id), path("*.minus_strand.per.site.csv.gz"), emit: minusepi

tuple val(id), path("*.plus_strand.per.site.csv.gz"), optional: true, emit: plusepi
tuple val(id), path("*.minus_strand.per.site.csv.gz"), optional: true, emit: minusepi

script:
"""
if compgen -G "*.plus_strand.per.site.csv.gz" > /dev/null; then
zcat *pieces*.plus_strand.per.site.csv.gz | awk '!(NR>1 && /#Ref/)' | gzip >> ${id}.plus_strand.per.site.csv.gz
if compgen -G "*.fwd.per.site.csv.gz" > /dev/null; then
zcat *pieces*.fwd.per.site.csv.gz | awk '!(NR>1 && /#Ref/)' | gzip >> ${id}.plus_strand.per.site.csv.gz
fi
if compgen -G "*.minus_strand.per.site.csv.gz" > /dev/null; then
zcat *pieces*.minus_strand.per.site.csv.gz | awk '!(NR>1 && /#Ref/)' | gzip >> ${id}.minus_strand.per.site.csv.gz
if compgen -G "*.rev.per.site.csv.gz" > /dev/null; then
zcat *pieces*.rev.per.site.csv.gz | awk '!(NR>1 && /#Ref/)' | gzip >> ${id}.minus_strand.per.site.csv.gz
fi
"""
}
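The ``awk '!(NR>1 && /#Ref/)'`` filter in the script above keeps the first ``#Ref`` header line and drops the repeated headers of the subsequent per-piece files as they are concatenated. A minimal stand-alone illustration with made-up data:

```shell
# Concatenate two toy per-piece CSVs, keeping only the first '#Ref' header line.
printf '#Ref,pos\nchr1,10\n#Ref,pos\nchr1,20\n' | awk '!(NR>1 && /#Ref/)'
```

This prints the header once followed by the two data rows, which is what allows the pipeline to merge the gzipped pieces into a single well-formed CSV.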
@@ -29,7 +29,7 @@ epinano_processing <- function(sample_file, ivt_file, initial_position, final_po
sample <- subset(sample, pos<=final_position)
sample$reference <- paste(sample$X.Ref, sample$pos, sep='_')
sample$Difference <- as.numeric(sample$mis)+as.numeric(sample$ins)+as.numeric(sample$del)
sample <- sample[,c(1,2,13,12)]
sample <- sample[,c(1,2,14,13)]
colnames(sample) <- c('Reference', 'Position', 'Difference_sample', 'Merge')

ivt <- read_csv_file(ivt_file)
@@ -38,7 +38,7 @@ epinano_processing <- function(sample_file, ivt_file, initial_position, final_po
ivt <- subset(ivt, pos<=final_position)
ivt$reference <- paste(ivt$X.Ref, ivt$pos, sep='_')
ivt$Difference <- as.numeric(ivt$mis)+as.numeric(ivt$ins)+as.numeric(ivt$del)
ivt <- ivt[,c(1,2,13,12)]
ivt <- ivt[,c(1,2,14,13)]
colnames(ivt) <- c('Reference', 'Position', 'Difference_IVT', 'Merge')

if (nrow(sample)!=0 && nrow(ivt)!=0) {
4 changes: 2 additions & 2 deletions mop_mod/mop_mod.nf
@@ -72,7 +72,7 @@ include { getParameters; mapIDPairs } from "${local_modules}"
// Create a channel for tool options
progPars = getParameters(params.pars_tools)

include { calcVarFrequencies as EPINANO_CALC_VAR_FREQUENCIES } from "${subworkflowsDir}/chem_modification/epinano_1.2.nf" addParams(LABEL: 'big_mem_cpus', EXTRAPARS: progPars["epinano--epinano"])
include { calcVarFrequencies as EPINANO_CALC_VAR_FREQUENCIES } from "${subworkflowsDir}/chem_modification/epinano_1.2.4.nf" addParams(LABEL: 'big_mem_cpus', EXTRAPARS: progPars["epinano--epinano"])
include { joinEpinanoRes } from "${local_modules}" addParams(OUTPUT: outputEpinanoFlow)

include { RUNBYCHROM as MODPHRED_CHR } from "${subworkflowsDir}/chem_modification/modphred.nf" addParams(LABEL: 'big_mem_cpus', EXTRAPARS: progPars["modphred--modphred"], OUTPUT: outputModPhredFlow)
@@ -84,7 +84,7 @@ include { RESQUIGGLE_RNA as TOMBO_RESQUIGGLE_RNA } from "${subworkflowsDir}/chem
include { GET_MODIFICATION_MSC as TOMBO_GET_MODIFICATION_MSC } from "${subworkflowsDir}/chem_modification/tombo.nf" addParams(LABEL: 'big_mem_cpus', EXTRAPARS: progPars["tombo_msc--tombo"], OUTPUT: outputTomboFlow)
include { GET_MODIFICATION_LSC as TOMBO_GET_MODIFICATION_LSC } from "${subworkflowsDir}/chem_modification/tombo.nf" addParams(LABEL: 'big_mem_cpus', EXTRAPARS: progPars["tombo_lsc--tombo"], OUTPUT: outputTomboFlow)

include { GET_VERSION as EPINANO_VER } from "${subworkflowsDir}/chem_modification/epinano_1.2.nf"
include { GET_VERSION as EPINANO_VER } from "${subworkflowsDir}/chem_modification/epinano_1.2.4.nf"
include { GET_VERSION as NANOPOLISH_VER } from "${subworkflowsDir}/chem_modification/nanopolish"
include { GET_VERSION as NANOCOMPORE_VER } from "${subworkflowsDir}/chem_modification/nanocompore"
include { GET_VERSION as TOMBO_VER } from "${subworkflowsDir}/chem_modification/tombo.nf"
