Skip to content

Commit a2c9f87

Browse files
authoredNov 7, 2023
Merge pull request #1032 from nf-core/patch
DSL1: Release 2.5.0 - Bopfingen
2 parents 631f18e + f3cde49 commit a2c9f87

10 files changed

+1906
-1715
lines changed
 

‎.github/workflows/ci.yml

+13-10
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ jobs:
2323
strategy:
2424
matrix:
2525
# Nextflow versions: check pipeline minimum and current latest
26-
nxf_ver: ['20.07.1', '22.10.6']
26+
nxf_ver: ["20.07.1", "22.10.6"]
2727
steps:
2828
- name: Check out pipeline code
2929
uses: actions/checkout@v2
@@ -37,13 +37,13 @@ jobs:
3737
3838
- name: Build new docker image
3939
if: env.MATCHED_FILES
40-
run: docker build --no-cache . -t nfcore/eager:2.4.7
40+
run: docker build --no-cache . -t nfcore/eager:2.5.0
4141

4242
- name: Pull docker image
4343
if: ${{ !env.MATCHED_FILES }}
4444
run: |
4545
docker pull nfcore/eager:dev
46-
docker tag nfcore/eager:dev nfcore/eager:2.4.7
46+
docker tag nfcore/eager:dev nfcore/eager:2.5.0
4747
4848
- name: Install Nextflow
4949
env:
@@ -58,7 +58,7 @@ jobs:
5858
run: |
5959
git clone --single-branch --branch eager https://github.com/nf-core/test-datasets.git data
6060
- name: DELAY to try address some odd behaviour with what appears to be a conflict between parallel htslib jobs leading to CI hangs
61-
run: |
61+
run: |
6262
if [[ $NXF_VER = '' ]]; then sleep 1200; fi
6363
- name: BASIC Run the basic pipeline with directly supplied single-end FASTQ
6464
run: |
@@ -74,7 +74,7 @@ jobs:
7474
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --save_reference
7575
- name: REFERENCE Basic workflow, with supplied indices
7676
run: |
77-
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --bwa_index 'results/reference_genome/bwa_index/BWAIndex/' --fasta_index 'https://github.com/nf-core/test-datasets/blob/eager/reference/Mammoth/Mammoth_MT_Krause.fasta.fai'
77+
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --bwa_index 'results/reference_genome/bwa_index/BWAIndex/' --fasta_index 'https://github.com/nf-core/test-datasets/blob/eager/reference/Mammoth/Mammoth_MT_Krause.fasta.fai'
7878
- name: REFERENCE Run the basic pipeline with FastA reference with `fna` extension
7979
run: |
8080
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_fna,docker
@@ -107,7 +107,7 @@ jobs:
107107
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --clip_adapters_list 'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt'
108108
- name: ADAPTER LIST Run the basic pipeline using an adapter list, skipping adapter removal
109109
run: |
110-
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --clip_adapters_list 'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt' --skip_adapterremoval
110+
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --clip_adapters_list 'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt' --skip_adapterremoval
111111
- name: POST_AR_FASTQ_TRIMMING Run the basic pipeline post-adapterremoval FASTQ trimming
112112
run: |
113113
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_post_ar_trimming
@@ -141,6 +141,9 @@ jobs:
141141
- name: BEDTOOLS Test bedtools feature annotation
142142
run: |
143143
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_bedtools_coverage --anno_file 'https://github.com/nf-core/test-datasets/raw/eager/reference/Mammoth/Mammoth_MT_Krause.gff3'
144+
- name: MAPDAMAGE2 damage calculation
145+
run: |
146+
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --damage_calculation_tool 'mapdamage'
144147
- name: GENOTYPING_HC Test running GATK HaplotypeCaller
145148
run: |
146149
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_fna,docker --run_genotyping --genotyping_tool 'hc' --gatk_hc_out_mode 'EMIT_ALL_ACTIVE_SITES' --gatk_hc_emitrefconf 'BP_RESOLUTION'
@@ -193,11 +196,11 @@ jobs:
193196
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt/" --metagenomic_complexity_filter
194197
- name: MALTEXTRACT Download resource files
195198
run: |
196-
mkdir -p databases/maltextract
197-
for i in ncbi.tre ncbi.map; do wget https://github.com/rhuebler/HOPS/raw/0.33/Resources/"$i" -P databases/maltextract/; done
199+
mkdir -p databases/maltextract
200+
for i in ncbi.tre ncbi.map; do wget https://github.com/rhuebler/HOPS/raw/0.33/Resources/"$i" -P databases/maltextract/; done
198201
- name: MALTEXTRACT Basic with MALT plus MaltExtract
199202
run: |
200-
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt" --run_maltextract --maltextract_ncbifiles "/home/runner/work/eager/eager/databases/maltextract/" --maltextract_taxon_list 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/maltextract/MaltExtract_list.txt'
203+
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt" --run_maltextract --maltextract_ncbifiles "/home/runner/work/eager/eager/databases/maltextract/" --maltextract_taxon_list 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/maltextract/MaltExtract_list.txt'
201204
- name: METAGENOMIC Run the basic pipeline but with unmapped reads going into Kraken
202205
run: |
203206
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_kraken,docker --run_bam_filtering --bam_unmapped_type 'fastq'
@@ -216,4 +219,4 @@ jobs:
216219
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --skip_qualimap --skip_preseq --skip_damage_calculation --run_mtnucratio
217220
- name: RESCALING Run basic pipeline with basic pipeline but with mapDamage rescaling of BAM files. Note this will be slow
218221
run: |
219-
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled'
222+
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled'

‎CHANGELOG.md

+19
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,25 @@
33
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
44
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
55

6+
## [2.5.0] - Bopfingen - 2023-11-03
7+
8+
### `Added`
9+
10+
- [#1020](https://github.com/nf-core/eager/issues/1020) Added mapDamage2 as an alternative for damage calculation.
11+
12+
### `Fixed`
13+
14+
- [#1017](https://github.com/nf-core/eager/issues/1017) Fixed file name collision in niche cases with multiple libraries of multiple UDG treatments.
15+
- [#1024](https://github.com/nf-core/eager/issues/1024) `multiqc_general_stats.txt` is now generated even if the table is a beeswarm plot in the report.
16+
- [#655](https://github.com/nf-core/eager/issues/655) Updated RG tags for all mappers. RG-id now includes Sample as well as Library ID. Added `LB:` tag with the library ID.
17+
- [#1031](https://github.com/nf-core/eager/issues/1031) Always index fasta regardless of mapper. This ensures that DamageProfiler and genotyping processes get submitted when using bowtie2 and not providing a fasta index.
18+
19+
### `Dependencies`
20+
21+
- `multiqc`: 1.14 -> 1.16
22+
23+
### `Deprecated`
24+
625
## [2.4.7] - 2023-05-16
726

827
### `Added`

‎Dockerfile

+2-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ COPY environment.yml /
77
RUN conda env create --quiet -f /environment.yml && conda clean -a
88

99
# Add conda installation dir to PATH (instead of doing 'conda activate')
10-
ENV PATH /opt/conda/envs/nf-core-eager-2.4.7/bin:$PATH
10+
ENV PATH /opt/conda/envs/nf-core-eager-2.5.0/bin:$PATH
1111

1212
# Dump the details of the installed packages to a file for posterity
13-
RUN conda env export --name nf-core-eager-2.4.7 > nf-core-eager-2.4.7.yml
13+
RUN conda env export --name nf-core-eager-2.5.0 > nf-core-eager-2.5.0.yml

‎README.md

+9-6
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515

1616
[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23eager-4A154B?logo=slack)](https://nfcore.slack.com/channels/eager)
1717

18+
>[!IMPORTANT]
19+
> nf-core/eager versions 2.* are only compatible with Nextflow versions up to 22.10.6!
20+
1821
## Introduction
1922

2023
<!-- nf-core: Write a 1-2 sentence summary of what data the pipeline is for and what it does -->
@@ -28,7 +31,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool
2831

2932
## Quick Start
3033

31-
1. Install [`nextflow`](https://nf-co.re/usage/installation) (`>=20.07.1`)
34+
1. Install [`nextflow`](https://nf-co.re/usage/installation) (`>=20.07.1` && `<=22.10.6`)
3235

3336
2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_
3437

@@ -52,7 +55,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool
5255
nextflow clean -f -k
5356
```
5457

55-
See [usage docs](https://nf-co.re/eager/docs/usage.md) for all of the available options when running the pipeline.
58+
See [usage docs](https://nf-co.re/eager/usage) for all of the available options when running the pipeline.
5659

5760
**N.B.** You can see an overview of the run in the MultiQC report located at `./results/MultiQC/multiqc_report.html`
5861

@@ -69,7 +72,7 @@ By default the pipeline currently performs the following:
6972
* Sequencing adapter removal, paired-end data merging (`AdapterRemoval`)
7073
* Read mapping to reference using (`bwa aln`, `bwa mem`, `CircularMapper`, or `bowtie2`)
7174
* Post-mapping processing, statistics and conversion to bam (`samtools`)
72-
* Ancient DNA C-to-T damage pattern visualisation (`DamageProfiler`)
75+
* Ancient DNA C-to-T damage pattern visualisation (`DamageProfiler` or `mapDamage`)
7376
* PCR duplicate removal (`DeDup` or `MarkDuplicates`)
7477
* Post-mapping statistics and BAM quality control (`Qualimap`)
7578
* Library Complexity Estimation (`preseq`)
@@ -133,9 +136,9 @@ The nf-core/eager pipeline comes with documentation about the pipeline: [usage](
133136
* [Pipeline installation](https://nf-co.re/usage/local_installation)
134137
* [Adding your own system config](https://nf-co.re/usage/adding_own_config)
135138
* [Reference genomes](https://nf-co.re/usage/reference_genomes)
136-
3. [Running the pipeline](https://nf-co.re/eager/docs/usage.md)
139+
3. [Running the pipeline](https://nf-co.re/eager/usage)
137140
* This includes tutorials, FAQs, and troubleshooting instructions
138-
4. [Output and how to interpret the results](https://nf-co.re/eager/docs/output.md)
141+
4. [Output and how to interpret the results](https://nf-co.re/eager/output)
139142

140143
## Credits
141144

@@ -246,7 +249,7 @@ In addition, references of tools and data used in this pipeline are as follows:
246249
* **Bowtie2** Langmead, B. and Salzberg, S. L. 2012 Fast gapped-read alignment with Bowtie 2. Nature methods, 9(4), p. 357–359. doi: [10.1038/nmeth.1923](https:/dx.doi.org/10.1038/nmeth.1923).
247250
* **sequenceTools** Stephan Schiffels (Unpublished). Download: [https://github.com/stschiff/sequenceTools](https://github.com/stschiff/sequenceTools)
248251
* **EigenstratDatabaseTools** Thiseas C. Lamnidis (Unpublished). Download: [https://github.com/TCLamnidis/EigenStratDatabaseTools.git](https://github.com/TCLamnidis/EigenStratDatabaseTools.git)
249-
* **mapDamage2** Jónsson, H., et al 2013. mapDamage2.0: fast approximate Bayesian estimates of ancient DNA damage parameters. Bioinformatics , 29(13), 1682–1684. [https://doi.org/10.1093/bioinformatics/btt193](https://doi.org/10.1093/bioinformatics/btt193)
252+
* **mapDamage** Jónsson, H., et al 2013. mapDamage2.0: fast approximate Bayesian estimates of ancient DNA damage parameters. Bioinformatics , 29(13), 1682–1684. [https://doi.org/10.1093/bioinformatics/btt193](https://doi.org/10.1093/bioinformatics/btt193)
250253
* **BBduk** Brian Bushnell (Unpublished). Download: [https://sourceforge.net/projects/bbmap/](sourceforge.net/projects/bbmap/)
251254
252255
## Data References

‎assets/multiqc_config.yaml

+15-3
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ run_modules:
1717
- gatk
1818
- kraken
1919
- malt
20+
- mapdamage
2021
- mtnucratio
2122
- multivcfanalyzer
2223
- picard
@@ -91,6 +92,7 @@ top_modules:
9192
path_filters:
9293
- "*.preseq"
9394
- "damageprofiler"
95+
- "mapdamage"
9496
- "mtnucratio"
9597
- "qualimap"
9698
- "sexdeterrmine"
@@ -155,6 +157,11 @@ table_columns_visible:
155157
3 Prime2: False
156158
mean_readlength: True
157159
median: True
160+
mapDamage:
161+
5 Prime1: True
162+
5 Prime2: True
163+
3 Prime1: False
164+
3 Prime2: False
158165
mtnucratio:
159166
mt_nuc_ratio: True
160167
QualiMap:
@@ -240,10 +247,15 @@ table_columns_placement:
240247
3 Prime2: 730
241248
mean_readlength: 740
242249
median: 750
250+
mapDamage:
251+
5 Prime1: 760
252+
5 Prime2: 765
253+
3 Prime1: 770
254+
3 Prime2: 775
243255
mtnucratio:
244-
mtreads: 760
245-
mt_cov_avg: 770
246-
mt_nuc_ratio: 780
256+
mtreads: 780
257+
mt_cov_avg: 785
258+
mt_nuc_ratio: 790
247259
QualiMap:
248260
mapped_reads: 800
249261
mean_coverage: 805

‎docs/output.md

+10-7
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ The possible columns displayed by default are as follows (note you may see addit
8080
* **Endogenous DNA Post-Filter (%)** This is from the endorS.py tool. It displays a percentage of mapped reads _after_ BAM filtering (i.e. for mapping quality and/or bam-level length filtering) over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). This column will only be displayed if BAM filtering is turned on and is based on the original mapping for total reads, and mapped reads as calculated from the post-filtering BAM.
8181
* **ClusterFactor** This is from **DeDup only**. This is a value representing how many duplicates in the library exist for each unique read. This ratio is calculated as `reads_before_deduplication / reads_after_deduplication`. Can be converted to %Dups by calculating `1 - (1 / CF)`. A cluster factor close to one indicates a highly complex library and could be sequenced further. Generally with a value of more than 2 you will not be gaining much more information by sequencing deeper.
8282
* **% Dup. Mapped Reads** This is from **Picard's markDuplicates only**. It represents the percentage of reads in your library that were exact duplicates of other reads in your library. The lower the better, as high duplication rate means lots of sequencing of the same information (and therefore is not time or cost effective).
83-
* **X Prime Y>Z N base** These columns are from DamageProfiler. The prime numbers represent which end of the reads the damage is referring to. The Y>Z is the type of substitution (C>T is the true damage, G>A is the complementary). You should see for no- and half-UDG treatment a decrease in frequency from the 1st to 2nd base.
83+
* **X Prime Y>Z N base** These columns are from DamageProfiler or mapDamage. The prime numbers represent which end of the reads the damage is referring to. The Y>Z is the type of substitution (C>T is the true damage, G>A is the complementary). You should see for no- and half-UDG treatment a decrease in frequency from the 1st to 2nd base.
8484
* **Mean Length Mapped Reads** This is from DamageProfiler. This is the mean length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary.
8585
* **Median Length Mapped Reads** This is from DamageProfiler. This is the median length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary.
8686
* **Nr. Dedup. Mapped Reads** This is from Qualimap. This is the total number of _deduplicated_ reads that mapped to your reference genome. This is the **best** number to report for final mapped reads in final publications.
@@ -494,11 +494,11 @@ Plateauing can be caused by a number of reasons:
494494
* You have an over-amplified library with many PCR duplicates. You should consider rebuilding the library to maximise data to cost ratio
495495
* You have a low quality library made up of mappable sequencing artefacts that were able to pass filtering (e.g. adapters)
496496

497-
### DamageProfiler
497+
### Damage Calculation
498498

499499
#### Background
500500

501-
DamageProfiler is a tool which calculates a variety of standard 'aDNA' metrics from a BAM file. The primary plots here are the misincorporation and length distribution plots. Ancient DNA undergoes depurination and hydrolysis, causing fragmentation of molecules into gradually shorter fragments, and cytosine to thymine deamination damage, that occur on the subsequent single-stranded overhangs at the ends of molecules.
501+
DamageProfiler and mapDamage are tools that calculate a variety of standard 'aDNA' metrics from a BAM file. The primary plots here are the misincorporation and length distribution plots. Ancient DNA undergoes depurination and hydrolysis, causing fragmentation of molecules into gradually shorter fragments, and cytosine to thymine deamination damage, that occur on the subsequent single-stranded overhangs at the ends of molecules.
502502

503503
Therefore, three main characteristics of ancient DNA are:
504504

@@ -510,7 +510,7 @@ You will receive output for each deduplicated _library_. This means that if you
510510

511511
#### Misincorporation Plots
512512

513-
The MultiQC DamageProfiler module misincorporation plots shows the percent frequency (Y axis) of C to T mismatches at 5' read ends and complementary G to A mismatches at the 3' ends. The X axis represents base pairs from the end of the molecule from the given prime end, going into the middle of the molecule i.e. 1st base of molecule, 2nd base of molecule etc until the 14th base pair. The mismatches are when compared to the base of the reference genome at that position.
513+
The MultiQC DamageProfiler and mapDamage module misincorporation plots shows the percent frequency (Y axis) of C to T mismatches at 5' read ends and complementary G to A mismatches at the 3' ends. The X axis represents base pairs from the end of the molecule from the given prime end, going into the middle of the molecule i.e. 1st base of molecule, 2nd base of molecule etc until the 14th base pair. The mismatches are when compared to the base of the reference genome at that position.
514514

515515
When looking at the misincorporation plots, keep the following in mind:
516516

@@ -520,22 +520,24 @@ When looking at the misincorporation plots, keep the following in mind:
520520
* If your library is **single-stranded**, you will expect to see only C to T misincorporations at both 5' and 3' ends of the fragments.
521521
* We generally expect that the older the sample, or the less-ideal preservational environment (hot/wet) the greater the frequency of C to T/G to A.
522522
* The curve will be not smooth then you have few reads informing the frequency calculation. Read counts of less than 500 are likely not reliable.
523+
* If the `mapdamage_downsample` parameter was specified and mapDamage was used for damage calculation, the damage frequency for each base is based only on the specified number of reads.
523524

524525
<p align="center">
525526
<img src="images/output/damageprofiler/damageprofiler_deaminationpatterns.png" width="75%" height = "75%">
526527
</p>
527528

528-
> **NB:** An important difference to note compared to the MapDamage tool, which DamageProfiler is an exact-reimplementation of, is that the percent frequency on the Y axis is not fixed between 0 and 0.3, and will 'zoom' into small values the less damage there is
529+
> **NB:** An important difference to note compared to the mapDamage tool, which DamageProfiler is otherwise an exact-re-implementation of, is that the percent frequency on the Y axis is not fixed between 0 and 0.3, and will 'zoom' into small values the less damage there is
529530
530531
#### Length Distribution
531532

532-
The MultiQC DamageProfiler module length distribution plots show the frequency of read lengths across forward and reverse reads respectively.
533+
The MultiQC DamageProfiler and mapDamage module length distribution plots show the frequency of read lengths across forward and reverse reads respectively.
533534

534535
When looking at the length distribution plots, keep in mind the following:
535536

536537
* Your curves will likely not start at 0, and will start wherever your minimum read-length setting was when removing adapters.
537538
* You should typically see the bulk of the distribution falling between 40-120bp, which is normal for aDNA
538539
* You may see large peaks at paired-end turn-arounds, due to very-long reads that could not overlap for merging being present, however this reads are normally from modern contamination.
540+
* If the `mapdamage_downsample` parameter was specified and mapDamage was used for damage calculation, the length distribution is based only on the specified number of reads.
539541

540542
### QualiMap
541543

@@ -686,9 +688,10 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir
686688
* `preseq/`: this contains a `.preseq` file for every BAM file that had enough deduplication statistics to generate a complexity curve for estimating the amount unique reads that will be yield if the library is re-sequenced. You can use this file for plotting e.g. in `R` to find your sequencing target depth.
687689
* `qualimap/`: this contains a sub-directory for every sample, which includes a qualimap report and associated raw statistic files. You can open the `.html` file in your internet browser to see the in-depth report (this will be more detailed than in MultiQC). This includes stuff like percent coverage, depth coverage, GC content and so on of your mapped reads.
688690
* `damageprofiler/`: this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files.
691+
* `mapdamage/`: this contains sample specific directories containing raw statistics and damage plots from mapDamage. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files. The `Runtime_log.txt` file contains runtime information.
689692
* `pmdtools/`: this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a Post-mortem damage (PMD) score of `--pmdtools_threshold`.
690693
* `trimmed_bam/`: this contains the BAM files with X number of bases trimmed off as defined with the `--bamutils_clip_half_udg_left`, `--bamutils_clip_half_udg_right`, `--bamutils_clip_none_udg_left`, and `--bamutils_clip_none_udg_right` flags and corresponding index files. You can use these BAM files for downstream analysis such as re-mapping data with more stringent parameters (if you set trimming to remove the most likely places containing damage in the read).
691-
* `damage_rescaling/`: this contains rescaled BAM files from mapDamage2. These BAM files have damage probabilistically removed via a bayesian model, and can be used for downstream genotyping.
694+
* `damage_rescaling/`: this contains rescaled BAM files from mapDamage. These BAM files have damage probabilistically removed via a bayesian model, and can be used for downstream genotyping.
692695
* `genotyping/`: this contains all the (gzipped) genotyping files produced by your genotyping module. The file suffix will have the genotyping tool name. You will have files corresponding to each of your deduplicated BAM files (except pileupcaller), or any turned-on downstream processes that create BAMs (e.g. trimmed bams or pmd tools). If `--gatk_ug_keep_realign_bam` supplied, this may also contain BAM files from InDel realignment when using GATK 3 and UnifiedGenotyping for variant calling. When pileupcaller is used to create eigenstrat genotypes, this directory also contains eigenstrat SNP coverage statistics.
693696
* `multivcfanalyzer/`: this contains all output from MultiVCFAnalyzer, including SNP calling statistics, various SNP table(s) and FASTA alignment files.
694697
* `sex_determination/`: this contains the output for the sex determination run. This is a single `.tsv` file that includes a table with the sample name, the number of autosomal SNPs, number of SNPs on the X/Y chromosome, the number of reads mapping to the autosomes, the number of reads mapping to the X/Y chromosome, the relative coverage on the X/Y chromosomes, and the standard error associated with the relative coverages. These measures are provided for each bam file, one row per file. If the `sexdeterrmine_bedfile` option has not been provided, the error bars cannot be trusted, and runtime will be considerably longer.

‎environment.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# You can use this file to create a conda environment for this pipeline:
22
# conda env create -f environment.yml
3-
name: nf-core-eager-2.4.7
3+
name: nf-core-eager-2.5.0
44
channels:
55
- conda-forge
66
- bioconda
@@ -26,7 +26,7 @@ dependencies:
2626
- bioconda::qualimap=2.2.2d
2727
- bioconda::vcf2genome=0.91
2828
- bioconda::damageprofiler=0.4.9 # Don't upgrade - later versions don't allow java 8
29-
- bioconda::multiqc=1.14
29+
- bioconda::multiqc=1.16
3030
- bioconda::pmdtools=0.60
3131
- bioconda::bedtools=2.30.0
3232
- conda-forge::libiconv=1.16

‎main.nf

+46-17
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ if("${params.fasta}".endsWith(".gz")){
186186
path zipped_fasta from file(params.fasta) // path doesn't like it if a string of an object is not prefaced with a root dir (/), so use file() to resolve string before parsing to `path`
187187

188188
output:
189-
path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_unmasked_fasta_for_masking,ch_unmasked_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd,ch_fasta_for_damagerescaling,ch_fasta_for_bcftools_stats
189+
path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler, ch_fasta_for_mapdamage ,ch_fasta_for_qualimap,ch_unmasked_fasta_for_masking,ch_unmasked_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd,ch_fasta_for_damagerescaling,ch_fasta_for_bcftools_stats
190190

191191
script:
192192
unzip = zipped_fasta.toString() - '.gz'
@@ -197,7 +197,7 @@ if("${params.fasta}".endsWith(".gz")){
197197
} else {
198198
fasta_for_indexing = Channel
199199
.fromPath("${params.fasta}", checkIfExists: true)
200-
.into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_unmasked_fasta_for_masking; ch_unmasked_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer; ch_fasta_for_genotyping_angsd; ch_fasta_for_damagerescaling; ch_fasta_for_bcftools_stats }
200+
.into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_mapdamage; ch_fasta_for_qualimap; ch_unmasked_fasta_for_masking; ch_unmasked_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer; ch_fasta_for_genotyping_angsd; ch_fasta_for_damagerescaling; ch_fasta_for_bcftools_stats }
201201
}
202202

203203
// Check that fasta index file path ends in '.fai'
@@ -561,7 +561,7 @@ process makeFastaIndex {
561561
else null
562562
}
563563

564-
when: !params.fasta_index && params.fasta && ( params.mapper == 'bwaaln' || params.mapper == 'bwamem' || params.mapper == 'circularmapper')
564+
when: !params.fasta_index && params.fasta
565565

566566
input:
567567
path fasta from ch_fasta_for_faidx
@@ -1276,14 +1276,14 @@ process bwa {
12761276
"""
12771277
bwa aln -t ${task.cpus} $fasta ${r1} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -o ${params.bwaalno} -f ${libraryid}.r1.sai
12781278
bwa aln -t ${task.cpus} $fasta ${r2} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -o ${params.bwaalno} -f ${libraryid}.r2.sai
1279-
bwa sampe -r "@RG\\tID:ILLUMINA-${libraryid}\\tSM:${samplename}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" $fasta ${libraryid}.r1.sai ${libraryid}.r2.sai ${r1} ${r2} | samtools sort -@ ${task.cpus - 1} -O bam - > ${libraryid}_"${seqtype}".mapped.bam
1279+
bwa sampe -r "@RG\\tID:ILLUMINA-${samplename}_${libraryid}\\tSM:${samplename}\\tLB:${libraryid}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" $fasta ${libraryid}.r1.sai ${libraryid}.r2.sai ${r1} ${r2} | samtools sort -@ ${task.cpus - 1} -O bam - > ${libraryid}_"${seqtype}".mapped.bam
12801280
samtools index "${libraryid}"_"${seqtype}".mapped.bam ${size}
12811281
"""
12821282
} else {
12831283
//PE collapsed, or SE data
12841284
"""
12851285
bwa aln -t ${task.cpus} ${fasta} ${r1} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -o ${params.bwaalno} -f ${libraryid}.sai
1286-
bwa samse -r "@RG\\tID:ILLUMINA-${libraryid}\\tSM:${samplename}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" $fasta ${libraryid}.sai $r1 | samtools sort -@ ${task.cpus - 1} -O bam - > "${libraryid}"_"${seqtype}".mapped.bam
1286+
bwa samse -r "@RG\\tID:ILLUMINA-${samplename}_${libraryid}\\tSM:${samplename}\\tLB:${libraryid}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" $fasta ${libraryid}.sai $r1 | samtools sort -@ ${task.cpus - 1} -O bam - > "${libraryid}"_"${seqtype}".mapped.bam
12871287
samtools index "${libraryid}"_"${seqtype}".mapped.bam ${size}
12881288
"""
12891289
}
@@ -1314,12 +1314,12 @@ process bwamem {
13141314

13151315
if (!params.single_end && params.skip_collapse){
13161316
"""
1317-
bwa mem -t ${split_cpus} $fasta $r1 $r2 -R "@RG\\tID:ILLUMINA-${libraryid}\\tSM:${samplename}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" | samtools sort -@ ${split_cpus} -O bam - > "${libraryid}"_"${seqtype}".mapped.bam
1317+
bwa mem -t ${split_cpus} $fasta $r1 $r2 -R "@RG\\tID:ILLUMINA-${samplename}_${libraryid}\\tSM:${samplename}\\tLB:${libraryid}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" | samtools sort -@ ${split_cpus} -O bam - > "${libraryid}"_"${seqtype}".mapped.bam
13181318
samtools index ${size} -@ ${task.cpus} "${libraryid}"_"${seqtype}".mapped.bam
13191319
"""
13201320
} else {
13211321
"""
1322-
bwa mem -t ${split_cpus} $fasta $r1 -R "@RG\\tID:ILLUMINA-${libraryid}\\tSM:${samplename}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" | samtools sort -@ ${split_cpus} -O bam - > "${libraryid}"_"${seqtype}".mapped.bam
1322+
bwa mem -t ${split_cpus} $fasta $r1 -R "@RG\\tID:ILLUMINA-${samplename}_${libraryid}\\tSM:${samplename}\\tLB:${libraryid}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" | samtools sort -@ ${split_cpus} -O bam - > "${libraryid}"_"${seqtype}".mapped.bam
13231323
samtools index -@ ${task.cpus} "${libraryid}"_"${seqtype}".mapped.bam ${size}
13241324
"""
13251325
}
@@ -1382,15 +1382,15 @@ process circularmapper{
13821382
"""
13831383
bwa aln -t ${task.cpus} $elongated_root $r1 -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${libraryid}.r1.sai
13841384
bwa aln -t ${task.cpus} $elongated_root $r2 -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${libraryid}.r2.sai
1385-
bwa sampe -r "@RG\\tID:ILLUMINA-${libraryid}\\tSM:${samplename}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" $elongated_root ${libraryid}.r1.sai ${libraryid}.r2.sai $r1 $r2 > tmp.out
1385+
bwa sampe -r "@RG\\tID:ILLUMINA-${samplename}_${libraryid}\\tSM:${samplename}\\tLB:${libraryid}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" $elongated_root ${libraryid}.r1.sai ${libraryid}.r2.sai $r1 $r2 > tmp.out
13861386
realignsamfile -Xmx${task.memory.toGiga()}g -e ${params.circularextension} -i tmp.out -r $fasta $filter
13871387
samtools sort -@ ${task.cpus} -O bam tmp_realigned.bam > ${libraryid}_"${seqtype}".mapped.bam
13881388
samtools index "${libraryid}"_"${seqtype}".mapped.bam ${size}
13891389
"""
13901390
} else {
13911391
"""
13921392
bwa aln -t ${task.cpus} $elongated_root $r1 -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${libraryid}.sai
1393-
bwa samse -r "@RG\\tID:ILLUMINA-${libraryid}\\tSM:${samplename}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" $elongated_root ${libraryid}.sai $r1 > tmp.out
1393+
bwa samse -r "@RG\\tID:ILLUMINA-${samplename}_${libraryid}\\tSM:${samplename}\\tLB:${libraryid}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" $elongated_root ${libraryid}.sai $r1 > tmp.out
13941394
realignsamfile -Xmx${task.memory.toGiga()}g -e ${params.circularextension} -i tmp.out -r $fasta $filter
13951395
samtools sort -@ ${task.cpus} -O bam tmp_realigned.bam > "${libraryid}"_"${seqtype}".mapped.bam
13961396
samtools index "${libraryid}"_"${seqtype}".mapped.bam ${size}
@@ -1460,13 +1460,13 @@ process bowtie2 {
14601460
//PE data without merging, PE data without any AR applied
14611461
if ( seqtype == 'PE' && ( params.skip_collapse || params.skip_adapterremoval ) ){
14621462
"""
1463-
bowtie2 -x ${fasta} -1 ${r1} -2 ${r2} -p ${split_cpus} ${sensitivity} ${bt2n} ${bt2l} ${trim5} ${trim3} --maxins ${params.bt2_maxins} --rg-id ILLUMINA-${libraryid} --rg SM:${samplename} --rg PL:illumina --rg PU:ILLUMINA-${libraryid}-${seqtype} 2> "${libraryid}"_bt2.log | samtools sort -@ ${split_cpus} -O bam > "${libraryid}"_"${seqtype}".mapped.bam
1463+
bowtie2 -x ${fasta} -1 ${r1} -2 ${r2} -p ${split_cpus} ${sensitivity} ${bt2n} ${bt2l} ${trim5} ${trim3} --maxins ${params.bt2_maxins} --rg-id ILLUMINA-${samplename}_${libraryid} --rg SM:${samplename} --rg LB:${libraryid} --rg PL:illumina --rg PU:ILLUMINA-${libraryid}-${seqtype} 2> "${libraryid}"_bt2.log | samtools sort -@ ${split_cpus} -O bam > "${libraryid}"_"${seqtype}".mapped.bam
14641464
samtools index "${libraryid}"_"${seqtype}".mapped.bam ${size}
14651465
"""
14661466
} else {
14671467
//PE collapsed, or SE data
14681468
"""
1469-
bowtie2 -x ${fasta} -U ${r1} -p ${split_cpus} ${sensitivity} ${bt2n} ${bt2l} ${trim5} ${trim3} --rg-id ILLUMINA-${libraryid} --rg SM:${samplename} --rg PL:illumina --rg PU:ILLUMINA-${libraryid}-${seqtype} 2> "${libraryid}"_bt2.log | samtools sort -@ ${split_cpus} -O bam > "${libraryid}"_"${seqtype}".mapped.bam
1469+
bowtie2 -x ${fasta} -U ${r1} -p ${split_cpus} ${sensitivity} ${bt2n} ${bt2l} ${trim5} ${trim3} --rg-id ILLUMINA-${samplename}_${libraryid} --rg SM:${samplename} --rg LB:${libraryid} --rg PL:illumina --rg PU:ILLUMINA-${libraryid}-${seqtype} 2> "${libraryid}"_bt2.log | samtools sort -@ ${split_cpus} -O bam > "${libraryid}"_"${seqtype}".mapped.bam
14701470
samtools index "${libraryid}"_"${seqtype}".mapped.bam ${size}
14711471
"""
14721472
}
@@ -1903,10 +1903,10 @@ process markduplicates{
19031903
// form of library merging.
19041904
if ( params.skip_deduplication ) {
19051905
ch_skiprmdup_for_libeval.mix(ch_dedup_for_libeval, ch_markdup_for_libeval)
1906-
.into{ ch_rmdup_for_preseq; ch_rmdup_for_damageprofiler; ch_for_nuclear_contamination; ch_rmdup_formtnucratio }
1906+
.into{ ch_rmdup_for_preseq; ch_rmdup_for_damageprofiler; ch_rmdup_for_mapdamage; ch_for_nuclear_contamination; ch_rmdup_formtnucratio }
19071907
} else {
19081908
ch_dedup_for_libeval.mix(ch_markdup_for_libeval)
1909-
.into{ ch_rmdup_for_preseq; ch_rmdup_for_damageprofiler; ch_for_nuclear_contamination; ch_rmdup_formtnucratio }
1909+
.into{ ch_rmdup_for_preseq; ch_rmdup_for_damageprofiler; ch_rmdup_for_mapdamage; ch_for_nuclear_contamination; ch_rmdup_formtnucratio }
19101910
}
19111911

19121912
// Merge independent libraries sequenced but with same treatment (often done to
@@ -2077,7 +2077,7 @@ process bedtools {
20772077
/* -- ANCIENT DNA EVALUATION AND BAM MODIFICATION -- */
20782078
//////////////////////////////////////////////////////////////
20792079

2080-
// Calculate typical aDNA damage frequency distribution
2080+
// Calculate typical aDNA damage frequency distribution with DamageProfiler
20812081

20822082
process damageprofiler {
20832083
label 'sc_small'
@@ -2086,7 +2086,7 @@ process damageprofiler {
20862086
publishDir "${params.outdir}/damageprofiler", mode: params.publish_dir_mode
20872087

20882088
when:
2089-
!params.skip_damage_calculation
2089+
!params.skip_damage_calculation && params.damage_calculation_tool == 'damageprofiler'
20902090

20912091
input:
20922092
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_rmdup_for_damageprofiler
@@ -2106,6 +2106,34 @@ process damageprofiler {
21062106
"""
21072107
}
21082108

2109+
// Calculate typical aDNA damage frequency distribution with mapDamage
2110+
2111+
process mapdamage_calculation {
2112+
label 'sc_small'
2113+
tag "${libraryid}"
2114+
2115+
publishDir "${params.outdir}/mapdamage", mode: params.publish_dir_mode
2116+
2117+
when:
2118+
!params.skip_damage_calculation && params.damage_calculation_tool == 'mapdamage'
2119+
2120+
input:
2121+
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_rmdup_for_mapdamage
2122+
file fasta from ch_fasta_for_mapdamage.collect()
2123+
2124+
output:
2125+
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("results_${base}") into ch_output_from_mapdamage
2126+
path ("results_${base}") into ch_mapdamage_for_multiqc
2127+
2128+
script:
2129+
base = "${bam.baseName}"
2130+
def singlestranded = strandedness == "single" ? '--single-stranded' : ''
2131+
def downsample = params.mapdamage_downsample != 0 ? "-n ${params.mapdamage_downsample} --downsample-seed=1" : '' // Include seed to make results consistent between runs
2132+
"""
2133+
mapDamage -i ${bam} -r ${fasta} ${singlestranded} ${downsample} --ymax=${params.mapdamage_yaxis} --no-stats
2134+
"""
2135+
}
2136+
21092137
// Damage rescaling with mapDamage
21102138

21112139
process mapdamage_rescaling {
@@ -2245,8 +2273,8 @@ process bam_trim {
22452273
// def right_clipping = udg == "half" ? "${params.bamutils_clip_half_udg_right}" : "${params.bamutils_clip_none_udg_right}"
22462274
"""
22472275
bam trimBam $bam tmp.bam -L ${left_clipping} -R ${right_clipping} ${softclip}
2248-
samtools sort -@ ${task.cpus} tmp.bam -o ${libraryid}.trimmed.bam
2249-
samtools index ${libraryid}.trimmed.bam ${size}
2276+
samtools sort -@ ${task.cpus} tmp.bam -o ${libraryid}_udg${udg}.trimmed.bam
2277+
samtools index ${libraryid}_udg${udg}.trimmed.bam ${size}
22502278
"""
22512279
}
22522280

@@ -3230,6 +3258,7 @@ process multiqc {
32303258
file ('flagstat_filtered/*') from ch_bam_filtered_flagstat_for_multiqc.collect().ifEmpty([])
32313259
file ('preseq/*') from ch_preseq_for_multiqc.collect().ifEmpty([])
32323260
file ('damageprofiler/dmgprof*/*') from ch_damageprofiler_results.collect().ifEmpty([])
3261+
file ('mapdamage/*') from ch_mapdamage_for_multiqc.collect().ifEmpty([])
32333262
file ('qualimap/qualimap*/*') from ch_qualimap_results.collect().ifEmpty([])
32343263
file ('markdup/*') from ch_markdup_results_for_multiqc.collect().ifEmpty([])
32353264
file ('dedup*/*') from ch_dedup_results_for_multiqc.collect().ifEmpty([])

‎nextflow.config

+6-3
Original file line numberDiff line numberDiff line change
@@ -120,10 +120,13 @@ params {
120120
preseq_cval = 0.95
121121
preseq_terms = 100
122122

123-
//DamageProfiler settings
123+
//Damage estimation settings
124+
damage_calculation_tool = 'damageprofiler'
124125
damageprofiler_length = 100
125126
damageprofiler_threshold = 15
126127
damageprofiler_yaxis = 0.30
128+
mapdamage_downsample = 0
129+
mapdamage_yaxis = 0.30
127130

128131
//PMDTools settings
129132
run_pmdtools = false
@@ -285,7 +288,7 @@ params {
285288

286289
// Container slug. Stable releases should specify release tag!
287290
// Developmental code should specify :dev
288-
process.container = 'nfcore/eager:2.4.7'
291+
process.container = 'nfcore/eager:2.5.0'
289292

290293
// Load base.config by default for all pipelines
291294
includeConfig 'conf/base.config'
@@ -415,7 +418,7 @@ manifest {
415418
description = 'A fully reproducible and state-of-the-art ancient DNA analysis pipeline'
416419
mainScript = 'main.nf'
417420
nextflowVersion = '>=20.07.1'
418-
version = '2.4.7'
421+
version = '2.5.0'
419422
}
420423

421424
// Function to ensure that resource requirements don't go beyond

‎nextflow_schema.json

+1,784-1,665
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)
Please sign in to comment.