Skip to content

Commit 7ac6cb0

Browse files
authored
Merge pull request #1030 from scarlhoff/dsl2_pmd_maskedfasta
DSL2: add reference masking for PMDtools
2 parents d421158 + a62d332 commit 7ac6cb0

15 files changed

+326
-68
lines changed

conf/modules.config

+16
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,12 @@ process {
231231
]
232232
}
233233

234+
withName: 'GUNZIP_PMDFASTA|GUNZIP_PMDSNP|GUNZIP_SNPBED' {
235+
publishDir = [
236+
enabled: false
237+
]
238+
}
239+
234240
withName: SAMTOOLS_FAIDX {
235241
publishDir = [
236242
path: { "${params.outdir}/reference/${meta.id}/" },
@@ -669,6 +675,16 @@ process {
669675
//
670676
// DAMAGE MANIPULATION
671677
//
678+
679+
withName: BEDTOOLS_MASKFASTA {
680+
ext.prefix = { "${meta.id}.masked" }
681+
publishDir = [
682+
path: { "${params.outdir}/reference/masked_reference/" },
683+
mode: params.publish_dir_mode,
684+
pattern: '*.masked.fa'
685+
]
686+
}
687+
672688
withName: MAPDAMAGE2 {
673689
tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" }
674690
ext.args = { [

docs/development/dev_docs.md

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Documentation and how tos for developing eager
2+
3+
## How to add new input files and options to the reference sheet
4+
5+
To add new input files or options to the reference sheet, you have to complete all the following steps.
6+
7+
### Single-reference input workflow
8+
9+
1. Add your new parameter to nextflow.config.
10+
2. Add parameter description to schema (nf-core schema build).
11+
3. Read in new parameter (params.<NEW>) as input within the reference_indexing_single local subworkflow.
12+
1. Add new line to the large `.map{}` operation starting on [line 80(https://github.com/nf-core/eager/blob/d4211582f349cc30c88202c12942218f99006041/subworkflows/local/reference_indexing_single.nf#L80)] and add check if the file exists.
13+
`def <PARAM_NAME> = params.<NEW> != null ? file(params.<NEW>, checkIfExists: true ) : ""`
14+
2. Add <PARAM_NAME> to the result of the map operation. Double-check the order!
15+
3. With the `ch_ref_index_single.multiMap{}` below you add the reference name as a meta. You can also combine your new parameter with others if useful for the workflow step.
16+
`<NEW_SUBCHANNE>: [ meta, <PARAM_NAME> ]`
17+
4. Add your ch_ref_index_single.<NEW_SUBCHANNEL> to the final emit.
18+
`<NEW_EMIT> = ch_ref_index_single.<NEW_SUBCHANNEL>`
19+
20+
### Multi-reference input workflow
21+
22+
1. Add new column named <SOFTWARE_FILETYPE> and test data to the test reference sheet (https://github.com/nf-core/test-datasets/blob/eager/reference/reference_sheet_multiref.csv).
23+
2. Read in new input within the reference_indexing_multi local subworkflow.
24+
1. Add new line to the large `.map{}` operation starting on [line 30](https://github.com/nf-core/eager/blob/d4211582f349cc30c88202c12942218f99006041/subworkflows/local/reference_indexing_multi.nf#L30). Add check if the file exists if appropriate.
25+
`def <PARAM_NAME> = row["<SOFTWARE_FILETYPE>"] != "" ? file(row["<SOFTWARE_FILETYPE>"], checkIfExists: true) : ""`
26+
2. Add <PARAM_NAME> to the result of the `.map{}` operation. Double-check the order!
27+
3. With the `ch_input_from_referencesheet.multiMap{}` below you add the reference name as a meta. You can also combine your new parameter with others if useful for the workflow step.
28+
`<NEW_SUBCHANNEL>: [ meta, <PARAM_NAME> ]`
29+
4. Add ch_input_from_referencesheet.<NEW_SUBCHANNEL> to the final emit.
30+
`<NEW_EMIT> = ch_input_from_referencesheet.<NEW_SUBCHANNEL>`
31+
32+
### Combining in the Reference Indexing workflow
33+
34+
1. Add you new parameter channel to the `if` condition selecting between the direct parameter input or the reference sheet input.
35+
1. below "REFERENCE_INDEXING_MULTI" for reference sheet input
36+
`<NEW_CHANNEL> = REFERENCE_INDEXING_MULTI.out.<NEW_EMIT>`
37+
2. below "REFERENCE_INDEXING_SINGLE"
38+
`<NEW_CHANNEL> = REFERENCE_INDEXING_SINGLE.out.<NEW_EMIT>`
39+
3. Filter out options that have not been provided.
40+
`<NEW_CHANNEL> = <NEW_CHANNEL>.filter{ it[1] != "" }`
41+
4. Add unzipping of zipped input files with GUNZIP.
42+
5. Add <NEW_CHANNEL> to the final emit.
43+
`<NEW_EMIT> = <NEW_CHANNEL>`
44+
6. Call new inputs within the main eager.nf with `REFERENCE_INDEXING.out.<NEW_EMIT>`.

docs/development/manual_tests.md

+14
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ Tool Specific combinations
9292

9393
- with default parameters
9494
- with stricter threshold
95+
- with fasta masking
96+
- with fasta masking for 1 of 2 references
9597

9698
- BAM trimming
9799

@@ -615,6 +617,18 @@ nextflow run . -profile test,docker --run_pmd_filtering -resume --outdir ./resul
615617
# JK2802_JK2802_AGAATAACCTACCA_pmdfiltered.bam: 30
616618
```
617619
620+
```bash
621+
## PMD filtering with fasta masking
622+
## Expect: damage_manipulation directory with *.masked.fa and bam and bai and flagstat per library
623+
nextflow run . -profile test_humanbam,docker --run_pmd_filtering --damage_manipulation_pmdtools_reference_mask https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Human/1240K.pos.list_hs37d5.0based.bed.gz -resume --outdir ./results
624+
```
625+
626+
```bash
627+
## PMD filtering with fasta masking for 1 of 2 references
628+
## Expect: damage_manipulation directory with hs37d5_chr21-MT.masked.fa and bam and bai and flagstat per library and reference (22 files total). hs37d5_chr21-MT first masked with 1240K.pos.list_hs37d5.0based.bed.gz from reference sheet, PMD filtering run with masked reference fasta for hs37d5 and non-masked reference fasta for Mammoth_MT
629+
nextflow run . -profile test_multiref,docker --run_pmd_filtering --outdir ./results
630+
```
631+
618632
## BAM trimming
619633
620634
```bash

docs/output.md

+2
Original file line numberDiff line numberDiff line change
@@ -464,11 +464,13 @@ Be advised that this process introduces reference bias in the resulting rescaled
464464

465465
- `*_pmdfiltered.bam`: Reads aligned to a reference genome that pass the post-mortem-damage threshold, in BAM format.
466466
- `*_pmdfiltered.bam.{bai,csi}`: Index file corresponding to the BAM file.
467+
- `*_pmdfiltered.flagstat`: Statistics of aligned reads after from SAMtools `flagstat`, after filtering for post-mortem damage.
467468

468469
</details>
469470

470471
[pmdtools](https://github.com/pontussk/PMDtools) implements a likelihood framework incorporating a postmortem damage (PMD) score, base quality scores and biological polymorphism to identify degraded DNA sequences that are unlikely to originate from modern contamination. Using the model, each sequence is assigned a PMD score, for which positive values indicate support for the sequence being genuinely ancient.
471472
By filtering a BAM file for damaged reads in this way, it is possible to ameliorate the effects of present-day contamination, and isolate sequences of likely ancient origin to be used downstream.
473+
By default, all positions are assessed for damage, but it is possible to provide a FASTA file masked for specific references (`--damage_manipulation_pmdtools_masked_reference`) or a BED file to mask the reference FASTA within nf-core/eager (`--damage_manipulation_pmdtools_reference_mask`). This can alleviate reference bias when running PMD filtering on capture data, where you might not want the allele of a SNP to be counted as damage when it is a transition.
472474

473475
### Base Trimming
474476

modules.json

+5
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@
3535
"git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
3636
"installed_by": ["modules"]
3737
},
38+
"bedtools/maskfasta": {
39+
"branch": "master",
40+
"git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc",
41+
"installed_by": ["modules"]
42+
},
3843
"bowtie2/align": {
3944
"branch": "master",
4045
"git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220",

modules/nf-core/bedtools/maskfasta/environment.yml

+6
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

modules/nf-core/bedtools/maskfasta/main.nf

+36
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

modules/nf-core/bedtools/maskfasta/meta.yml

+46
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

nextflow.config

+1
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,7 @@ params {
185185
damage_manipulation_rescale_length_3p = 0
186186
run_pmd_filtering = false
187187
damage_manipulation_pmdtools_threshold = 3
188+
damage_manipulation_pmdtools_masked_reference = null
188189
damage_manipulation_pmdtools_reference_mask = null
189190
run_trim_bam = false
190191
damage_manipulation_bamutils_trim_double_stranded_none_udg_left = 0

nextflow_schema.json

+14-4
Original file line numberDiff line numberDiff line change
@@ -780,12 +780,20 @@
780780
"description": "Specify PMDScore threshold for PMDtools.",
781781
"help_text": "Specifies the PMDScore threshold to use in the pipeline when filtering BAM files for DNA damage. Only reads which surpass this damage score are considered for downstream DNA analysis.\n\n> Modifies PMDtools parameter: `--threshold`"
782782
},
783+
"damage_manipulation_pmdtools_masked_reference": {
784+
"type": "string",
785+
"fa_icon": "fas fa-mask",
786+
"help_text": "Supplying a FASTA file will use this file as reference for `samtools calmd` prior to PMD filtering. /nSetting the SNPs that are part of the used capture set as `N` can alleviate reference bias when running PMD filtering on capture data, where you might not want the allele of a SNP to be counted as damage when it is a transition.",
787+
"description": "Specify a masked FASTA file with positions to be used with pmdtools.",
788+
"pattern": "^\\S+\\.fa?(\\sta)$",
789+
"format": "file-path"
790+
},
783791
"damage_manipulation_pmdtools_reference_mask": {
784792
"type": "string",
785793
"fa_icon": "fas fa-mask",
786794
"help_text": "Supplying a bedfile to this parameter activates masking of the reference fasta at the contained sites prior to running PMDtools. Positions that are in the provided bedfile will be replaced by Ns in the reference genome. \nThis can alleviate reference bias when running PMD filtering on capture data, where you might not want the allele of a transition SNP to be counted as damage. Masking of the reference is done using `bedtools maskfasta`.",
787795
"description": "Specify a bedfile to be used to mask the reference fasta prior to running pmdtools.",
788-
"pattern": "^\\S+\\.bed$",
796+
"pattern": "^\\S+\\.bed?(\\.gz)$",
789797
"format": "file-path"
790798
},
791799
"run_trim_bam": {
@@ -960,8 +968,7 @@
960968
"fa_icon": "fab fa-creative-commons-sampling-plus"
961969
},
962970
"skip_qualimap": {
963-
"type": "boolean",
964-
"default": "false"
971+
"type": "boolean"
965972
},
966973
"snpcapture_bed": {
967974
"type": "string",
@@ -1155,14 +1162,17 @@
11551162
{
11561163
"$ref": "#/definitions/adna_damage_analysis"
11571164
},
1165+
{
1166+
"$ref": "#/definitions/feature_annotation_statistics"
1167+
},
11581168
{
11591169
"$ref": "#/definitions/host_removal"
11601170
},
11611171
{
11621172
"$ref": "#/definitions/contamination_estimation"
11631173
},
11641174
{
1165-
"$ref": "#/definitions/feature_annotation_statistics"
1175+
"$ref": "#/definitions/contamination_estimation"
11661176
}
11671177
]
11681178
}

subworkflows/local/manipulate_damage.nf

+51-14
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// Calculate PMD scores, trim, or rescale DNA damage from mapped reads.
33
//
44

5+
include { BEDTOOLS_MASKFASTA } from '../../modules/nf-core/bedtools/maskfasta/main'
56
include { MAPDAMAGE2 } from '../../modules/nf-core/mapdamage2/main'
67
include { PMDTOOLS_FILTER } from '../../modules/nf-core/pmdtools/filter/main'
78
include { BAMUTIL_TRIMBAM } from '../../modules/nf-core/bamutil/trimbam/main'
@@ -13,8 +14,9 @@ include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTAT_DAMAGE_FILTERED } from '../../
1314
// TODO: Add required channels and channel manipulations for reference-dependent bed masking before pmdtools. Requires multi-ref support before implementation.
1415
workflow MANIPULATE_DAMAGE {
1516
take:
16-
ch_bam_bai // [ [ meta ], bam , bai ]
17-
ch_fasta // [ [ meta ], fasta ]
17+
ch_bam_bai // [ [ meta ], bam , bai ]
18+
ch_fasta // [ [ meta ], fasta ]
19+
ch_pmd_masking // [ [ meta ], masked_fasta, bed_for_masking ]
1820

1921
main:
2022
ch_versions = Channel.empty()
@@ -35,7 +37,7 @@ workflow MANIPULATE_DAMAGE {
3537
// Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute
3638
WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false )
3739
}
38-
.combine(ch_refs, by: 0 ) // [ [combine_meta], [meta], bam, bai, [ref_meta], fasta ]
40+
.combine( ch_refs, by: 0 ) // [ [combine_meta], [meta], bam, bai, [ref_meta], fasta ]
3941

4042
if ( params.run_mapdamage_rescaling ) {
4143
ch_mapdamage_prep = ch_input_for_damage_manipulation
@@ -71,17 +73,52 @@ workflow MANIPULATE_DAMAGE {
7173
}
7274

7375
if ( params.run_pmd_filtering ) {
74-
// TODO Add module to produce Masked reference from given references and bed file (with meta specifying the reference it matches)?
75-
// if ( params.pmdtools_reference_mask) {
76-
// MASK_REFERENCE_BY_BED()
77-
// }
78-
79-
ch_pmdtools_input = ch_input_for_damage_manipulation
80-
.multiMap {
81-
ignore_me, meta, bam, bai, ref_meta, fasta ->
82-
bam: [ meta, bam, bai ]
83-
fasta: fasta
84-
}
76+
ch_masking_prep = ch_pmd_masking
77+
.combine( ch_fasta, by: 0 ) // [ [meta], masked_fasta, bed, fasta ]
78+
.branch {
79+
meta, masked_fasta, bed, fasta ->
80+
alreadymasked: masked_fasta != ""
81+
tobemasked: masked_fasta == "" && bed != ""
82+
nomasking: masked_fasta == "" && bed == ""
83+
}
84+
85+
ch_masking_input = ch_masking_prep.tobemasked
86+
.multiMap{
87+
meta, masked_fasta, bed, fasta ->
88+
bed: [ meta, bed ]
89+
fasta: fasta
90+
}
91+
92+
BEDTOOLS_MASKFASTA( ch_masking_input.bed, ch_masking_input.fasta )
93+
ch_masking_output = BEDTOOLS_MASKFASTA.out.fasta
94+
ch_versions = ch_versions.mix( BEDTOOLS_MASKFASTA.out.versions.first() )
95+
96+
ch_already_masked = ch_masking_prep.alreadymasked
97+
.map {
98+
meta, masked_fasta, bed, fasta ->
99+
[ meta, masked_fasta ]
100+
}
101+
102+
ch_no_masking = ch_masking_prep.nomasking
103+
.map {
104+
meta, masked_fasta, bed, fasta ->
105+
[ meta, fasta ]
106+
}
107+
108+
ch_pmd_fastas = ch_masking_output.mix( ch_already_masked, ch_no_masking )
109+
.map {
110+
// Prepend a new meta that contains the meta.id value as the new_meta.reference attribute
111+
WorkflowEager.addNewMetaFromAttributes( it, "id" , "reference" , false )
112+
}
113+
114+
ch_pmdtools_input = ch_bam_bai
115+
.map { WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) }
116+
.combine( ch_pmd_fastas, by: 0 ) // [ [combine_meta], [meta], bam, bai, [ref_meta] fasta ]
117+
.multiMap {
118+
combine_meta, meta, bam, bai, ref_meta, fasta ->
119+
bam: [ meta, bam, bai ]
120+
fasta: fasta
121+
}
85122

86123
PMDTOOLS_FILTER( ch_pmdtools_input.bam, params.damage_manipulation_pmdtools_threshold, ch_pmdtools_input.fasta )
87124
ch_versions = ch_versions.mix( PMDTOOLS_FILTER.out.versions.first() )

0 commit comments

Comments
 (0)