diff --git a/snakepit/deepvariant.smk b/snakepit/deepvariant.smk index ca85265..aea342e 100644 --- a/snakepit/deepvariant.smk +++ b/snakepit/deepvariant.smk @@ -2,7 +2,8 @@ from pathlib import Path,PurePath wildcard_constraints: region = r'\w+', - run = r'\w+' + run = r'\w+', + preset = r'|'.join(config['GL_config']) config.setdefault('Run_name', 'DV_variant_calling') @@ -10,9 +11,14 @@ if 'binding_paths' in config: for path in config['binding_paths']: workflow._singularity_args += f' -B {path}' -regions = list(map(str,range(1,30))) + ['X','Y','MT','unplaced'] +#TODO: generalise regions from input +#lean more on config and force users to be explicit, then less room for errors +regions = config.get('regionsz',list(map(str,range(1,30))) + ['X','Y','MT','unplaced']) -ruleorder: deepvariant_postprocess > bcftools_scatter +if config.get('scatter_merge',True): + ruleorder: bcftools_scatter > deepvariant_postprocess +else: + ruleorder: deepvariant_postprocess > bcftools_scatter rule all: input: @@ -22,16 +28,20 @@ rule all: cohort_samples = config['samples'] if 'glob' not in config['samples'] else glob_wildcards(config["bam_path"] + config["samples"]["glob"]).sample #NOTE: may need to be updated if deepvariant changes its internal parameters. -def make_custom_example_arguments(model): +def make_custom_example_arguments(model,small=True): #TODO: allow switching on/off small model + common_long_read_options = '--alt_aligned_pileup "diff_channels" --parse_sam_aux_fields --partition_size "25000" --phase_reads --norealign_reads --sort_by_haplotypes --track_ref_reads --trim_reads_for_pileup --vsc_min_fraction_indels "0.12" ' + small_model = '--small_model_snp_gq_threshold 25 --small_model_indel_gq_threshold 30 --small_model_vaf_context_window_size 51 ' #TODO: this does not work for ONTr10 match model: - case 'RNA' | 'WES': - return '--channels \'\' --split_skip_reads' - case 'PACBIO': - return '--add_hp_channel --alt_aligned_pileup "diff_channels" --max_reads_per_partition "600" --min_mapping_quality "1" --parse_sam_aux_fields --partition_size "25000" --phase_reads --pileup_image_width "199" --norealign_reads --sort_by_haplotypes --track_ref_reads --vsc_min_fraction_indels "0.12"' case 'WGS': - return '--channels "insert_size"' - case _: - return '--channels \'\' --split_skip_reads' + return small_model + '--channel_list BASE_CHANNELS,insert_size' + case 'PACBIO': + return common_long_read_options + small_model '--max_reads_per_partition "600" --min_mapping_quality "1" --pileup_image_width "147"' + case 'MASSEQ': + return common_long_read_options + '--max_reads_per_partition 0 --min_mapping_quality 1 --pileup_image_width "199" --max_reads_for_dynamic_bases_per_region "1500"' + case 'ONT_R104': + return common_long_read_options + '--max_reads_per_partition "600" --min_mapping_quality "5" --pileup_image_width "99" --vsc_min_fraction_snps "0.08"' + case 'RNA' | 'WES' | _: + return '--channel_list BASE_CHANNELS --split_skip_reads' def get_regions(region): if region == 'all': @@ -47,7 +57,6 @@ def get_regions(region): BAM_EXT = config.get('bam_index','.bai') -#error can be caused by corrupted file, check gzip -t -v rule deepvariant_make_examples: input: reference = multiext(config['reference'],'','.fai'), @@ -84,12 +93,14 @@ rule deepvariant_make_examples: --task {wildcards.N} ''' -def get_checkpoint(model): +#TODO: need to update models? +def get_checkpoint(model,small=True): + path = f'/opt/{"small" if small else ""}models/' match model: - case 'PACBIO' | 'WGS': - return f'/opt/models/{model.lower()}' + case 'PACBIO' | 'MASSEQ' | 'WGS' | 'ONT_R104' : + return path + model.lower() case 'hybrid': - return '/opt/models/hybrid_pacbio_illumina' + return path + 'hybrid_pacbio_illumina' case _: return config['model'] @@ -127,7 +138,8 @@ rule deepvariant_postprocess: gvcf = multiext('{run}/deepvariant/{sample}.{region}.g.vcf.gz','','.tbi') params: variants = lambda wildcards,input: PurePath(input.variants).with_name('call_variants_output@1.tfrecord.gz'), - gvcf = lambda wildcards,input: PurePath(input.gvcf[0]).with_suffix('').with_suffix(f'.tfrecord@{config["shards"]}.gz') + gvcf = lambda wildcards,input: PurePath(input.gvcf[0]).with_suffix('').with_suffix(f'.tfrecord@{config["shards"]}.gz'), + handle_sex_chromosomes = f'--haploid_contigs "X,Y" --par_regions_bed "{config["haploid_sex"]}"' if 'haploid_sex' in config else '' threads: config.get('resources',{}).get('postprocess',{}).get('threads',2) resources: mem_mb = config.get('resources',{}).get('postprocess',{}).get('mem_mb',20000), @@ -144,16 +156,20 @@ rule deepvariant_postprocess: --gvcf_outfile {output.gvcf[0]} \ --nonvariant_site_tfrecord_path {params.gvcf} \ --novcf_stats_report \ + {params.handle_sex_chromosomes} \ --cpus {threads} ''' rule bcftools_scatter: input: gvcf = expand(rules.deepvariant_postprocess.output['gvcf'],region='all',allow_missing=True), - regions = '/cluster/work/pausch/vcf_UCD/2023_07/regions.bed' + regions = '/cluster/work/pausch/vcf_UCD/2023_07/regions.bed', # TODO: need to handle this for different references + fai = config['reference'] + ".fai" output: expand('{run}/deepvariant/{sample}.{region}.g.vcf.gz',region=regions,allow_missing=True), - expand('{run}/deepvariant/{sample}.{region}.g.vcf.gz.csi',region=regions,allow_missing=True) + expand('{run}/deepvariant/{sample}.{region}.g.vcf.gz.tbi',region=regions,allow_missing=True) + wildcard_constraints: + region = "(?!all)" params: regions = regions, _dir = lambda wildcards, output: PurePath(output[0]).with_suffix('').with_suffix('').with_suffix('').with_suffix('') @@ -163,14 +179,15 @@ rule bcftools_scatter: walltime = '1h' shell: ''' - bcftools +scatter {input.gvcf[0]} -o {params._dir} -Oz --threads {threads} --write-index -S {input.regions} -x unplaced --no-version + bcftools +scatter {input.gvcf[0]} -o {params._dir} -Oz --threads {threads} -S {input.regions} -x unplaced --no-version for R in {params.regions} do - mv {params._dir}/$R.vcf.gz {params._dir}.$R.g.vcf.gz - mv {params._dir}/$R.vcf.gz.csi {params._dir}.$R.g.vcf.gz.csi + bcftools reheader -f {input.fai} {params._dir}/$R.vcf.gz > {params._dir}.$R.g.vcf.gz + tabix -p vcf {params._dir}.$R.g.vcf.gz done ''' +# see https://github.com/dnanexus-rnd/GLnexus/pull/310 def get_GL_config(preset): match preset: case 'DeepVariantWGS' | 'DeepVariant_unfiltered' | 'DeepVariantWES_MED_DP': @@ -180,8 +197,6 @@ def get_GL_config(preset): rule GLnexus_merge: input: - #gvcfs = expand('{run}/deepvariant/{region}/{sample}.g.vcf.gz',sample=cohort_samples,allow_missing=True), - #tbi = expand('{run}/deepvariant/{region}/{sample}.g.vcf.gz.csi',sample=cohort_samples,allow_missing=True) gvcfs = expand('{run}/deepvariant/{sample}.{region}.g.vcf.gz',sample=cohort_samples,allow_missing=True), tbi = expand('{run}/deepvariant/{sample}.{region}.g.vcf.gz.tbi',sample=cohort_samples,allow_missing=True) output: @@ -195,17 +210,13 @@ rule GLnexus_merge: walltime = config.get('resources',{}).get('merge',{}).get('walltime','4h'), scratch = '50G' priority: 25 - container: config.get('containers',{}).get('GLnexus','docker://ghcr.io/dnanexus-rnd/glnexus:v1.4.3') shell: ''' - /usr/local/bin/glnexus_cli \ + glnexus_cli \ --dir $TMPDIR/GLnexus.DB \ --config {params.preset} \ --threads {threads} \ --mem-gbytes {params.mem} \ {input.gvcfs} |\ - bcftools view - |\ - bgzip -@ 8 -c > {output[0]} - - tabix -p vcf {output[0]} + bcftools view -@ {threads} -W=tbi {output[0]} '''