dram mags added

fischuu · fischuu · commit 9057c082a628 · 2025-05-09T13:48:07.000+03:00
diff --git a/workflow/rules/annotate/__main__.smk b/workflow/rules/annotate/__main__.smk
@@ -2,6 +2,7 @@ include: "bakta.smk"
 include: "quast.smk"
 include: "gtdbtk.smk"
 include: "dram.smk"
+include: "dram_mag.smk"
 include: "eggnog.smk"
 include: "checkm2.smk"
 include: "proteinortho.smk"
@@ -17,5 +18,6 @@ rule annotate:
         rules.annotate__checkm2__predict.output,
         rules.annotate__gtdbtk__classify.output,
         rules.annotate__dram.input,
+        rules.annotate__dram_mags.input,
         rules.annotate__proteinortho.output,
         rules.annotate__phylophlan.output,
diff --git a/workflow/rules/annotate/dram.smk b/workflow/rules/annotate/dram.smk
@@ -1,4 +1,4 @@
-rule _annotate__dram__annotate:
+rule annotate__dram__annotate:
     """Annotate dereplicate genomes with DRAM"""
     input:
         dereplicated_genomes=DREP / "dereplicated_genomes.fa.gz",
@@ -43,56 +43,8 @@ rule _annotate__dram__annotate:
                 --gtdb_taxonomy {input.gtdbtk_summary} \
         2>> {log} 1>&2
     """
-  
-#rule _annotate__dram__stack:
-#    """Stack DRAM annotations"""
-#    input:
-#        annotation=DRAM / "annotations.tsv",
-#        trnas=DRAM / "trnas.tsv",
-#        rrnas=DRAM / "rrnas.tsv",
-#    output:
-#        tarball=DRAM / "annotate.tar.gz",
-#    log:
-#        DRAM / "annotate_stack.log",
-#    conda:
-#        "__environment__.yml"
-#    container:
-#        docker["dram"]
-#    params:
-#        config=config["dram-config"],
-#        out_dir=DRAM,
-#        tmp_dir=DRAM / "annotate",
-#    threads: config["resources"]["cpu_per_task"]["multi_thread"]
-#    resources:
-#        cpu_per_task=config["resources"]["cpu_per_task"]["multi_thread"],
-#        mem_per_cpu=config["resources"]["mem_per_cpu"]["quitehighmem"] // config["resources"]["cpu_per_task"]["multi_thread"],
-#        time =  config["resources"]["time"]["shortrun"],
-#        partition = config["resources"]["partition"]["longrun"]
-#    shell:
-#        """
-#
-#        for file in annotations trnas rrnas ; do
-#            ( csvstack \
-#                --tabs \
-#                {params.tmp_dir}/*/$file.tsv \
-#            | csvformat \
-#                --out-tabs \
-#            > {params.out_dir}/$file.tsv \
-#            ) 2>> {log}
-#        done
-#
-#        tar \
-#            --create \
-#            --directory {params.out_dir} \
-#            --file {output.tarball} \
-#            --remove-files \
-#            --use-compress-program="pigz --processes {threads}" \
-#            --verbose \
-#            annotate \
-#        2>> {log} 1>&2
-#        """
 
-rule _annotate__dram__distill:
+rule annotate__dram__distill:
     """Distill DRAM annotations."""
     input:
         annotations=DRAM / "annotate" / "annotations.tsv",
@@ -131,8 +83,7 @@ rule _annotate__dram__distill:
         rmdir {params.outdir_tmp} 2>> {log} 1>&2
         """
 
-
 rule annotate__dram:
     """Run DRAM on dereplicated genomes."""
     input:
-        rules._annotate__dram__distill.output,
+        rules.annotate__dram__distill.output,
diff --git a/workflow/rules/annotate/dram_mag.smk b/workflow/rules/annotate/dram_mag.smk
@@ -0,0 +1,89 @@
+# For now, the GTDB-Tk annotation is left out here, as the taxonomic assignment is already obtained from the global run
+
+rule annotate__dram_mag__annotate:
+    """Annotate the contigs of a single assembly with DRAM.
+
+    Runs `DRAM.py annotate` on MAGSCOT / "{assembly_id}.fa.gz" and renames the
+    annotation, tRNA and rRNA tables so that they carry the assembly id in
+    their file names (the names declared under `output`).
+    """
+    input:
+        contigs=MAGSCOT / "{assembly_id}.fa.gz",
+        #gtdbtk_summary=GTDBTK / "gtdbtk.summary.tsv",
+        # Not referenced in the shell command; listed so the rule depends on the database.
+        dram_db=features["databases"]["dram"],
+    output:
+        annotation=DRAMMAG / "{assembly_id}" / "annotate"  / "annotations_{assembly_id}.tsv",
+        trnas=DRAMMAG / "{assembly_id}" / "annotate" / "trnas_{assembly_id}.tsv",
+        rrnas=DRAMMAG / "{assembly_id}" / "annotate" / "rrnas_{assembly_id}.tsv",
+    log:
+        # Kept next to the other per-assembly results (DRAMMAG, not the global DRAM folder).
+        DRAMMAG / "{assembly_id}" / "annotate_{assembly_id}.log",
+    conda:
+        "__environment__.yml"
+    container:
+        docker["dram"]
+    params:
+        config=config["dram-config"],
+        # NOTE(review): declared but not passed to DRAM.py below — confirm whether
+        # `--min_contig_size` should be set from this value.
+        min_contig_size=1500,
+        out_dir=lambda wildcards: f"{DRAMMAG}/{wildcards.assembly_id}",
+        tmp_dir=lambda wildcards: f"{DRAMMAG}/{wildcards.assembly_id}/annotate",
+    threads: config["resources"]["cpu_per_task"]["multi_thread"]
+    resources:
+        cpu_per_task=config["resources"]["cpu_per_task"]["multi_thread"],
+        mem_per_cpu=config["resources"]["mem_per_cpu"]["quitehighmem"] // config["resources"]["cpu_per_task"]["multi_thread"],
+        time =  config["resources"]["time"]["longrun"],
+        nvme = config["resources"]["nvme"]["small"],
+        partition = config["resources"]["partition"]["small"]
+    # The output directory is removed first, presumably because DRAM.py refuses to
+    # write into an existing directory (TODO confirm). After the run, the tables
+    # DRAM writes as annotations.tsv / trnas.tsv / rrnas.tsv are moved to the
+    # per-assembly file names declared under `output`.
+    shell:
+        """
+        rm -rf {params.tmp_dir}
+
+        echo "Hostname: $(hostname)" 2>> {log} 1>&2
+        echo "Temporary directory: $TMPDIR" 2>> {log} 1>&2
+        df -h 2>> {log} 1>&2
+
+        DRAM.py annotate \
+                --config_loc {params.config} \
+                --input_fasta {input.contigs} \
+                --output_dir {params.tmp_dir} \
+                --threads {threads} \
+        2>> {log} 1>&2
+
+        mv {params.tmp_dir}/annotations.tsv {output.annotation} 2>> {log} 1>&2
+        mv {params.tmp_dir}/trnas.tsv {output.trnas} 2>> {log} 1>&2
+        mv {params.tmp_dir}/rrnas.tsv {output.rrnas} 2>> {log} 1>&2
+        """
+
+rule annotate__dram_mag__distill:
+    """Distill the DRAM annotations of a single assembly.
+
+    Passes the annotation, rRNA and tRNA tables of `{assembly_id}` to
+    `DRAM.py distill`, which writes into a temporary `distill` sub-directory;
+    the results are then moved up into the assembly's DRAMMAG folder.
+    """
+    input:
+        annotation=DRAMMAG / "{assembly_id}" / "annotate"  / "annotations_{assembly_id}.tsv",
+        trnas=DRAMMAG / "{assembly_id}" / "annotate" / "trnas_{assembly_id}.tsv",
+        rrnas=DRAMMAG / "{assembly_id}" / "annotate" / "rrnas_{assembly_id}.tsv",
+        # Not referenced in the shell command; listed so the rule depends on the database.
+        dram_db=features["databases"]["dram"],
+    output:
+        genome=DRAMMAG / "{assembly_id}" / "genome_stats.tsv",
+        metabolism=DRAMMAG / "{assembly_id}" / "metabolism_summary.xlsx",
+        product_html=DRAMMAG / "{assembly_id}" / "product.html",
+        product_tsv=DRAMMAG / "{assembly_id}" / "product.tsv",
+    log:
+        DRAMMAG / "{assembly_id}" / "distill.log2",
+    conda:
+        "__environment__.yml"
+    container:
+        docker["dram"]
+    resources:
+        mem_per_cpu=config["resources"]["mem_per_cpu"]["highmem"],
+        time =  config["resources"]["time"]["longrun"],
+    params:
+        config=config["dram-config"],
+        outdir=lambda wildcards: f"{DRAMMAG}/{wildcards.assembly_id}",
+        outdir_tmp=lambda wildcards: f"{DRAMMAG}/{wildcards.assembly_id}/distill",
+    # The first redirect truncates the log (`2>`); the follow-up commands append.
+    # The named input is `annotation` (singular), so that is what the command uses.
+    shell:
+        """
+        DRAM.py distill \
+            --config_loc {params.config} \
+            --input_file {input.annotation} \
+            --rrna_path {input.rrnas} \
+            --trna_path {input.trnas} \
+            --output_dir {params.outdir_tmp} \
+        2> {log} 1>&2
+
+        mv {params.outdir_tmp}/* {params.outdir}/ 2>> {log} 1>&2
+        rmdir {params.outdir_tmp} 2>> {log} 1>&2
+        """
+
+rule annotate__dram_mags:
+    """Run DRAM annotate over the contigs of every assembly."""
+    # NOTE(review): only the per-assembly annotation tables are requested here, so
+    # annotate__dram_mag__distill is not triggered by this target — confirm that
+    # this is intended.
+    input:
+        expand(DRAMMAG / "{assembly_id}" / "annotate"  / "annotations_{assembly_id}.tsv", assembly_id=ASSEMBLIES),
diff --git a/workflow/rules/folders.smk b/workflow/rules/folders.smk
@@ -55,6 +55,7 @@ GTDBTK = ANN / "gtdbtk/"
 QUAST = ANN / "quast/"
 CAMPER = ANN / "camper/"
 DRAM = ANN / "dram/"
+DRAMMAG = ANN / "dram_mags/"
 CHECKM = ANN / "checkm2"
 BAKTA = ANN / "bakta"
 BAKTAMAG = ANN / "bakta_mags"