
Commit c358fe3

Create a file that maps taxon IDs to the corresponding file
For a possible update of the sediment_nf pipeline, species are assigned directly by Kraken. To avoid name conflicts (e.g. ssp. vs. subsp.), species are identified by their NCBI taxid. There is therefore now a file in genomes/ called taxid_map.tsv, which lists all taxids together with the corresponding species name as used in the file system.
1 parent 33b82ba commit c358fe3
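For orientation only (not part of this commit): a minimal Python sketch of how a downstream pipeline such as sediment_nf might resolve a Kraken-assigned taxid to the reference on disk. It assumes the three tab-separated columns (taxid, family, species) written by bin/convert_acc_to_taxid.py, the genomes/<family>/<species>.fasta layout produced by writeFastas in main.nf, and that it is run from the pipeline's output directory; the taxid '9606' is only a placeholder.

# Illustrative sketch: resolve a Kraken taxid to the reference FASTA on disk.
taxid_map = {}
with open('genomes/taxid_map.tsv') as tsv:            # published by createFileMap
    for row in tsv:
        taxid, family, species = row.rstrip('\n').split('\t')
        taxid_map[taxid] = (family, species)

family, species = taxid_map['9606']                   # placeholder taxid
fasta_path = f"genomes/{family}/{species}.fasta"      # layout written by writeFastas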

File tree

5 files changed (+69, -29 lines)

bin/convert_acc_to_taxid.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+import sys
+
+acc_tax_dict = {}
+for line in [x for x in open(sys.argv[2])]:
+    _,k,v,_ = line.split('\t', 3)
+    acc_tax_dict[k] = v
+
+handle = open('taxid_map.tsv', 'w')
+
+for line in [x for x in open(sys.argv[1])]:
+    acc, fam, sp = line.replace('\n','').split('\t', 2)
+    print(acc_tax_dict[acc], fam, sp, sep='\t', file=handle)
+
+handle.close()
+
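As an aside (not part of the commit), the parsing above relies on the four tab-separated columns of NCBI's nucl_gb.accession2taxid (accession, accession.version, taxid, gi): split('\t', 3) keeps the accession.version as key and the taxid as value. A small worked example with placeholder values:

line = "NC_012920\tNC_012920.1\t9606\t251831106\n"   # placeholder accession2taxid row
_, k, v, _ = line.split('\t', 3)
assert k == "NC_012920.1"   # matches seq_gb.id written to accmap.tsv by extract_families.py
assert v == "9606"          # the NCBI taxid that ends up in taxid_map.tsv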

bin/dustmasker_interval_to_bed.py

File mode changed: 100644 → 100755

bin/extract_families.py

Lines changed: 5 additions & 1 deletion
@@ -4,12 +4,16 @@
 import gzip
 import sys

+acc_map_handle = open('accmap.tsv', 'w')
 for arg in sys.argv[1:]:
     with gzip.open(arg, 'rt') as gb:
         for seq_gb in SeqIO.parse(gb, 'genbank'):
             if 'Mammalia' in seq_gb.annotations['taxonomy']:
                 family = [name for name in seq_gb.annotations['taxonomy'] if name.endswith('idae')][-1]
                 organism = seq_gb.annotations['organism'].replace(' ', '_')
-                filename = f"{family}_{organism}.fasta"
+                acc = seq_gb.id
+                print(acc,family,organism, sep='\t', file=acc_map_handle)
+                filename = f"{family}_{acc}_{organism}.fasta"
                 with open(filename,'w') as fasta_out:
                     SeqIO.write(seq_gb, fasta_out, 'fasta')
+acc_map_handle.close()
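Side note (illustrative, not part of the commit): RefSeq accessions such as NC_012920.1 contain an underscore themselves, so the accession spans two underscore-separated tokens in the new {family}_{acc}_{organism}.fasta name. The updated channel mapping in main.nf further down therefore joins tokens 1..2 for the accession and 3..-1 for the species; a Python equivalent of that split, using a hypothetical base name:

base = "Hominidae_NC_012920.1_Homo_sapiens"   # hypothetical it.baseName
parts = base.split("_")
family    = parts[0]               # 'Hominidae'
accession = "_".join(parts[1:3])   # 'NC_012920.1'
species   = "_".join(parts[3:])    # 'Homo_sapiens'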

envs/environment.yml

Lines changed: 22 additions & 10 deletions
@@ -1,13 +1,19 @@
-name: datastructureBreakpoints
+name: datastructure
 channels:
   - bioconda
   - defaults
 dependencies:
   - _libgcc_mutex=0.1=main
-  - biopython=1.77=py38h7b6447c_0
+  - beautifulsoup4=4.9.3=pyhb0f4dca_0
+  - biopython=1.78=py38h7b6447c_0
   - blas=1.0=mkl
-  - ca-certificates=2020.7.22=0
-  - certifi=2020.6.20=py38_0
+  - brotlipy=0.7.0=py38h7b6447c_1000
+  - ca-certificates=2020.10.14=0
+  - certifi=2020.6.20=py38h06a4308_2
+  - cffi=1.14.3=py38he30daa8_0
+  - chardet=3.0.4=py38_1003
+  - cryptography=3.1.1=py38h1ba5d50_0
+  - idna=2.10=py_0
   - intel-openmp=2020.2=254
   - jellyfish=1.1.12=h6bb024c_1
   - ld_impl_linux-64=2.33.1=h53a641e_7
@@ -17,19 +23,25 @@ dependencies:
   - libstdcxx-ng=9.1.0=hdf63c60_0
   - mkl=2020.2=256
   - mkl-service=2.3.0=py38he904b0f_0
-  - mkl_fft=1.1.0=py38h23d657b_0
+  - mkl_fft=1.2.0=py38h23d657b_0
   - mkl_random=1.1.1=py38h0573a6f_0
   - ncurses=6.2=he6710b0_1
-  - numpy=1.19.1=py38hbc911f0_0
-  - numpy-base=1.19.1=py38hfa32c7d_0
-  - openssl=1.1.1g=h7b6447c_0
-  - pip=20.2.2=py38_0
+  - numpy=1.19.2=py38h54aff64_0
+  - numpy-base=1.19.2=py38hfa32c7d_0
+  - openssl=1.1.1h=h7b6447c_0
+  - pip=20.2.4=py38_0
+  - pycparser=2.20=py_2
+  - pyopenssl=19.1.0=py_1
+  - pysocks=1.7.1=py38_0
   - python=3.8.5=h7579374_1
   - readline=8.0=h7b6447c_0
-  - setuptools=49.6.0=py38_0
+  - requests=2.24.0=py_0
+  - setuptools=50.3.0=py38hb0f4dca_1
   - six=1.15.0=py_0
+  - soupsieve=2.0.1=py_0
   - sqlite=3.33.0=h62c20be_0
   - tk=8.6.10=hbc83047_0
+  - urllib3=1.25.11=py_0
   - wheel=0.35.1=py_0
   - xz=5.2.5=h7b6447c_0
   - zlib=1.2.11=h7b6447c_3

main.nf

Lines changed: 27 additions & 18 deletions
@@ -36,7 +36,6 @@ if(params.outdir == false){


 process downloadGenomes{
-    cache false
     publishDir "${params.outdir}/ncbi", mode: 'link'
     tag "Downloading..."

@@ -46,12 +45,10 @@ process downloadGenomes{
     script:
     """
     rsync -av rsync://ftp.ncbi.nlm.nih.gov/refseq/release/mitochondrion/*.genomic.gbff.gz .
-
     """
 }

 process extractFamilies{
-    cache false
     conda "$baseDir/envs/environment.yml"
     tag "Extracting..."

@@ -60,6 +57,7 @@ process extractFamilies{

     output:
     file "*.fasta" into extracted_fasta mode flatten
+    file "*.tsv" into convert_acc

     script:
     """
@@ -68,38 +66,36 @@ process extractFamilies{
 }

 extracted_fasta
-    .map{[it.baseName.split("_")[0], it.baseName.split("_")[1..-1].join("_"), file(it)]}
+    .map{[it.baseName.split("_")[0],it.baseName.split('_')[1..2].join("_"), it.baseName.split("_")[3..-1].join("_"), file(it)]}
     .set{extracted_fasta}


 process writeFastas{
-    cache false
+    conda "$baseDir/envs/environment.yml"
     publishDir "${params.outdir}/genomes/${family}/", saveAs: {"${species}.fasta"}, pattern: "*.fasta", mode:'link'
-
-    tag "$family:$species"
+    tag "Writing $family:$species"

     input:
-    set family, species, "input.fasta" from extracted_fasta
+    set family, accession, species, "input.fasta" from extracted_fasta

     output:
     set family, species, "output.fasta" into (for_bed, for_bwa, for_kraken)
-
+
     script:
     """
     cat input.fasta > output.fasta
     """
 }

 process indexFasta{
-    cache false
     publishDir "${params.outdir}/genomes/${family}/", mode: 'link'
     tag "$family:$species"

     input:
     set family, species, "${species}.fasta" from for_bwa

     output:
-    file "${species}.fasta.*"
+    file "${species}.fasta.*"

     script:
     """
@@ -108,7 +104,6 @@ process indexFasta{
 }

 process writeBedFiles{
-    cache false
     publishDir "${params.outdir}/masked/", saveAs: {"${species}.masked.bed"}, mode:'link'
     tag "$family:$species"

@@ -132,16 +127,16 @@ for_kraken


 process createKrakenDB{
-    cache false
     conda "$baseDir/envs/environment.yml"
-    tag "Wait ~ 30min."
+    tag "Wait! This takes > 30min."
+    publishDir("stats")

     input:
     each kmer from params.kmers
     file fasta_list from for_kraken

     output:
-    file "output.txt" into log
+    file "nucl_gb.accession2taxid" into taxid_map

     script:
     dbname = "Mito_db_kmer${kmer}"
@@ -156,13 +151,27 @@ process createKrakenDB{
     ${params.kraken}/kraken-build --add-to-library \${file%?} --db ${dbname};\
     done
     ${params.kraken}/kraken-build --build --db ${dbname} --kmer $kmer
-    ${params.kraken}/kraken-build --clean --db ${dbname}
+    mv $dbname/taxonomy/nucl_gb.accession2taxid .
     if [[ -d \$out/kraken ]];\
     then rm -fr \$out/kraken;\
     fi;
     mkdir \$out/kraken
-    cp -r ${dbname} \$out/kraken
-    touch "output.txt"
+    mv ${dbname} \$out/kraken/
+    """
+}
+process createFileMap{
+    publishDir "${params.outdir}/genomes", mode:'link'
+
+    input:
+    file "acc_map.tsv" from convert_acc
+    file "nucl_gb.accession2taxid" from taxid_map
+
+    output:
+    file "*.tsv"
+
+    script:
+    """
+    python3 $baseDir/bin/convert_acc_to_taxid.py acc_map.tsv nucl_gb.accession2taxid
     """
 }

