
Commit c358fe3

Create a file that maps taxon IDs to the corresponding file
For a possible update of the sediment_nf pipeline, species are assigned directly by Kraken. To avoid name conflicts (e.g. ssp. vs. subsp.), species are identified by their NCBI taxid. There is therefore now a file in genomes/ called taxid_map.tsv, which lists all taxids together with the corresponding species name as used in the file system.
1 parent 33b82ba commit c358fe3
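For orientation only (not part of this commit): a minimal Python sketch of how a downstream pipeline such as sediment_nf might resolve a Kraken-assigned taxid to the reference on disk. It assumes the three tab-separated columns (taxid, family, species) written by bin/convert_acc_to_taxid.py, the genomes/<family>/<species>.fasta layout produced by writeFastas in main.nf, and that it is run from the pipeline's output directory; the taxid '9606' is only a placeholder.

# Illustrative sketch: resolve a Kraken taxid to the reference FASTA on disk.
taxid_map = {}
with open('genomes/taxid_map.tsv') as tsv:            # published by createFileMap
    for row in tsv:
        taxid, family, species = row.rstrip('\n').split('\t')
        taxid_map[taxid] = (family, species)

family, species = taxid_map['9606']                   # placeholder taxid
fasta_path = f"genomes/{family}/{species}.fasta"      # layout written by writeFastas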

File tree

5 files changed (+69, -29 lines)

bin/convert_acc_to_taxid.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+import sys
+
+acc_tax_dict = {}
+for line in [x for x in open(sys.argv[2])]:
+    _,k,v,_ = line.split('\t', 3)
+    acc_tax_dict[k] = v
+
+handle = open('taxid_map.tsv', 'w')
+
+for line in [x for x in open(sys.argv[1])]:
+    acc, fam, sp = line.replace('\n','').split('\t', 2)
+    print(acc_tax_dict[acc], fam, sp, sep='\t', file=handle)
+
+handle.close()
+
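As an aside (not part of the commit), the parsing above relies on the four tab-separated columns of NCBI's nucl_gb.accession2taxid (accession, accession.version, taxid, gi): split('\t', 3) keeps the accession.version as key and the taxid as value. A small worked example with placeholder values:

line = "NC_012920\tNC_012920.1\t9606\t251831106\n"   # placeholder accession2taxid row
_, k, v, _ = line.split('\t', 3)
assert k == "NC_012920.1"   # matches seq_gb.id written to accmap.tsv by extract_families.py
assert v == "9606"          # the NCBI taxid that ends up in taxid_map.tsv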

bin/dustmasker_interval_to_bed.py

File mode changed: 100644 → 100755

bin/extract_families.py

Lines changed: 5 additions & 1 deletion
@@ -4,12 +4,16 @@
 import gzip
 import sys

+acc_map_handle = open('accmap.tsv', 'w')
 for arg in sys.argv[1:]:
     with gzip.open(arg, 'rt') as gb:
         for seq_gb in SeqIO.parse(gb, 'genbank'):
             if 'Mammalia' in seq_gb.annotations['taxonomy']:
                 family = [name for name in seq_gb.annotations['taxonomy'] if name.endswith('idae')][-1]
                 organism = seq_gb.annotations['organism'].replace(' ', '_')
-                filename = f"{family}_{organism}.fasta"
+                acc = seq_gb.id
+                print(acc,family,organism, sep='\t', file=acc_map_handle)
+                filename = f"{family}_{acc}_{organism}.fasta"
                 with open(filename,'w') as fasta_out:
                     SeqIO.write(seq_gb, fasta_out, 'fasta')
+acc_map_handle.close()
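Side note (illustrative, not part of the commit): RefSeq accessions such as NC_012920.1 contain an underscore themselves, so the accession spans two underscore-separated tokens in the new {family}_{acc}_{organism}.fasta name. The updated channel mapping in main.nf further down therefore joins tokens 1..2 for the accession and 3..-1 for the species; a Python equivalent of that split, using a hypothetical base name:

base = "Hominidae_NC_012920.1_Homo_sapiens"   # hypothetical it.baseName
parts = base.split("_")
family    = parts[0]               # 'Hominidae'
accession = "_".join(parts[1:3])   # 'NC_012920.1'
species   = "_".join(parts[3:])    # 'Homo_sapiens'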

envs/environment.yml

Lines changed: 22 additions & 10 deletions
@@ -1,13 +1,19 @@
-name: datastructureBreakpoints
+name: datastructure
 channels:
   - bioconda
   - defaults
 dependencies:
   - _libgcc_mutex=0.1=main
-  - biopython=1.77=py38h7b6447c_0
+  - beautifulsoup4=4.9.3=pyhb0f4dca_0
+  - biopython=1.78=py38h7b6447c_0
   - blas=1.0=mkl
-  - ca-certificates=2020.7.22=0
-  - certifi=2020.6.20=py38_0
+  - brotlipy=0.7.0=py38h7b6447c_1000
+  - ca-certificates=2020.10.14=0
+  - certifi=2020.6.20=py38h06a4308_2
+  - cffi=1.14.3=py38he30daa8_0
+  - chardet=3.0.4=py38_1003
+  - cryptography=3.1.1=py38h1ba5d50_0
+  - idna=2.10=py_0
   - intel-openmp=2020.2=254
   - jellyfish=1.1.12=h6bb024c_1
   - ld_impl_linux-64=2.33.1=h53a641e_7
@@ -17,19 +23,25 @@ dependencies:
   - libstdcxx-ng=9.1.0=hdf63c60_0
   - mkl=2020.2=256
   - mkl-service=2.3.0=py38he904b0f_0
-  - mkl_fft=1.1.0=py38h23d657b_0
+  - mkl_fft=1.2.0=py38h23d657b_0
   - mkl_random=1.1.1=py38h0573a6f_0
   - ncurses=6.2=he6710b0_1
-  - numpy=1.19.1=py38hbc911f0_0
-  - numpy-base=1.19.1=py38hfa32c7d_0
-  - openssl=1.1.1g=h7b6447c_0
-  - pip=20.2.2=py38_0
+  - numpy=1.19.2=py38h54aff64_0
+  - numpy-base=1.19.2=py38hfa32c7d_0
+  - openssl=1.1.1h=h7b6447c_0
+  - pip=20.2.4=py38_0
+  - pycparser=2.20=py_2
+  - pyopenssl=19.1.0=py_1
+  - pysocks=1.7.1=py38_0
   - python=3.8.5=h7579374_1
   - readline=8.0=h7b6447c_0
-  - setuptools=49.6.0=py38_0
+  - requests=2.24.0=py_0
+  - setuptools=50.3.0=py38hb0f4dca_1
   - six=1.15.0=py_0
+  - soupsieve=2.0.1=py_0
   - sqlite=3.33.0=h62c20be_0
   - tk=8.6.10=hbc83047_0
+  - urllib3=1.25.11=py_0
   - wheel=0.35.1=py_0
   - xz=5.2.5=h7b6447c_0
   - zlib=1.2.11=h7b6447c_3

main.nf

Lines changed: 27 additions & 18 deletions
@@ -36,7 +36,6 @@ if(params.outdir == false){


 process downloadGenomes{
-    cache false
     publishDir "${params.outdir}/ncbi", mode: 'link'
     tag "Downloading..."

@@ -46,12 +45,10 @@ process downloadGenomes{
     script:
     """
     rsync -av rsync://ftp.ncbi.nlm.nih.gov/refseq/release/mitochondrion/*.genomic.gbff.gz .
-
     """
 }

 process extractFamilies{
-    cache false
     conda "$baseDir/envs/environment.yml"
     tag "Extracting..."

@@ -60,6 +57,7 @@ process extractFamilies{

     output:
     file "*.fasta" into extracted_fasta mode flatten
+    file "*.tsv" into convert_acc

     script:
     """
@@ -68,38 +66,36 @@ process extractFamilies{
 }

 extracted_fasta
-    .map{[it.baseName.split("_")[0], it.baseName.split("_")[1..-1].join("_"), file(it)]}
+    .map{[it.baseName.split("_")[0],it.baseName.split('_')[1..2].join("_"), it.baseName.split("_")[3..-1].join("_"), file(it)]}
     .set{extracted_fasta}


 process writeFastas{
-    cache false
+    conda "$baseDir/envs/environment.yml"
     publishDir "${params.outdir}/genomes/${family}/", saveAs: {"${species}.fasta"}, pattern: "*.fasta", mode:'link'
-
-    tag "$family:$species"
+    tag "Writing $family:$species"

     input:
-    set family, species, "input.fasta" from extracted_fasta
+    set family, accession, species, "input.fasta" from extracted_fasta

     output:
     set family, species, "output.fasta" into (for_bed, for_bwa, for_kraken)
-
+
     script:
     """
     cat input.fasta > output.fasta
     """
 }

 process indexFasta{
-    cache false
     publishDir "${params.outdir}/genomes/${family}/", mode: 'link'
     tag "$family:$species"

     input:
     set family, species, "${species}.fasta" from for_bwa

     output:
-    file "${species}.fasta.*"
+    file "${species}.fasta.*"

     script:
     """
@@ -108,7 +104,6 @@ process indexFasta{
 }

 process writeBedFiles{
-    cache false
     publishDir "${params.outdir}/masked/", saveAs: {"${species}.masked.bed"}, mode:'link'
     tag "$family:$species"

@@ -132,16 +127,16 @@ for_kraken


 process createKrakenDB{
-    cache false
     conda "$baseDir/envs/environment.yml"
-    tag "Wait ~ 30min."
+    tag "Wait! This takes > 30min."
+    publishDir("stats")

     input:
     each kmer from params.kmers
     file fasta_list from for_kraken

     output:
-    file "output.txt" into log
+    file "nucl_gb.accession2taxid" into taxid_map

     script:
     dbname = "Mito_db_kmer${kmer}"
@@ -156,13 +151,27 @@ process createKrakenDB{
     ${params.kraken}/kraken-build --add-to-library \${file%?} --db ${dbname};\
     done
     ${params.kraken}/kraken-build --build --db ${dbname} --kmer $kmer
-    ${params.kraken}/kraken-build --clean --db ${dbname}
+    mv $dbname/taxonomy/nucl_gb.accession2taxid .
     if [[ -d \$out/kraken ]];\
     then rm -fr \$out/kraken;\
     fi;
     mkdir \$out/kraken
-    cp -r ${dbname} \$out/kraken
-    touch "output.txt"
+    mv ${dbname} \$out/kraken/
+    """
+}
+process createFileMap{
+    publishDir "${params.outdir}/genomes", mode:'link'
+
+    input:
+    file "acc_map.tsv" from convert_acc
+    file "nucl_gb.accession2taxid" from taxid_map
+
+    output:
+    file "*.tsv"
+
+    script:
+    """
+    python3 $baseDir/bin/convert_acc_to_taxid.py acc_map.tsv nucl_gb.accession2taxid
     """
 }

