Skip to content

Commit a297c82

Browse files
committed
Fixes #1: ignore files with mismatch between GBFF taxid and NCBI taxonomy
1 parent 6f92c8c commit a297c82

File tree

2 files changed

+42
-5
lines changed

2 files changed

+42
-5
lines changed

main.nf

+28-3
Original file line numberDiff line numberDiff line change
@@ -166,14 +166,39 @@ ch_genomes_fasta_files = ch_genomes_fasta_files.combine(ch_krakenuniq_map, by:0)
166166
//combine with the extracted fasta
167167
ch_extracted_fasta = ch_extracted_fasta.mix(ch_genomes_fasta_files).unique{it[0]}
168168

169-
//and get the taxonomy from the json
170-
ch_extracted_fasta = ch_extracted_fasta.combine(json).map{id,taxid,fasta,marker,json -> [id,fasta,taxid,json[taxid],marker]}
169+
// and get the taxonomy from the json
170+
// check if the taxonomy exists!
171+
ch_extracted_fasta = ch_extracted_fasta.combine(json)
172+
.branch{ id,taxid,fasta,marker,json ->
173+
valid_taxid: json.containsKey(taxid)
174+
invalid_taxid: !json.containsKey(taxid)
175+
}
176+
177+
ch_extracted_fasta_valid = ch_extracted_fasta.valid_taxid.map{
178+
id,taxid,fasta,marker,json ->
179+
[
180+
id,
181+
fasta,
182+
taxid,
183+
json[taxid],
184+
marker
185+
]
186+
}
187+
188+
ch_extracted_fasta.invalid_taxid.collectFile(
189+
name:"FilesWithWrongTaxonomy.tsv",
190+
newLine:true,
191+
seed:["ID","TaxID","FileName"].join("\t"),
192+
storeDir: params.outdir
193+
){
194+
[it[0], it[1], it[2].baseName].join("\t")
195+
}
171196

172197
// In the updated nodes and names.dmp, a taxonomy can now have multiple subspecies (e.g. Denisova2 (sub) --> Denisova (sub) --> Homo sapiens (sp))
173198
// they overwrite each other!
174199
// So if the genome was extracted, use the species name from NCBI, if provided, use the accession ID as species name
175200

176-
ch_for_writing = ch_extracted_fasta.map{
201+
ch_for_writing = ch_extracted_fasta_valid.map{
177202
def species_name = it[4].extracted ? it[3].subspecies ?: it[3].species : it[0]
178203
[
179204
it[3].family ?: "Unclassified", // species without family entry cannot be handled by quicksand (downstream of krakenuniq). This is a few microbes that would need a custom taxonomy

modules/local/parse_taxonomy/resources/usr/bin/parse_taxonomy.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,14 @@ def get_nodes_dict(taxonomy_file):
3838
return taxonomy_dict
3939

4040
def traverse_tree(taxid, nodes):
41+
# this returns just a list of taxids, from the leaves to the root
4142
results = [taxid]
42-
record = nodes[taxid]
43+
try:
44+
record = nodes[taxid]
45+
except KeyError:
46+
#this happens if the NCBI gbff dbxref[taxon] tag doesnt match the NCBI taxonomy (it happens!)
47+
# return an empty list and report the missing ID downstream
48+
return []
4349
if "parent" in record:
4450
if record["parent"] != taxid:
4551
results.extend( traverse_tree(record["parent"], nodes) )
@@ -61,7 +67,13 @@ def traverse_tree(taxid, nodes):
6167

6268
final_json = {}
6369
for taxid in taxids:
64-
tmp = {nodes[x]['rank'] : names[x].replace(' ','_') for x in traverse_tree(taxid, nodes)}
70+
taxonomy = traverse_tree(taxid, nodes)
71+
72+
# early return from traverse_tree due to NCBI error?
73+
if taxonomy == []:
74+
continue
75+
76+
tmp = {nodes[x]['rank'] : names[x].replace(' ','_') for x in taxonomy}
6577
# {taxid:{ 'species':'Homo_sapiens', 'family':'Hominidae' }}
6678
if len(tmp)>0:
6779
final_json[taxid] = tmp

0 commit comments

Comments
 (0)