@@ -166,14 +166,39 @@ ch_genomes_fasta_files = ch_genomes_fasta_files.combine(ch_krakenuniq_map, by:0)
166
166
// combine with the extracted fasta, keeping a single entry per first tuple element (the id)
167
167
ch_extracted_fasta = ch_extracted_fasta. mix(ch_genomes_fasta_files). unique{it[0 ]}
168
168
169
- // and get the taxonomy from the json
170
- ch_extracted_fasta = ch_extracted_fasta. combine(json). map{id,taxid,fasta,marker,json -> [id,fasta,taxid,json[taxid],marker]}
169
+ // look up the taxonomy of each entry in the json
170
+ // but first check that the taxonomy exists: entries are split by whether the json has their taxid as a key
171
+ ch_extracted_fasta = ch_extracted_fasta. combine(json) // each tuple becomes [id, taxid, fasta, marker, json]
172
+ .branch{ id,taxid,fasta,marker,json ->
173
+ valid_taxid : json. containsKey(taxid)
174
+ invalid_taxid : ! json. containsKey(taxid)
175
+ }
176
+
177
+ ch_extracted_fasta_valid = ch_extracted_fasta. valid_taxid. map{ // reorder to [id, fasta, taxid, taxonomy entry, marker]
178
+ id,taxid,fasta,marker,json ->
179
+ [
180
+ id,
181
+ fasta,
182
+ taxid,
183
+ json[taxid], // taxonomy entry stored in the json under this taxid
184
+ marker
185
+ ]
186
+ }
187
+
188
+ ch_extracted_fasta. invalid_taxid. collectFile( // report entries whose taxid is missing from the json in a TSV file
189
+ name :" FilesWithWrongTaxonomy.tsv" ,
190
+ newLine :true ,
191
+ seed :[" ID" ," TaxID" ," FileName" ]. join(" \t " ), // header row
192
+ storeDir : params. outdir
193
+ ){ // one tab-separated row per entry: id, taxid, base name of the fasta file
194
+ [it[0 ], it[1 ], it[2 ]. baseName]. join(" \t " )
195
+ }
171
196
172
197
// In the updated nodes.dmp and names.dmp, a lineage can now contain multiple subspecies levels (e.g. Denisova2 (sub) --> Denisova (sub) --> Homo sapiens (sp))
173
198
// they overwrite each other!
174
199
// So if the genome was extracted, use the (sub)species name from NCBI; if the genome was provided instead, use the accession ID as the species name
175
200
176
- ch_for_writing = ch_extracted_fasta . map{
201
+ ch_for_writing = ch_extracted_fasta_valid . map{
177
202
def species_name = it[4 ]. extracted ? it[3 ]. subspecies ?: it[3 ]. species : it[0 ]
178
203
[
179
204
it[3 ]. family ?: " Unclassified" , // species without family entry cannot be handled by quicksand (downstream of krakenuniq). This is a few microbes that would need a custom taxonomy
0 commit comments