
Commit b930c7b

Kraken2 depletion for human reads as alternative to bmtagger
* Add kraken_unclassified command that combines p filtering with depletion.
* Less sensitive than bmtagger (tradeoffs), but much faster; sensitivity can be controlled with p. A threshold of p=0.01-0.05 comes close to bmtagger, but is not as aggressive.
* A custom kraken2 build is needed to avoid executable name clashes with other kraken* packages.
1 parent 417069c commit b930c7b
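
For orientation, here is a minimal sketch of the per-read p score the new command thresholds on: the fraction of a read's counted k-mers that Kraken2 assigned to some taxid. The `p_score` function and sample line below are illustrative only, not part of this commit.

```python
# Illustrative sketch (not part of the commit): how the p score behind the
# -p/--pThreshold flag is computed for one line of Kraken2's per-read output,
# whose k-mer field looks like "9606:30 0:70 A:2 |:| ...".
def p_score(kmer_field):
    n_total = 0       # k-mers in the denominator ('A' = ambiguous is skipped)
    n_classified = 0  # k-mers assigned to a taxid ('0' = unclassified is skipped)
    for token in kmer_field.split():
        taxid, _, count = token.partition(':')
        if taxid in ('|', 'A'):  # paired-read joiner marker or ambiguous k-mers
            continue
        n_total += int(count)
        if taxid != '0':
            n_classified += int(count)
    return n_classified / n_total if n_total else 0.0

# A read whose counted k-mers are 30% human (taxid 9606) scores p = 0.3, so it
# is depleted at any threshold below 0.3.
assert p_score('9606:30 0:70') == 0.3
```

Reads scoring at or below the threshold are treated as unclassified and kept; everything above it is depleted.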

6 files changed: +360 −72 lines changed

metagenomics.py

+77 −7
@@ -861,6 +861,77 @@ def kaiju(inBam, db, taxDb, outReport, outReads=None, threads=None):
 __commands__.append(('kaiju', parser_kaiju))
 
 
+def extract_kraken_unclassified(in_kraken_reads, in_bam, out_bam, filter_taxids=None, filter_children=None, tax_dir=None, p_threshold=0.05):
+    if filter_children:
+        assert tax_dir
+        db = ncbitax.TaxonomyDb(tax_dir, load_nodes=True, load_merged=True)
+        filter_taxids = collect_children(db.children, set(filter_taxids))
+
+    qnames = set()
+    with gzip.open(in_kraken_reads, 'rt') as f:
+        for line in f:
+            parts = line.split('\t')
+            classified = parts[0] == 'C'
+            qname = parts[1]
+            taxid = parts[2]
+            length = parts[3]
+            # Check if already filtered and P= added
+            if '=' in parts[4]:
+                p = float(parts[4].split('=')[1])
+                kmer_str = parts[5]
+            else:
+                kmer_str = parts[4]
+            kmers = [kmer for kmer in kmer_str.rstrip().split(' ')]
+            kmer_counts = []
+            for kmer in kmers:
+                t = kmer.split(':')
+                # Kraken2 marker for paired read joiner
+                if t[0] == '|':
+                    continue
+                kmer_counts.append((t[0], int(t[1])))
+            n_kmers = 0
+            n_classified_kmers = 0
+            for kmer_taxid, count in kmer_counts:
+                if kmer_taxid != 'A':
+                    n_kmers += count
+                if kmer_taxid not in ('0', 'A'):
+                    if filter_taxids is None:
+                        n_classified_kmers += count
+                    elif int(kmer_taxid) in filter_taxids:
+                        n_classified_kmers += count
+            if n_kmers > 0:
+                p = n_classified_kmers / n_kmers
+            else:
+                p = 0
+
+            if p <= float(p_threshold):
+                if qname.endswith('/1') or qname.endswith('/2'):
+                    qname = qname[:-2]
+                qnames.add(qname)
+
+    in_sam_f = pysam.AlignmentFile(in_bam, 'rb', check_sq=False)
+    out_sam_f = pysam.AlignmentFile(out_bam, 'wb', template=in_sam_f)
+
+    for sam1 in in_sam_f:
+        if sam1.query_name in qnames:
+            out_sam_f.write(sam1)
+
+
+def parser_kraken_unclassified(parser=argparse.ArgumentParser()):
+    parser.add_argument('in_bam', metavar='inBam', help='Input original bam.')
+    parser.add_argument('in_kraken_reads', metavar='inKrakenReads', help='Input kraken reads.')
+    parser.add_argument('out_bam', metavar='outBam', help='Output extracted bam')
+    parser.add_argument('-p', '--pThreshold', dest='p_threshold', default=0.05, help='Kraken p threshold')
+    parser.add_argument('--filterTaxids', dest='filter_taxids', nargs='+', type=int, help='Taxids to filter/deplete out')
+    parser.add_argument('--filterChildren', dest='filter_children', action='store_true', help='Include filter taxid descendants when filtering - requires taxDb')
+    parser.add_argument('--taxDb', dest='tax_dir', help='Directory of NCBI taxonomy db')
+    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
+    util.cmd.attach_main(parser, extract_kraken_unclassified, split_args=True)
+    return parser
+
+
+__commands__.append(('kraken_unclassified', parser_kraken_unclassified))
+
+
 def sam_lca_report(tax_db, bam_aligned, outReport, outReads=None, unique_only=None):
 
     if outReads:
@@ -950,7 +1021,6 @@ def metagenomic_report_merge(metagenomic_reports, out_kraken_summary, kraken_db,
 __commands__.append(('report_merge', parser_metagenomic_report_merge))
 
 
-
 def fasta_library_accessions(library):
     '''Parse accession from ids of fasta files in library directory. '''
     library_accessions = set()
@@ -962,7 +1032,7 @@ def fasta_library_accessions(library):
         for seqr in SeqIO.parse(filepath, 'fasta'):
             name = seqr.name
             # Search for accession
-            mo = re.search('([A-Z]+_?\d+\.\d+)', name)
+            mo = re.search(r'([A-Z]+_?\d+\.\d+)', name)
             if mo:
                 accession = mo.group(1)
                 library_accessions.add(accession)
@@ -973,9 +1043,9 @@ class KrakenUniqBuildError(Exception):
     '''Error while building KrakenUniq database.'''
 
 def parser_filter_bam_to_taxa(parser=argparse.ArgumentParser()):
-    parser.add_argument('in_bam', help='Input bam file.')
-    parser.add_argument('read_IDs_to_tax_IDs', help='TSV file mapping read IDs to taxIDs, Kraken-format by default. Assumes bijective mapping of read ID to tax ID.')
-    parser.add_argument('out_bam', help='Output bam file, filtered to the taxa specified')
+    parser.add_argument('in_bam', metavar='inBam', help='Input bam file.')
+    parser.add_argument('reads_tsv', metavar='readsTsv', help='TSV file mapping read IDs to taxIDs, Kraken-format by default. Assumes bijective mapping of read ID to tax ID.')
+    parser.add_argument('out_bam', metavar='outBam', help='Output bam file, filtered to the taxa specified')
     parser.add_argument('nodes_dmp', help='nodes.dmp file from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/')
     parser.add_argument('names_dmp', help='names.dmp file from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/')
     parser.add_argument('--taxNames', nargs="+", dest="tax_names", help='The taxonomic names to include. More than one can be specified. Mapped to Tax IDs by lowercase exact match only. Ex. "Viruses" This is in addition to any taxonomic IDs provided.')
@@ -992,7 +1062,7 @@ def parser_filter_bam_to_taxa(parser=argparse.ArgumentParser()):
     util.cmd.attach_main(parser, filter_bam_to_taxa, split_args=True)
     return parser
 
-def filter_bam_to_taxa(in_bam, read_IDs_to_tax_IDs, out_bam,
+def filter_bam_to_taxa(in_bam, reads_tsv, out_bam,
                        nodes_dmp, names_dmp,
                        tax_names=None, tax_ids=None,
                        omit_children=False,
@@ -1049,7 +1119,7 @@ def filter_bam_to_taxa(in_bam, read_IDs_to_tax_IDs, out_bam,
     with util.file.tempfname(".txt.gz") as temp_read_list:
         with open_or_gzopen(temp_read_list, "wt") as read_IDs_file:
             read_ids_written = 0
-            for row in util.file.read_tabfile(read_IDs_to_tax_IDs):
+            for row in util.file.read_tabfile(reads_tsv):
                 assert tax_id_col < len(row), "tax_id_col does not appear to be in range for number of columns present in mapping file"
                 assert read_id_col < len(row), "read_id_col does not appear to be in range for number of columns present in mapping file"
                 read_id = row[read_id_col]
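
Once merged, the new subcommand can be driven through its parser exactly as the unit tests later in this diff do. A hedged sketch, with placeholder file names:

```python
import argparse

import metagenomics

# Placeholder paths; this mirrors test_kraken_unclassified further down in this diff.
parser = metagenomics.parser_kraken_unclassified(argparse.ArgumentParser())
args = parser.parse_args([
    'input.bam',                  # original reads
    'input.kraken-reads.tsv.gz',  # Kraken2 per-read classifications (gzipped)
    'depleted.bam',               # reads with p <= threshold are written here
    '-p', '0.05',                 # the "close to bmtagger" range from the commit message
])
args.func_main(args)
```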

taxon_filter.py

+65
@@ -246,6 +246,71 @@ def parser_filter_lastal_bam(parser=argparse.ArgumentParser()):
 __commands__.append(('filter_lastal_bam', parser_filter_lastal_bam))
 
 
+# ==============================
+# *** deplete kraken2 ***
+# ==============================
+def parser_deplete_bam_kraken2(parser=argparse.ArgumentParser()):
+    parser.add_argument('in_bam', metavar='inBam', help='Input BAM file.')
+    parser.add_argument('db', help='Kraken2 database directory.')
+    parser.add_argument('out_bam', metavar='outBam', help='Output BAM file.')
+    parser.add_argument('--taxDb', dest='tax_dir', help='Directory of NCBI taxonomy db')
+    parser.add_argument(
+        '--JVMmemory',
+        default=tools.picard.FilterSamReadsTool.jvmMemDefault,
+        help='JVM virtual memory size (default: %(default)s)'
+    )
+    parser.add_argument('-p', '--pThreshold', dest='p_threshold', default=0.5, help='Kraken p threshold')
+    parser.add_argument('--filterTaxid', dest='filter_taxid', type=int, help='Taxid to filter/deplete out')
+    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
+    util.cmd.attach_main(parser, main_deplete_bam_kraken2)
+    return parser
+
+def main_deplete_bam_kraken2(args):
+    '''Use kraken2 to deplete input reads against a kraken2 database.'''
+
+    kraken2 = tools.kraken.Kraken2()
+    with util.file.tempfname() as k_reads_fn, util.file.tempfname() as k_report_fn:
+        kraken2.classify(args.db, args.tax_dir, args.in_bam,
+                         output_report=k_report_fn, output_reads=k_reads_fn,
+                         num_threads=None)
+        metagenomics.extract_kraken_unclassified(
+            k_reads_fn, args.in_bam, args.out_bam,
+            filter_taxids=[args.filter_taxid] if args.filter_taxid is not None else None,
+            tax_dir=args.tax_dir,
+            p_threshold=args.p_threshold)
+
+__commands__.append(('deplete_bam_kraken2', parser_deplete_bam_kraken2))
+
+
+def multi_db_deplete_bam(inBam, refDbs, deplete_method, outBam, **kwargs):
+
+    tmpDb = None
+    if len(refDbs) > 1 and not any(
+            not os.path.exists(db)  # indexed db prefix
+            or os.path.isdir(db)  # indexed db in directory
+            or (os.path.isfile(db) and ('.tar' in db or '.tgz' in db or '.zip' in db))  # packaged indexed db
+            for db in refDbs):
+        # this is a scenario where all refDbs are unbuilt fasta
+        # files. we can simplify and speed up execution by
+        # concatenating them all and running deplete_method
+        # just once
+        tmpDb = mkstempfname('.fasta')
+        merge_compressed_files(refDbs, tmpDb, sep='\n')
+        refDbs = [tmpDb]
+
+    samtools = tools.samtools.SamtoolsTool()
+    tmpBamIn = inBam
+    for db in refDbs:
+        if not samtools.isEmpty(tmpBamIn):
+            tmpBamOut = mkstempfname('.bam')
+            deplete_method(tmpBamIn, db, tmpBamOut, **kwargs)
+            if tmpBamIn != inBam:
+                os.unlink(tmpBamIn)
+            tmpBamIn = tmpBamOut
+    shutil.copyfile(tmpBamIn, outBam)
+
+    if tmpDb:
+        os.unlink(tmpDb)
+
 # ==============================
 # *** deplete_bmtagger_bam ***
 # ==============================
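
multi_db_deplete_bam threads a shrinking BAM through one depletion pass per database. Below is a sketch of plugging a kraken2-based depleter into it; deplete_with_kraken2, the database paths, and the num_threads default are assumptions layered on the classify() call shape used in main_deplete_bam_kraken2 above, not part of this commit.

```python
# Hypothetical adapter matching multi_db_deplete_bam's deplete_method
# signature (inBam, db, outBam, **kwargs); assumes the kraken2.classify()
# call shape used in main_deplete_bam_kraken2 above.
def deplete_with_kraken2(inBam, db, outBam, tax_dir=None, p_threshold=0.05):
    kraken2 = tools.kraken.Kraken2()
    with util.file.tempfname() as k_reads, util.file.tempfname() as k_report:
        kraken2.classify(db, tax_dir, inBam,
                         output_report=k_report, output_reads=k_reads,
                         num_threads=None)
        metagenomics.extract_kraken_unclassified(
            k_reads, inBam, outBam, tax_dir=tax_dir, p_threshold=p_threshold)

# Each database pass consumes the previous pass's output BAM.
# 'kraken2_human_db', 'kraken2_mouse_db', and 'taxonomy_db' are placeholders.
multi_db_deplete_bam('input.bam', ['kraken2_human_db', 'kraken2_mouse_db'],
                     deplete_with_kraken2, 'cleaned.bam', tax_dir='taxonomy_db')
```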

test/__init__.py

+4 −1
@@ -55,7 +55,10 @@ def assert_equal_bam_reads(testCase, bam_filename1, bam_filename2):
     samtools.view(args=[], inFile=bam_filename2, outFile=sam_two)
 
     try:
-        testCase.assertTrue(filecmp.cmp(sam_one, sam_two, shallow=False))
+        if testCase:
+            testCase.assertTrue(filecmp.cmp(sam_one, sam_two, shallow=False))
+        else:
+            assert filecmp.cmp(sam_one, sam_two, shallow=False)
     finally:
         for fname in [sam_one, sam_two]:
             if os.path.exists(fname):
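
Making testCase optional lets pytest-style function tests reuse the helper without a unittest.TestCase, as the converted tests below do. A minimal sketch of both call styles; the import path assumes the helper stays in test/__init__.py, and the BAM names are placeholders:

```python
import unittest

from test import assert_equal_bam_reads  # helper defined in test/__init__.py

class ExampleTest(unittest.TestCase):
    def test_bams_match(self):
        # unittest style: pass the TestCase so assertTrue is used
        assert_equal_bam_reads(self, 'got.bam', 'expected.bam')

def test_bams_match_pytest_style():
    # pytest style: pass None to fall back to a bare assert
    assert_equal_bam_reads(None, 'got.bam', 'expected.bam')
```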

test/unit/test_metagenomics.py

+70 −63
@@ -6,6 +6,7 @@
 import copy
 import os.path
 from os.path import join
+import subprocess
 import tempfile
 import textwrap
 import unittest
@@ -26,32 +27,22 @@
 from io import StringIO
 
 
-class TestCommandHelp(unittest.TestCase):
-
-    def test_help_parser_for_each_command(self):
-        for cmd_name, parser_fun in metagenomics.__commands__:
-            parser = parser_fun(argparse.ArgumentParser())
-            helpstring = parser.format_help()
-
-
-class TestKronaCalls(TestCaseWithTmp):
-
-    def setUp(self):
-        super().setUp()
-        patcher = patch('tools.krona.Krona', autospec=True)
-        self.addCleanup(patcher.stop)
-        self.mock_krona = patcher.start()
-
-        self.inTsv = util.file.mkstempfname('.tsv')
-        self.db = tempfile.mkdtemp('db')
-
-    def test_krona_import_taxonomy(self):
-        out_html = util.file.mkstempfname('.html')
-        metagenomics.krona(self.inTsv, self.db, out_html, queryColumn=3, taxidColumn=5, scoreColumn=7,
-                           noHits=True, noRank=True, inputType='tsv')
-        self.mock_krona().import_taxonomy.assert_called_once_with(
-            self.db, [self.inTsv], out_html, query_column=3, taxid_column=5, score_column=7,
-            no_hits=True, no_rank=True, magnitude_column=None, root_name=os.path.basename(self.inTsv))
+def test_help_parser_for_each_command():
+    for cmd_name, parser_fun in metagenomics.__commands__:
+        parser = parser_fun(argparse.ArgumentParser())
+        helpstring = parser.format_help()
+
+
+def test_krona_import_taxonomy(mocker):
+    p = mocker.patch('tools.krona.Krona', autospec=True)
+    out_html = util.file.mkstempfname('.html')
+    in_tsv = util.file.mkstempfname('.tsv')
+    db = tempfile.mkdtemp('db')
+    metagenomics.krona(in_tsv, db, out_html, queryColumn=3, taxidColumn=5, scoreColumn=7,
+                       noHits=True, noRank=True, inputType='tsv')
+    p().import_taxonomy.assert_called_once_with(
+        db, [in_tsv], out_html, query_column=3, taxid_column=5, score_column=7,
+        no_hits=True, no_rank=True, magnitude_column=None, root_name=os.path.basename(in_tsv))
 
 
 @pytest.fixture
@@ -123,8 +114,7 @@ def ranks():
 
 @pytest.fixture
 def simple_m8():
-    test_path = join(util.file.get_test_input_path(),
-                     'TestTaxonomy')
+    test_path = join(util.file.get_test_input_path(), 'TestTaxonomy')
     return open(join(test_path, 'simple.m8'))
 
 
@@ -170,8 +160,7 @@ def test_rank_code():
 
 
 def test_blast_records(simple_m8):
-    test_path = join(util.file.get_test_input_path(),
-                     'TestTaxonomy')
+    test_path = join(util.file.get_test_input_path(), 'TestTaxonomy')
     with simple_m8 as f:
         records = list(metagenomics.blast_records(f))
     assert len(records) == 110
@@ -180,8 +169,7 @@ def test_blast_records(simple_m8):
 
 
 def test_blast_lca(taxa_db_simple, simple_m8):
-    test_path = join(util.file.get_test_input_path(),
-                     'TestTaxonomy')
+    test_path = join(util.file.get_test_input_path(), 'TestTaxonomy')
     expected = textwrap.dedent("""\
         C\tM04004:13:000000000-AGV3H:1:1101:12068:2105\t2
        C\tM04004:13:000000000-AGV3H:1:1101:13451:2146\t2
@@ -283,45 +271,64 @@ def test_kaiju(mocker):
     p.assert_called_with('db.fmi', 'tax_db', 'input.bam', output_report='output.report', num_threads=mock.ANY, output_reads='output.reads')
 
 
-class TestBamFilter(TestCaseWithTmp):
-    def test_bam_filter_simple(self):
-        temp_dir = tempfile.gettempdir()
-        input_dir = util.file.get_test_input_path(self)
-        taxonomy_dir = os.path.join(util.file.get_test_input_path(),"TestMetagenomicsSimple","db","taxonomy")
-
-        filtered_bam = util.file.mkstempfname('.bam')
-        args = [
-            os.path.join(input_dir,"input.bam"),
-            os.path.join(input_dir,"input.kraken-reads.tsv.gz"),
-            filtered_bam,
-            os.path.join(taxonomy_dir,"nodes.dmp"),
-            os.path.join(taxonomy_dir,"names.dmp"),
-            "--taxNames",
-            "Ebolavirus"
-        ]
-        args = metagenomics.parser_filter_bam_to_taxa(argparse.ArgumentParser()).parse_args(args)
-        args.func_main(args)
-
-        expected_bam = os.path.join(input_dir,"expected.bam")
-        assert_equal_bam_reads(self, filtered_bam, expected_bam)
-
-    def test_bam_filter_by_tax_id(self):
-        temp_dir = tempfile.gettempdir()
-        input_dir = util.file.get_test_input_path(self)
-        taxonomy_dir = os.path.join(util.file.get_test_input_path(),"TestMetagenomicsSimple","db","taxonomy")
-
-        filtered_bam = util.file.mkstempfname('.bam')
-        args = [
-            os.path.join(input_dir,"input.bam"),
-            os.path.join(input_dir,"input.kraken-reads.tsv.gz"),
-            filtered_bam,
-            os.path.join(taxonomy_dir,"nodes.dmp"),
-            os.path.join(taxonomy_dir,"names.dmp"),
-            "--taxIDs",
-            "186538"
-        ]
-        args = metagenomics.parser_filter_bam_to_taxa(argparse.ArgumentParser()).parse_args(args)
-        args.func_main(args)
-
-        expected_bam = os.path.join(input_dir,"expected.bam")
-        assert_equal_bam_reads(self, filtered_bam, expected_bam)
+@pytest.fixture
+def taxonomy_dir():
+    return os.path.join(util.file.get_test_input_path(), "TestMetagenomicsSimple", "db", "taxonomy")
+
+
+def test_kraken_unclassified(taxonomy_dir):
+    input_dir = join(util.file.get_test_input_path(), 'TestBamFilter')
+
+    for p, n_leftover in zip([0.05, 0.5], [37, 579]):
+        filtered_bam = util.file.mkstempfname('.bam')
+        cmd_args = [
+            os.path.join(input_dir, "input.bam"),
+            os.path.join(input_dir, "input.kraken-reads.tsv.gz"),
+            filtered_bam,
+            '-p', str(p)
+        ]
+        args = metagenomics.parser_kraken_unclassified(argparse.ArgumentParser()).parse_args(cmd_args)
+        args.func_main(args)
+
+        cmd = ['samtools', 'view', filtered_bam]
+        p = subprocess.run(cmd, stdout=subprocess.PIPE)
+        assert len(p.stdout.decode('utf-8').split('\n')) == n_leftover
+
+
+def test_bam_filter_simple(taxonomy_dir):
+    input_dir = join(util.file.get_test_input_path(), 'TestBamFilter')
+
+    filtered_bam = util.file.mkstempfname('.bam')
+    args = [
+        os.path.join(input_dir, "input.bam"),
+        os.path.join(input_dir, "input.kraken-reads.tsv.gz"),
+        filtered_bam,
+        os.path.join(taxonomy_dir, "nodes.dmp"),
+        os.path.join(taxonomy_dir, "names.dmp"),
+        "--taxNames",
+        "Ebolavirus"
+    ]
+    args = metagenomics.parser_filter_bam_to_taxa(argparse.ArgumentParser()).parse_args(args)
+    args.func_main(args)
+
+    expected_bam = os.path.join(input_dir, "expected.bam")
+    assert_equal_bam_reads(None, filtered_bam, expected_bam)
+
+
+def test_bam_filter_by_tax_id(taxonomy_dir):
+    input_dir = join(util.file.get_test_input_path(), 'TestBamFilter')
+    filtered_bam = util.file.mkstempfname('.bam')
+    args = [
+        os.path.join(input_dir, "input.bam"),
+        os.path.join(input_dir, "input.kraken-reads.tsv.gz"),
+        filtered_bam,
+        os.path.join(taxonomy_dir, "nodes.dmp"),
+        os.path.join(taxonomy_dir, "names.dmp"),
+        "--taxIDs",
+        "186538"
+    ]
+    args = metagenomics.parser_filter_bam_to_taxa(argparse.ArgumentParser()).parse_args(args)
+    args.func_main(args)
+
+    expected_bam = os.path.join(input_dir, "expected.bam")
+    assert_equal_bam_reads(None, filtered_bam, expected_bam)
