Skip to content

Commit

Permalink
Update scripts to work with new tsv header
Browse files Browse the repository at this point in the history
  • Loading branch information
rrwick committed Apr 5, 2018
1 parent ca428ef commit 70fccf8
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 10 deletions.
45 changes: 41 additions & 4 deletions scripts/summarise_contigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def summarise_contigs(result_file, threshold, top_num):
species_tallies = {x: 0 for x in species_names}
contig_tallies = {}
contig_names = []
contig_lengths = {}
contig_lengths = get_contig_lengths(result_file)

open_func = get_open_function(result_file)
with open_func(result_file, 'rt') as result:
Expand All @@ -58,7 +58,6 @@ def summarise_contigs(result_file, threshold, top_num):
contig_names.append(contig)
contig_tallies[contig] = {x: 0 for x in species_names}
position = int(parts[1])
contig_lengths[contig] = position
probabilities = [float(x) for x in parts[2:]]
assert len(probabilities) == len(species_names)
max_prob = max(probabilities)
Expand Down Expand Up @@ -89,17 +88,55 @@ def summarise_contigs(result_file, threshold, top_num):

def get_species_names(result_file):
    """
    Reads the species names from the '#'-prefixed header at the top of a result file.

    The header is read as a '#' line holding an integer (the species count) followed by one '#'
    line per species name. Reading stops at the next integer '#' line (the contig count) or at
    the first line which does not start with '#'.

    Args:
        result_file: path of the result file. It is opened in text mode with whatever
            get_open_function returns for it.

    Returns:
        A list of the species names, in the order they appear in the header.

    Raises:
        AssertionError: if the number of names read differs from the declared species count
            (this includes a header with no species count line at all).
    """
    species_names = []
    species_count = None
    open_func = get_open_function(result_file)
    with open_func(result_file, 'rt') as result:
        for line in result:
            if not line.startswith('#'):
                break
            line = line.strip()[1:]
            try:
                count = int(line)
            except ValueError:
                # Not a number, so this header line is a species name. Names are only collected
                # here (once), never before the line has been classified.
                species_names.append(line)
            else:
                if species_count is not None:  # we've hit the contig count
                    break
                species_count = count
    assert len(species_names) == species_count, \
        'header declares {} species but {} names were read'.format(species_count,
                                                                   len(species_names))
    return species_names


def get_contig_lengths(result_file):
    """
    Reads the contig lengths from the '#'-prefixed header at the top of a result file.

    Contig entries are the non-integer '#' lines which come after the second integer '#' line of
    the header (the contig count). Each entry is a contig name and a length separated by a single
    tab. Non-integer header lines seen before the second integer line are skipped, and reading
    stops at the first line which does not start with '#'.

    Args:
        result_file: path of the result file. It is opened in text mode with whatever
            get_open_function returns for it.

    Returns:
        A dictionary of contig name -> contig length (int). It is empty if the header never
        reaches a second integer line.

    Raises:
        ValueError: if a contig entry does not split into exactly two tab-separated fields, or
            if its length field is not an integer.
    """
    contig_lengths = {}
    integer_line_count = 0
    open_func = get_open_function(result_file)
    with open_func(result_file, 'rt') as result:
        for line in result:
            if not line.startswith('#'):
                break
            line = line.strip()[1:]
            try:
                int(line)
            except ValueError:
                is_integer_line = False
            else:
                is_integer_line = True
            if is_integer_line:
                # NOTE(review): the value on the second integer line (the contig count) is not
                # compared with the number of entries read - confirm whether it should be,
                # as is done for the species count.
                integer_line_count += 1
            elif integer_line_count == 2:
                contig_name, contig_length = line.split('\t')
                contig_lengths[contig_name] = int(contig_length)
    return contig_lengths


def print_header(top_species):
header = ['Contig', 'Length'] + top_species
print('\t'.join(header))
Expand Down
48 changes: 42 additions & 6 deletions scripts/summarise_species.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def summarise_species(args):
assembly_name = assembly_name.replace('_painted.tsv.gz', '').replace('_painted.tsv', '')
species_names = get_species_names(result_file)
species_tallies = {x: 0 for x in species_names}
contig_lengths = {}
contig_lengths = get_contig_lengths(result_file)

open_func = get_open_function(result_file)
with open_func(result_file, 'rt') as result:
Expand All @@ -99,7 +99,6 @@ def summarise_species(args):
parts = line.strip().split('\t')
contig = parts[0]
position = int(parts[1])
contig_lengths[contig] = position
probabilities = [float(x) for x in parts[2:]]
assert len(probabilities) == len(species_names)
max_prob = max(probabilities)
Expand All @@ -126,17 +125,54 @@ def summarise_species(args):

def get_species_names(result_file):
    """
    Reads the species names from the '#'-prefixed header at the top of a result file.

    The header is read as a '#' line holding an integer (the species count) followed by one '#'
    line per species name. Reading stops at the next integer '#' line (the contig count) or at
    the first line which does not start with '#'.

    Args:
        result_file: path of the result file. It is opened in text mode with whatever
            get_open_function returns for it.

    Returns:
        A list of the species names, in the order they appear in the header.

    Raises:
        AssertionError: if the number of names read differs from the declared species count
            (this includes a header with no species count line at all).
    """
    species_names = []
    species_count = None
    open_func = get_open_function(result_file)
    with open_func(result_file, 'rt') as result:
        for line in result:
            if not line.startswith('#'):
                break
            line = line.strip()[1:]
            try:
                count = int(line)
            except ValueError:
                # A non-numeric header line is a species name. This is the only place a name
                # is recorded, so each name is stored exactly once.
                species_names.append(line)
            else:
                if species_count is not None:  # we've hit the contig count
                    break
                species_count = count
    assert len(species_names) == species_count, \
        'header declares {} species but {} names were read'.format(species_count,
                                                                   len(species_names))
    return species_names


def get_contig_lengths(result_file):
    """
    Reads the contig lengths from the '#'-prefixed header at the top of a result file.

    Contig entries are the non-integer '#' lines which come after the second integer '#' line of
    the header (the contig count). Each entry is a contig name and a length separated by a single
    tab. Non-integer header lines seen before the second integer line are skipped, and reading
    stops at the first line which does not start with '#'.

    Args:
        result_file: path of the result file. It is opened in text mode with whatever
            get_open_function returns for it.

    Returns:
        A dictionary of contig name -> contig length (int). It is empty if the header never
        reaches a second integer line.

    Raises:
        ValueError: if a contig entry does not split into exactly two tab-separated fields, or
            if its length field is not an integer.
    """
    contig_lengths = {}
    integer_line_count = 0
    open_func = get_open_function(result_file)
    with open_func(result_file, 'rt') as result:
        for line in result:
            if not line.startswith('#'):
                break
            line = line.strip()[1:]
            try:
                int(line)
            except ValueError:
                is_integer_line = False
            else:
                is_integer_line = True
            if is_integer_line:
                # NOTE(review): the value on the second integer line (the contig count) is not
                # compared with the number of entries read - confirm whether it should be,
                # as is done for the species count.
                integer_line_count += 1
            elif integer_line_count == 2:
                contig_name, contig_length = line.split('\t')
                contig_lengths[contig_name] = int(contig_length)
    return contig_lengths


def get_compression_type(filename):
"""
Attempts to guess the compression (if any) on a file using the first few bytes.
Expand Down

0 comments on commit 70fccf8

Please sign in to comment.