Skip to content

Commit

Permalink
Merge pull request #162 from andersen-lab/demix-low-cov-grouping
Browse files Browse the repository at this point in the history
Fix KeyError: 'recombinant parents'
  • Loading branch information
dylanpilz authored Aug 1, 2023
2 parents 4ce7f03 + 4cd3356 commit 3e04680
Showing 1 changed file with 29 additions and 25 deletions.
54 changes: 29 additions & 25 deletions freyja/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -862,57 +862,65 @@ def collapse_barcodes(df_barcodes, df_depth, depthcutoff, locDir, output):
lineage_data = {lineage['name']: lineage for lineage in lineage_yml}

alias_count = {}
merging_recomb = False
collapsed_lineages = {}

# collapse lineages into MRCA, where possible
for tup in duplicates:
pango_aliases = [lineage_data[lin]['alias']
for lin in tup]
alias_dict = {lineage_data[lin]['alias']: lin for lin in tup}

# handle case where recombinant and non-recombinant lins are merged
recomb_and_nonrecomb = len(
# handle cases where multiple lineage classes are being merged
# e.g. (A.5, B.12) or (XBB, XBN)
multiple_lin_classes = len(
set([alias[0] for alias in pango_aliases])) > 1
if recomb_and_nonrecomb:

if multiple_lin_classes:
# for recombinant lineages, find the parent lineages
parent_aliases = []
for alias in pango_aliases:
if alias.startswith('X'):

if 'recombinant_parents' in lineage_data[alias_dict[alias]]:

# replace with its recombinant parents
pango_aliases.remove(alias)
parents = lineage_data[alias]['recombinant_parents']\
parents = lineage_data[alias]['recombinant_parents'] \
.replace('*', '').split(',')
parents = [lineage_data[lin]['alias'] for lin in parents]
for alias in pango_aliases:
for parent in parents:
if alias.startswith(parent) and\
parent not in pango_aliases:
pango_aliases.append(parent)

merging_recomb = True
# only consider parents related to the other lineages
for parent in parents:
if any([alias.startswith(parent)
for alias in pango_aliases]) and \
parent not in pango_aliases:
parent_aliases.append(parent)

pango_aliases += parent_aliases

# get MRCA
mrca = os.path.commonpath(
[lin.replace('.', '/') for lin in pango_aliases]
).replace('/', '.')

# attempting to collapse multiple recombinant lineages
if len(mrca) == 0 and all([alias.startswith('X')
for alias in pango_aliases]):
mrca = 'Recombinant'
merging_recomb = True
# assign placeholder if no MRCA found
if len(mrca) == 0:
mrca = 'Misc'
else:
# otherwise, get the shortened alias, if available
for lineage in lineage_data:
if lineage_data[lineage]['alias'] == mrca:
mrca = lineage
# add flag to indicate that this is a merged lineage
mrca = lineage + '-like'
break

# add flag to indicate that this is a merged lineage
mrca += '-like'
# include index for multiple barcode classes with same MRCA
if mrca not in alias_count:
alias_count[mrca] = 0
else:
alias_count[mrca] += 1
mrca += f'({alias_count[mrca]})'

collapsed_lineages[mrca] = list(tup)

df_barcodes = df_barcodes.rename({lin: mrca for lin in tup})
df_barcodes = df_barcodes.drop_duplicates()

Expand All @@ -926,10 +934,6 @@ def collapse_barcodes(df_barcodes, df_depth, depthcutoff, locDir, output):
yaml.dump(collapsed_lineages, f, default_flow_style=False)

print(f'collapsed lineages saved to {output}')
if merging_recomb:
print('warning: Recombinant and non-recombinant lineage barcodes'
' being merged based on available sequence coverage and'
' --depthcutoff value. Solution may be inaccurate.')

return df_barcodes

Expand Down

0 comments on commit 3e04680

Please sign in to comment.