From 0261da9db246d0b4004f4e533eb4406cace2871f Mon Sep 17 00:00:00 2001 From: Matt Stone Date: Mon, 13 Jan 2025 13:23:53 -0500 Subject: [PATCH 1/4] fix: sum_of_base_qualities should be zero when a read has no base qualities --- fgpyo/sam/__init__.py | 10 +++++++++- fgpyo/util/inspect.py | 2 +- tests/fgpyo/sam/test_sam.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/fgpyo/sam/__init__.py b/fgpyo/sam/__init__.py index 8fa2c19..7e5b207 100644 --- a/fgpyo/sam/__init__.py +++ b/fgpyo/sam/__init__.py @@ -849,16 +849,24 @@ def from_read(cls, read: pysam.AlignedSegment) -> List["SupplementaryAlignment"] def sum_of_base_qualities(rec: AlignedSegment, min_quality_score: int = 15) -> int: """Calculate the sum of base qualities score for an alignment record. - This function is useful for calculating the "mate score" as implemented in samtools fixmate. + This function is useful for calculating the "mate score" as implemented in `samtools fixmate`. + Consistently with `samtools fixmate`, this function returns 0 if the record has no base + qualities. Args: rec: The alignment record to calculate the sum of base qualities from. min_quality_score: The minimum base quality score to use for summation. + Returns: + The sum of base qualities on the input record. 0 if the record has no base qualities. + See: [`calc_sum_of_base_qualities()`](https://github.com/samtools/samtools/blob/4f3a7397a1f841020074c0048c503a01a52d5fa2/bam_mate.c#L227-L238) [`MD_MIN_QUALITY`](https://github.com/samtools/samtools/blob/4f3a7397a1f841020074c0048c503a01a52d5fa2/bam_mate.c#L42) """ + if rec.query_qualities is None: + return 0 + score: int = sum(qual for qual in rec.query_qualities if qual >= min_quality_score) return score diff --git a/fgpyo/util/inspect.py b/fgpyo/util/inspect.py index c75b577..abc9ad8 100644 --- a/fgpyo/util/inspect.py +++ b/fgpyo/util/inspect.py @@ -457,7 +457,7 @@ def attr_from( # True, because python, so we need to check for that explicitly if not set_value and attribute.type is not None and not attribute.type == bool: try: - return_value = attribute.type(str_value) # type: ignore[operator] + return_value = attribute.type(str_value) set_value = True except (ValueError, TypeError): pass diff --git a/tests/fgpyo/sam/test_sam.py b/tests/fgpyo/sam/test_sam.py index e822ee5..5f8b46e 100755 --- a/tests/fgpyo/sam/test_sam.py +++ b/tests/fgpyo/sam/test_sam.py @@ -10,6 +10,8 @@ import pysam import pytest +from pysam import AlignedSegment +from pysam import AlignmentFile from pysam import AlignmentHeader import fgpyo.sam as sam @@ -616,6 +618,32 @@ def test_sum_of_base_qualities_some_below_minimum() -> None: assert sum_of_base_qualities(single, min_quality_score=4) == 9 +def test_sum_of_base_qualities_unmapped(tmp_path: Path) -> None: + builder = SamBuilder(r1_len=5, r2_len=5) + single: AlignedSegment = builder.add_single() + + # NB: assigning to `query_qualities` does not affect the cached object returned by the property, + # but it does change the underlying attribute used when constructing the string representation + single.query_qualities = None + record_str = single.to_string() + + bam_path: Path = tmp_path / "no_qualities.bam" + + with bam_path.open("w") as bam_file: + bam_file.write(str(builder.header)) + bam_file.write(f"{record_str}\n") + + with AlignmentFile(str(bam_path)) as bam: + record = next(bam) + + # NB: writing to the temp file above is necessary to construct an `AlignedSegment` with no base + # qualities, as `SamBuilder` does not currently permit the construction of such records. + # TODO simplify this after `SamBuilder` is updated to support this. + # https://github.com/fulcrumgenomics/fgpyo/issues/211 + assert record.query_qualities is None + assert sum_of_base_qualities(record) == 0 + + def test_calc_edit_info_no_edits() -> None: chrom = "ACGCTAGACTGCTAGCAGCATCTCATAGCACTTCGCGCTATAGCGATATAAATATCGCGATCTAGCG" builder = SamBuilder(r1_len=30) From c01e6dc6e91e7d93c04773241ca752fb888b7482 Mon Sep 17 00:00:00 2001 From: Matt Stone Date: Mon, 13 Jan 2025 15:16:36 -0500 Subject: [PATCH 2/4] chore: add type-ignore back --- fgpyo/util/inspect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fgpyo/util/inspect.py b/fgpyo/util/inspect.py index abc9ad8..c75b577 100644 --- a/fgpyo/util/inspect.py +++ b/fgpyo/util/inspect.py @@ -457,7 +457,7 @@ def attr_from( # True, because python, so we need to check for that explicitly if not set_value and attribute.type is not None and not attribute.type == bool: try: - return_value = attribute.type(str_value) + return_value = attribute.type(str_value) # type: ignore[operator] set_value = True except (ValueError, TypeError): pass From 834210b24c891c36e25526e2eb3b0e473c61b6c9 Mon Sep 17 00:00:00 2001 From: Matt Stone Date: Tue, 21 Jan 2025 15:55:00 -0500 Subject: [PATCH 3/4] feat: check for no query qualities --- fgpyo/sam/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fgpyo/sam/__init__.py b/fgpyo/sam/__init__.py index 7e5b207..4d14363 100644 --- a/fgpyo/sam/__init__.py +++ b/fgpyo/sam/__init__.py @@ -158,6 +158,7 @@ import enum import io import sys +from array import array from collections.abc import Collection from itertools import chain from pathlib import Path @@ -178,6 +179,7 @@ from pysam import AlignedSegment from pysam import AlignmentFile as SamFile from pysam import AlignmentHeader as SamHeader +from pysam import qualitystring_to_array from typing_extensions import deprecated import fgpyo.io @@ -195,6 +197,12 @@ NO_REF_POS: int = -1 """The reference position to use to indicate no position in SAM/BAM.""" +STRING_PLACEHOLDER: str = "*" +"""The value to use when a string field's information is unavailable.""" + +NO_QUERY_QUALITIES: array = qualitystring_to_array(STRING_PLACEHOLDER) +"""The quality array corresponding to an unavailable query quality string ("*").""" + _IOClasses = (io.TextIOBase, io.BufferedIOBase, io.RawIOBase, io.IOBase) """The classes that should be treated as file-like classes""" @@ -864,7 +872,7 @@ def sum_of_base_qualities(rec: AlignedSegment, min_quality_score: int = 15) -> i [`calc_sum_of_base_qualities()`](https://github.com/samtools/samtools/blob/4f3a7397a1f841020074c0048c503a01a52d5fa2/bam_mate.c#L227-L238) [`MD_MIN_QUALITY`](https://github.com/samtools/samtools/blob/4f3a7397a1f841020074c0048c503a01a52d5fa2/bam_mate.c#L42) """ - if rec.query_qualities is None: + if rec.query_qualities is None or rec.query_qualities == NO_QUERY_QUALITIES: return 0 score: int = sum(qual for qual in rec.query_qualities if qual >= min_quality_score) From 4ef48d1d9f9d28a57ed7e6e74059af0ec5791128 Mon Sep 17 00:00:00 2001 From: Matt Stone Date: Thu, 23 Jan 2025 14:42:04 -0500 Subject: [PATCH 4/4] refactor: string placeholder assignment --- fgpyo/sam/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fgpyo/sam/__init__.py b/fgpyo/sam/__init__.py index 4d14363..7f2ec87 100644 --- a/fgpyo/sam/__init__.py +++ b/fgpyo/sam/__init__.py @@ -191,15 +191,15 @@ NO_REF_INDEX: int = -1 """The reference index to use to indicate no reference in SAM/BAM.""" -NO_REF_NAME: str = "*" +STRING_PLACEHOLDER: str = "*" +"""The value to use when a string field's information is unavailable.""" + +NO_REF_NAME: str = STRING_PLACEHOLDER """The reference name to use to indicate no reference in SAM/BAM.""" NO_REF_POS: int = -1 """The reference position to use to indicate no position in SAM/BAM.""" -STRING_PLACEHOLDER: str = "*" -"""The value to use when a string field's information is unavailable.""" - NO_QUERY_QUALITIES: array = qualitystring_to_array(STRING_PLACEHOLDER) """The quality array corresponding to an unavailable query quality string ("*")."""