Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AVITI manifest improvements #374

Merged
merged 9 commits into from
Nov 4, 2024
4 changes: 4 additions & 0 deletions VERSIONLOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Scilifelab_epps Version Log

## 20241104.2

For AVITI manifest generation: make PhiX manifest variant, fix udf typo, remove unused func, clarify var names, add cases to reverse-compliment Index2.

## 20241104.1

Suspected bugfix for BA parsing script.
Expand Down
69 changes: 38 additions & 31 deletions scripts/generate_aviti_run_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,12 @@ def get_manifests(process: Process, manifest_root_name: str) -> list[tuple[str,
if sample.project:
project = sample.project.name.replace(".", "__").replace(",", "")
seq_setup = sample.project.udf.get("Sequencing setup", "0-0")
user_library = (
True
if sample.project.udf["Library construction method"]
== "Finished library (by user)"
else False
)
else:
project = "Control"
seq_setup = "0-0"
Expand All @@ -200,11 +206,21 @@ def get_manifests(process: Process, manifest_root_name: str) -> list[tuple[str,
row["SampleName"] = sample.name
if isinstance(idx, tuple):
row["Index1"], row["Index2"] = idx
# Special cases to reverse-complement index2
if not user_library or (
user_library
and (
TENX_DUAL_PAT.findall(lims_label)
or SMARTSEQ_PAT.findall(lims_label)
)
):
logging.info(f"Reverse-complementing index2 of {sample.name}.")
row["Index2"] = revcomp(row["Index2"])
else:
row["Index1"] = idx
# Assume long idx2 from recipe + no idx2 from label means idx2 is UMI
if int(process.udf.get("Index read 2", 0)) > 12:
row["Index2"] = "N" * int(process.udf["Index read 2"])
if int(process.udf.get("Index Read 2", 0)) > 12:
row["Index2"] = "N" * int(process.udf["Index Read 2"])
else:
row["Index2"] = ""
row["Lane"] = lane
Expand Down Expand Up @@ -259,7 +275,7 @@ def get_manifests(process: Process, manifest_root_name: str) -> list[tuple[str,

# Start building manifests
manifests: list[tuple[str, str]] = []
for manifest_type in ["untrimmed", "trimmed", "empty"]:
for manifest_type in ["untrimmed", "trimmed", "phix", "empty"]:
manifest_name, manifest_contents = make_manifest(
df_samples_and_controls,
process,
Expand Down Expand Up @@ -292,6 +308,7 @@ def make_manifest(
].copy()

file_name = f"{manifest_root_name}_{manifest_type}.csv"

runValues_section = "\n".join(
[
"[RUNVALUES]",
Expand Down Expand Up @@ -320,6 +337,10 @@ def make_manifest(

samples_section = f"[SAMPLES]\n{df.to_csv(index=None, header=True)}"

elif manifest_type == "phix":
df = df[df["Project"] == "Control"]
samples_section = f"[SAMPLES]\n{df.to_csv(index=None, header=True)}"

elif manifest_type == "empty":
samples_section = ""

Expand All @@ -333,24 +354,8 @@ def make_manifest(
return (file_name, manifest_contents)


def fit_seq(seq: str, length: int, seq_extension: str | None = None) -> str:
"""Fit a sequence to a given length by extending or truncating."""
if len(seq) == length:
return seq
elif len(seq) > length:
return seq[:length]
else:
if seq_extension is None:
raise AssertionError("Can't extend sequence without extension string.")
else:
if length - len(seq) > len(seq_extension):
raise AssertionError(
"Extension string too short to fit sequence to desired length."
)
return seq + seq_extension[: length - len(seq)]


def check_distances(rows: list[dict], threshold=3) -> None:
def check_distances(rows: list[dict], threshold=2) -> None:
"""Iterator function to check index distances between all pairs of samples."""
for i in range(len(rows)):
row = rows[i]

Expand All @@ -369,26 +374,28 @@ def check_pair_distance(row, row_comp, check_flips: bool = False, threshold: int
"""

if check_flips:
flips = []
for a1, _a1 in zip(
[row["Index1"], revcomp(row["Index1"])], ["Index1", "Index1_rc"]
flips: list[tuple[int, str, str]] = []
for s1i1, s1i1_name in zip(
[row["Index1"], revcomp(row["Index1"])],
["Index1", "Index1_rc"],
):
for a2, _a2 in zip(
[row["Index2"], revcomp(row["Index2"])], ["Index2", "Index2_rc"]
for s1i2, s1i2_name in zip(
[row["Index2"], revcomp(row["Index2"])],
["Index2", "Index2_rc"],
):
for b1, _b1 in zip(
for s2i1, s2i1_name in zip(
[row_comp["Index1"], revcomp(row_comp["Index1"])],
["Index1", "Index1_rc"],
):
for b2, _b2 in zip(
for s2i2, s2i2_name in zip(
[row_comp["Index2"], revcomp(row_comp["Index2"])],
["Index2", "Index2_rc"],
):
flips.append(
(
distance(a1, b1) + distance(a2, b2),
f"{a1}-{a2} {b1}-{b2}",
f"{_a1}-{_a2} {_b1}-{_b2}",
distance(s1i1, s2i1) + distance(s1i2, s2i2),
f"{s1i1}-{s1i2} {s2i1}-{s2i2}",
f"{s1i1_name}-{s1i2_name} {s2i1_name}-{s2i2_name}",
)
)
dist, compared_seqs, flip_conf = min(flips, key=lambda x: x[0])
Expand Down
Loading