Skip to content

Commit 7b39e61

Browse files
authored
Merge pull request #393 from kedhammar/aviti-trim-phix
AVITI manifest fixes
2 parents 1420d1b + d8fed2e commit 7b39e61

File tree

1 file changed

+123
-40
lines changed

1 file changed

+123
-40
lines changed

scripts/generate_aviti_run_manifest.py

Lines changed: 123 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from genologics.config import BASEURI, PASSWORD, USERNAME
1414
from genologics.entities import Process
1515
from genologics.lims import Lims
16-
from Levenshtein import hamming as distance
16+
from Levenshtein import distance
1717

1818
from data.Chromium_10X_indexes import Chromium_10X_indexes
1919
from scilifelab_epps.epp import upload_file
@@ -145,7 +145,9 @@ def dict_to_manifest_col(d: dict) -> str:
145145
return s
146146

147147

148-
def get_manifests(process: Process, manifest_root_name: str) -> list[tuple[str, str]]:
148+
def get_manifests(
149+
process: Process, manifest_root_name: str
150+
) -> list[tuple[str, str | None]]:
149151
"""Generate multiple manifests, grouping samples by index multiplicity and length,
150152
adding PhiX controls of appropriate lengths as needed.
151153
"""
@@ -158,30 +160,27 @@ def get_manifests(process: Process, manifest_root_name: str) -> list[tuple[str,
158160

159161
# Assert lanes
160162
lanes = [art_out.location[1].split(":")[0] for art_out in arts_out]
161-
lanes.sort()
162163
assert set(lanes) == {"1"} or set(lanes) == {
163164
"1",
164165
"2",
165166
}, "Expected a single-lane or dual-lane flowcell."
166167

167168
# Iterate over pool / lane
168169
sample_rows = []
169-
for pool, lane in zip(arts_out, lanes):
170+
for pool, lane in sorted(zip(arts_out, lanes), key=lambda x: x[1]):
170171
# Get sample-label linkage via database
171172
sample2label: dict[str, str] = get_pool_sample_label_mapping(pool)
172173
assert len(set(pool.reagent_labels)) == len(pool.reagent_labels), (
173-
"Detected non-unique reagent labels."
174+
f"Detected non-unique reagent labels in lane {lane}"
174175
)
175176

176177
# Record PhiX UDFs for each output artifact
177-
phix_loaded: bool = pool.udf["% phiX"] != 0
178-
phix_set_name = pool.udf.get("Element PhiX Set", None)
179-
if phix_loaded:
180-
assert phix_set_name is not None, (
181-
"PhiX controls loaded but no kit specified."
182-
)
178+
phix_loaded: float = pool.udf.get("% phiX", 0)
179+
phix_set_name: str = pool.udf.get("Element PhiX Set", "")
180+
if phix_loaded != 0:
181+
assert phix_set_name != "", "PhiX controls loaded but no kit specified."
183182
else:
184-
assert phix_set_name is None, "PhiX controls specified but not loaded."
183+
assert phix_set_name == "", "PhiX controls specified but not loaded."
185184

186185
# Collect rows for each sample
187186
for sample in pool.samples:
@@ -274,7 +273,7 @@ def get_manifests(process: Process, manifest_root_name: str) -> list[tuple[str,
274273
check_distances(rows_to_check)
275274

276275
# Start building manifests
277-
manifests: list[tuple[str, str]] = []
276+
manifests: list[tuple[str, str | None]] = []
278277
for manifest_type in ["untrimmed", "trimmed", "phix", "empty"]:
279278
manifest_name, manifest_contents = make_manifest(
280279
df_samples_and_controls,
@@ -292,7 +291,13 @@ def make_manifest(
292291
process: Process,
293292
manifest_root_name: str,
294293
manifest_type: str,
295-
) -> tuple[str, str]:
294+
) -> tuple[str, str | None]:
295+
logging.info(f"Building {manifest_type} manifest...")
296+
297+
# Get the index cycles from the step fields
298+
idx1_cycles = int(process.udf.get("Index Read 1"))
299+
idx2_cycles = int(process.udf.get("Index Read 2"))
300+
296301
# Make copy of input df and subset columns to include in manifest
297302
df = df_samples_and_controls[
298303
[
@@ -319,26 +324,36 @@ def make_manifest(
319324
]
320325
)
321326

322-
settings_section = "\n".join(
323-
[
324-
"[SETTINGS]",
325-
"SettingName, Value",
326-
]
327-
)
328-
327+
# Build the [SAMPLES] section of the manifest, depending on the manifest type.
329328
if manifest_type == "untrimmed":
330329
samples_section = f"[SAMPLES]\n{df.to_csv(index=None, header=True)}"
331330

332-
elif manifest_type == "trimmed":
333-
min_idx1_len = df["Index1"].apply(len).min()
334-
min_idx2_len = df["Index2"].apply(len).min()
335-
df["Index1"] = df["Index1"].apply(lambda x: x[:min_idx1_len])
336-
df["Index2"] = df["Index2"].apply(lambda x: x[:min_idx2_len])
337-
338-
samples_section = f"[SAMPLES]\n{df.to_csv(index=None, header=True)}"
331+
elif manifest_type in ["trimmed", "phix"]:
332+
if manifest_type == "phix":
333+
# Subset to PhiX controls
334+
df = df[df["Project"] == "Control"]
335+
336+
# Make index lengths conform to number of cycles
337+
for idx, cycles in zip(["Index1", "Index2"], [idx1_cycles, idx2_cycles]):
338+
# If there are any indexes shorter than the number of cycles
339+
if not df[df[idx].apply(len) < cycles].empty:
340+
for row in df[df[idx].apply(len) < cycles].to_dict(orient="records"):
341+
logging.error(
342+
f"'{row['SampleName']}' has {idx} '{row[idx]}' of length {len(row[idx])} shorter than {cycles} cycles."
343+
)
344+
logging.error(
345+
f"Could not generate {manifest_type} manifest because indexes are shorter than the number of index cycles. Skipping."
346+
)
347+
return (file_name, None)
348+
# If there are any indexes longer than the number of cycles
349+
if not df[df[idx].apply(len) > cycles].empty:
350+
# For each one, log how it's trimmed
351+
for row in df[df[idx].apply(len) > cycles].to_dict(orient="records"):
352+
logging.info(
353+
f"Trimming '{row['SampleName']}' {idx} '{row[idx]}' of length {len(row[idx])} to {cycles} cycles."
354+
)
355+
df[idx] = df[idx].apply(lambda x: x[:cycles])
339356

340-
elif manifest_type == "phix":
341-
df = df[df["Project"] == "Control"]
342357
samples_section = f"[SAMPLES]\n{df.to_csv(index=None, header=True)}"
343358

344359
elif manifest_type == "empty":
@@ -347,13 +362,82 @@ def make_manifest(
347362
else:
348363
raise AssertionError("Invalid manifest type.")
349364

365+
settings_section = "\n".join(
366+
[
367+
"[SETTINGS]",
368+
"SettingName, Value",
369+
]
370+
)
371+
372+
# Customize mismatch thresholds, if necessary
373+
if manifest_type not in ["untrimmed", "empty"]:
374+
try:
375+
logging.info(
376+
f"Getting custom mismatch thresholds for {manifest_type} manifest..."
377+
)
378+
i1_mismatch, i2_mismatch = get_custom_mistmatch_thresholds(df)
379+
except AssertionError as e:
380+
logging.error(e)
381+
logging.error(
382+
f"Could not generate {manifest_type} manifest without index collisions. Skipping."
383+
)
384+
return (file_name, None)
385+
386+
settings_section += "\n" + "\n".join(
387+
[
388+
f"I1MismatchThreshold, {i1_mismatch}",
389+
f"I2MismatchThreshold, {i2_mismatch}",
390+
]
391+
)
392+
393+
# Write manifest
350394
manifest_contents = "\n\n".join(
351395
[runValues_section, settings_section, samples_section]
352396
)
353397

354398
return (file_name, manifest_contents)
355399

356400

401+
def get_custom_mistmatch_thresholds(df: pd.DataFrame) -> tuple[int, int]:
402+
# Defaults, according to Element documentation
403+
i1MismatchThreshold = 1
404+
i2MismatchThreshold = 1
405+
406+
# Collect distances
407+
idx1_dists = []
408+
idx2_dists = []
409+
total_dists = []
410+
# Iterate across all sample pairings per lane
411+
for lane in df["Lane"].unique():
412+
df_lane = df[df["Lane"] == lane]
413+
df_lane.reset_index(drop=True, inplace=True)
414+
for i in range(0, len(df_lane)):
415+
for j in range(i + 1, len(df_lane)):
416+
# TODO skip NNN
417+
idx1_dist = distance(df_lane["Index1"][i], df_lane["Index1"][j])
418+
idx2_dist = distance(df_lane["Index2"][i], df_lane["Index2"][j])
419+
420+
# Collect distances between all sample pairings on index and index-pair level
421+
idx1_dists.append(idx1_dist)
422+
idx2_dists.append(idx2_dist)
423+
total_dists.append(idx1_dist + idx2_dist)
424+
425+
if min(total_dists) == 0:
426+
raise AssertionError("Total index distance of 0 detected.")
427+
if min(idx1_dists) <= 2:
428+
logging.warning(
429+
"Minimum distance between Index1 sequences is at or below 2. Reducing allowed mismatches from 1 to 0."
430+
)
431+
i1MismatchThreshold = 0
432+
if min(idx2_dists) <= 2:
433+
logging.warning(
434+
"Minimum distance between Index2 sequences is at or below 2. Reducing allowed mismatches from 1 to 0."
435+
)
436+
i2MismatchThreshold = 0
437+
438+
return (i1MismatchThreshold, i2MismatchThreshold)
439+
440+
357441
def check_distances(rows: list[dict], threshold=2) -> None:
358442
"""Iterator function to check index distances between all pairs of samples."""
359443
for i in range(len(rows)):
@@ -459,19 +543,18 @@ def main(args: Namespace):
459543
manifest_root_name = f"AVITI_run_manifest_{flowcell_id}_{process.id}_{TIMESTAMP}_{process.technician.name.replace(' ', '')}"
460544

461545
# Create manifest(s)
462-
manifests: list[tuple[str, str]] = get_manifests(process, manifest_root_name)
546+
manifests: list[tuple[str, str | None]] = get_manifests(process, manifest_root_name)
463547

464-
# Write manifest(s)
465-
for file, content in manifests:
466-
open(file, "w").write(content)
467-
468-
# Zip manifest(s)
548+
# Write and zip manifest(s)
469549
zip_file = f"{manifest_root_name}.zip"
470-
files = [file for file, _ in manifests]
471550
with ZipFile(zip_file, "w") as zip_stream:
472-
for file in files:
473-
zip_stream.write(file)
474-
os.remove(file)
551+
for file, content in manifests:
552+
if content:
553+
open(file, "w").write(content)
554+
zip_stream.write(file)
555+
os.remove(file)
556+
else:
557+
logging.warning(f"Not writing {file} due to missing contents.")
475558

476559
# Upload manifest(s)
477560
logging.info("Uploading run manifest to LIMS...")

0 commit comments

Comments
 (0)