13
13
from genologics .config import BASEURI , PASSWORD , USERNAME
14
14
from genologics .entities import Process
15
15
from genologics .lims import Lims
16
- from Levenshtein import hamming as distance
16
+ from Levenshtein import distance
17
17
18
18
from data .Chromium_10X_indexes import Chromium_10X_indexes
19
19
from scilifelab_epps .epp import upload_file
@@ -145,7 +145,9 @@ def dict_to_manifest_col(d: dict) -> str:
145
145
return s
146
146
147
147
148
- def get_manifests (process : Process , manifest_root_name : str ) -> list [tuple [str , str ]]:
148
+ def get_manifests (
149
+ process : Process , manifest_root_name : str
150
+ ) -> list [tuple [str , str | None ]]:
149
151
"""Generate multiple manifests, grouping samples by index multiplicity and length,
150
152
adding PhiX controls of appropriate lengths as needed.
151
153
"""
@@ -158,30 +160,27 @@ def get_manifests(process: Process, manifest_root_name: str) -> list[tuple[str,
158
160
159
161
# Assert lanes
160
162
lanes = [art_out .location [1 ].split (":" )[0 ] for art_out in arts_out ]
161
- lanes .sort ()
162
163
assert set (lanes ) == {"1" } or set (lanes ) == {
163
164
"1" ,
164
165
"2" ,
165
166
}, "Expected a single-lane or dual-lane flowcell."
166
167
167
168
# Iterate over pool / lane
168
169
sample_rows = []
169
- for pool , lane in zip (arts_out , lanes ):
170
+ for pool , lane in sorted ( zip (arts_out , lanes ), key = lambda x : x [ 1 ] ):
170
171
# Get sample-label linkage via database
171
172
sample2label : dict [str , str ] = get_pool_sample_label_mapping (pool )
172
173
assert len (set (pool .reagent_labels )) == len (pool .reagent_labels ), (
173
- "Detected non-unique reagent labels. "
174
+ f "Detected non-unique reagent labels in lane { lane } "
174
175
)
175
176
176
177
# Record PhiX UDFs for each output artifact
177
- phix_loaded : bool = pool .udf ["% phiX" ] != 0
178
- phix_set_name = pool .udf .get ("Element PhiX Set" , None )
179
- if phix_loaded :
180
- assert phix_set_name is not None , (
181
- "PhiX controls loaded but no kit specified."
182
- )
178
+ phix_loaded : float = pool .udf .get ("% phiX" , 0 )
179
+ phix_set_name : str = pool .udf .get ("Element PhiX Set" , "" )
180
+ if phix_loaded != 0 :
181
+ assert phix_set_name != "" , "PhiX controls loaded but no kit specified."
183
182
else :
184
- assert phix_set_name is None , "PhiX controls specified but not loaded."
183
+ assert phix_set_name == "" , "PhiX controls specified but not loaded."
185
184
186
185
# Collect rows for each sample
187
186
for sample in pool .samples :
@@ -274,7 +273,7 @@ def get_manifests(process: Process, manifest_root_name: str) -> list[tuple[str,
274
273
check_distances (rows_to_check )
275
274
276
275
# Start building manifests
277
- manifests : list [tuple [str , str ]] = []
276
+ manifests : list [tuple [str , str | None ]] = []
278
277
for manifest_type in ["untrimmed" , "trimmed" , "phix" , "empty" ]:
279
278
manifest_name , manifest_contents = make_manifest (
280
279
df_samples_and_controls ,
@@ -292,7 +291,13 @@ def make_manifest(
292
291
process : Process ,
293
292
manifest_root_name : str ,
294
293
manifest_type : str ,
295
- ) -> tuple [str , str ]:
294
+ ) -> tuple [str , str | None ]:
295
+ logging .info (f"Building { manifest_type } manifest..." )
296
+
297
+ # Get the index cycles from the step fields
298
+ idx1_cycles = int (process .udf .get ("Index Read 1" ))
299
+ idx2_cycles = int (process .udf .get ("Index Read 2" ))
300
+
296
301
# Make copy of input df and subset columns to include in manifest
297
302
df = df_samples_and_controls [
298
303
[
@@ -319,26 +324,36 @@ def make_manifest(
319
324
]
320
325
)
321
326
322
- settings_section = "\n " .join (
323
- [
324
- "[SETTINGS]" ,
325
- "SettingName, Value" ,
326
- ]
327
- )
328
-
327
+ # Build the [SAMPLES] section of the manifest, depending on the manifest type.
329
328
if manifest_type == "untrimmed" :
330
329
samples_section = f"[SAMPLES]\n { df .to_csv (index = None , header = True )} "
331
330
332
- elif manifest_type == "trimmed" :
333
- min_idx1_len = df ["Index1" ].apply (len ).min ()
334
- min_idx2_len = df ["Index2" ].apply (len ).min ()
335
- df ["Index1" ] = df ["Index1" ].apply (lambda x : x [:min_idx1_len ])
336
- df ["Index2" ] = df ["Index2" ].apply (lambda x : x [:min_idx2_len ])
337
-
338
- samples_section = f"[SAMPLES]\n { df .to_csv (index = None , header = True )} "
331
+ elif manifest_type in ["trimmed" , "phix" ]:
332
+ if manifest_type == "phix" :
333
+ # Subset to PhiX controls
334
+ df = df [df ["Project" ] == "Control" ]
335
+
336
+ # Make index lengths conform to number of cycles
337
+ for idx , cycles in zip (["Index1" , "Index2" ], [idx1_cycles , idx2_cycles ]):
338
+ # If there are any indexes shorter than the number of cycles
339
+ if not df [df [idx ].apply (len ) < cycles ].empty :
340
+ for row in df [df [idx ].apply (len ) < cycles ].to_dict (orient = "records" ):
341
+ logging .error (
342
+ f"'{ row ['SampleName' ]} ' has { idx } '{ row [idx ]} ' of length { len (row [idx ])} shorter than { cycles } cycles."
343
+ )
344
+ logging .error (
345
+ f"Could not generate { manifest_type } manifest because indexes are shorter than the number of index cycles. Skipping."
346
+ )
347
+ return (file_name , None )
348
+ # If there are any indexes longer than the number of cycles
349
+ if not df [df [idx ].apply (len ) > cycles ].empty :
350
+ # For each one, log how it's trimmed
351
+ for row in df [df [idx ].apply (len ) > cycles ].to_dict (orient = "records" ):
352
+ logging .info (
353
+ f"Trimming '{ row ['SampleName' ]} ' { idx } '{ row [idx ]} ' of length { len (row [idx ])} to { cycles } cycles."
354
+ )
355
+ df [idx ] = df [idx ].apply (lambda x : x [:cycles ])
339
356
340
- elif manifest_type == "phix" :
341
- df = df [df ["Project" ] == "Control" ]
342
357
samples_section = f"[SAMPLES]\n { df .to_csv (index = None , header = True )} "
343
358
344
359
elif manifest_type == "empty" :
@@ -347,13 +362,82 @@ def make_manifest(
347
362
else :
348
363
raise AssertionError ("Invalid manifest type." )
349
364
365
+ settings_section = "\n " .join (
366
+ [
367
+ "[SETTINGS]" ,
368
+ "SettingName, Value" ,
369
+ ]
370
+ )
371
+
372
+ # Customize mismatch thresholds, if necessary
373
+ if manifest_type not in ["untrimmed" , "empty" ]:
374
+ try :
375
+ logging .info (
376
+ f"Getting custom mismatch thresholds for { manifest_type } manifest..."
377
+ )
378
+ i1_mismatch , i2_mismatch = get_custom_mistmatch_thresholds (df )
379
+ except AssertionError as e :
380
+ logging .error (e )
381
+ logging .error (
382
+ f"Could not generate { manifest_type } manifest without index collisions. Skipping."
383
+ )
384
+ return (file_name , None )
385
+
386
+ settings_section += "\n " + "\n " .join (
387
+ [
388
+ f"I1MismatchThreshold, { i1_mismatch } " ,
389
+ f"I2MismatchThreshold, { i2_mismatch } " ,
390
+ ]
391
+ )
392
+
393
+ # Write manifest
350
394
manifest_contents = "\n \n " .join (
351
395
[runValues_section , settings_section , samples_section ]
352
396
)
353
397
354
398
return (file_name , manifest_contents )
355
399
356
400
401
+ def get_custom_mistmatch_thresholds (df : pd .DataFrame ) -> tuple [int , int ]:
402
+ # Defaults, according to Element documentation
403
+ i1MismatchThreshold = 1
404
+ i2MismatchThreshold = 1
405
+
406
+ # Collect distances
407
+ idx1_dists = []
408
+ idx2_dists = []
409
+ total_dists = []
410
+ # Iterate across all sample pairings per lane
411
+ for lane in df ["Lane" ].unique ():
412
+ df_lane = df [df ["Lane" ] == lane ]
413
+ df_lane .reset_index (drop = True , inplace = True )
414
+ for i in range (0 , len (df_lane )):
415
+ for j in range (i + 1 , len (df_lane )):
416
+ # TODO skip NNN
417
+ idx1_dist = distance (df_lane ["Index1" ][i ], df_lane ["Index1" ][j ])
418
+ idx2_dist = distance (df_lane ["Index2" ][i ], df_lane ["Index2" ][j ])
419
+
420
+ # Collect distances between all sample pairings on index and index-pair level
421
+ idx1_dists .append (idx1_dist )
422
+ idx2_dists .append (idx2_dist )
423
+ total_dists .append (idx1_dist + idx2_dist )
424
+
425
+ if min (total_dists ) == 0 :
426
+ raise AssertionError ("Total index distance of 0 detected." )
427
+ if min (idx1_dists ) <= 2 :
428
+ logging .warning (
429
+ "Minimum distance between Index1 sequences is at or below 2. Reducing allowed mismatches from 1 to 0."
430
+ )
431
+ i1MismatchThreshold = 0
432
+ if min (idx2_dists ) <= 2 :
433
+ logging .warning (
434
+ "Minimum distance between Index2 sequences is at or below 2. Reducing allowed mismatches from 1 to 0."
435
+ )
436
+ i2MismatchThreshold = 0
437
+
438
+ return (i1MismatchThreshold , i2MismatchThreshold )
439
+
440
+
357
441
def check_distances (rows : list [dict ], threshold = 2 ) -> None :
358
442
"""Iterator function to check index distances between all pairs of samples."""
359
443
for i in range (len (rows )):
@@ -459,19 +543,18 @@ def main(args: Namespace):
459
543
manifest_root_name = f"AVITI_run_manifest_{ flowcell_id } _{ process .id } _{ TIMESTAMP } _{ process .technician .name .replace (' ' , '' )} "
460
544
461
545
# Create manifest(s)
462
- manifests : list [tuple [str , str ]] = get_manifests (process , manifest_root_name )
546
+ manifests : list [tuple [str , str | None ]] = get_manifests (process , manifest_root_name )
463
547
464
- # Write manifest(s)
465
- for file , content in manifests :
466
- open (file , "w" ).write (content )
467
-
468
- # Zip manifest(s)
548
+ # Write and zip manifest(s)
469
549
zip_file = f"{ manifest_root_name } .zip"
470
- files = [file for file , _ in manifests ]
471
550
with ZipFile (zip_file , "w" ) as zip_stream :
472
- for file in files :
473
- zip_stream .write (file )
474
- os .remove (file )
551
+ for file , content in manifests :
552
+ if content :
553
+ open (file , "w" ).write (content )
554
+ zip_stream .write (file )
555
+ os .remove (file )
556
+ else :
557
+ logging .warning (f"Not writing { file } due to missing contents." )
475
558
476
559
# Upload manifest(s)
477
560
logging .info ("Uploading run manifest to LIMS..." )
0 commit comments