@@ -290,6 +290,36 @@ def find_shared_dna(
290
290
291
291
All output is saved to the output directory as `CSV` or `PNG` files.
292
292
293
+ Notes
294
+ -----
295
+ The code is commented throughout to help describe the algorithm and its operation.
296
+
297
+ To summarize, the algorithm first computes the genetic distance in cMs between SNPs
298
+ common to all individuals using the specified genetic map.
299
+
300
+ Then, individuals are compared for whether they share one or two alleles for each SNP in
301
+ common; in this manner, where all individuals share one chromosome, for example, there
302
+ will be several SNPs in a row where at least one allele is shared between individuals for
303
+ each SNP. The ``cM_threshold`` is then applied to each of these "matching segments" to
304
+ determine whether the segment could be a potential shared DNA segment (i.e., whether each
305
+ segment has a cM value greater than the threshold).
306
+
307
+ The matching segments that passed the ``cM_threshold`` are then checked to see if they
308
+ are adjacent to another matching segment, and if so, the segments are stitched together,
309
+ and the single SNP separating the segments is flagged as potentially discrepant. (This
310
+ means that multiple smaller matching segments passing the ``cM_threshold`` could be
311
+ stitched, identifying the SNP between each pair of adjacent segments as discrepant.)
312
+
313
+ Next, the ``snp_threshold`` is applied to each segment to ensure there are enough SNPs in
314
+ the segment and the segment is not only a few SNPs in a region with a high recombination
315
+ rate; for each segment that passes this test, we have a segment of shared DNA, and the
316
+ total cMs for this segment are computed.
317
+
318
+ Finally, discrepant SNPs are checked to ensure that only SNPs internal to a shared DNA
319
+ segment are reported as discrepant (i.e., don't report a discrepant SNP if it was part of a
320
+ segment that didn't pass the ``snp_threshold``). Currently, no action other than reporting
321
+ is taken on discrepant SNPs.
322
+
293
323
Parameters
294
324
----------
295
325
individuals : iterable of Individuals
@@ -330,15 +360,18 @@ def find_shared_dna(
330
360
two_chrom_discrepant_snps (pandas.Index)
331
361
discrepant SNPs discovered while finding shared DNA on two chromosomes
332
362
"""
363
+ # initialize all objects to be returned to be empty to start
333
364
one_chrom_shared_dna = pd .DataFrame ()
334
365
two_chrom_shared_dna = pd .DataFrame ()
335
366
one_chrom_shared_genes = pd .DataFrame ()
336
367
two_chrom_shared_genes = pd .DataFrame ()
337
368
one_chrom_discrepant_snps = pd .Index ([])
338
369
two_chrom_discrepant_snps = pd .Index ([])
339
370
371
+ # ensure that all individuals have SNPs that are mapped relative to Build 37
340
372
self ._remap_snps_to_GRCh37 (individuals )
341
373
374
+ # return if there aren't enough individuals to compare
342
375
if len (individuals ) < 2 :
343
376
logger .warning ("find_shared_dna requires two or more individuals..." )
344
377
return self ._find_shared_dna_return_helper (
@@ -350,6 +383,7 @@ def find_shared_dna(
350
383
two_chrom_discrepant_snps ,
351
384
)
352
385
386
+ # load the specified genetic map (one genetic map for each chromosome)
353
387
genetic_map_dfs = self ._resources .get_genetic_map (genetic_map )
354
388
355
389
if len (genetic_map_dfs ) == 0 :
@@ -362,26 +396,34 @@ def find_shared_dna(
362
396
two_chrom_discrepant_snps ,
363
397
)
364
398
365
- cols = ["genotype{}" .format (str (i )) for i in range (len (individuals ))]
399
+ # generate a list of dynamically named columns for each individual's genotype
400
+ # (e.g., genotype0, genotype1, etc.)
401
+ cols = [f"genotype{ str (i )} " for i in range (len (individuals ))]
366
402
403
+ # set the reference SNPs to compare to be that of the first individual
367
404
df = individuals [0 ].snps
368
405
df = df .rename (columns = {"genotype" : cols [0 ]})
369
406
407
+ # build up a dataframe of SNPs that are common to all individuals
370
408
for i , ind in enumerate (individuals [1 :]):
371
409
# join SNPs for all individuals
372
410
df = df .join (ind .snps ["genotype" ], how = "inner" )
373
411
df = df .rename (columns = {"genotype" : cols [i + 1 ]})
374
412
413
+ # set a flag for whether one individual is male (i.e., only one chromosome match on the X
414
+ # chromosome is possible in the non-PAR region)
375
415
one_x_chrom = self ._is_one_individual_male (individuals )
376
416
417
+ # create tasks to compute the genetic distances (cMs) between each SNP on each chromosome
377
418
tasks = []
378
-
379
419
chroms_to_drop = []
380
420
for chrom in df ["chrom" ].unique ():
381
421
if chrom not in genetic_map_dfs .keys ():
382
422
chroms_to_drop .append (chrom )
383
423
continue
384
424
425
+ # each task requires the genetic map for the chromosome and the positions of all SNPs
426
+ # in common on that chromosome
385
427
tasks .append (
386
428
{
387
429
"genetic_map" : genetic_map_dfs [chrom ],
@@ -390,19 +432,24 @@ def find_shared_dna(
390
432
}
391
433
)
392
434
393
- # drop chromosomes without genetic distance data
435
+ # drop chromosomes without genetic distance data (e.g., chroms MT, PAR, etc.)
394
436
for chrom in chroms_to_drop :
395
437
df = df .drop (df .loc [df ["chrom" ] == chrom ].index )
396
438
397
- # determine the genetic distance between each SNP using the HapMap Phase II genetic map
439
+ # determine the genetic distance between each SNP using the specified genetic map
398
440
snp_distances = map (self ._compute_snp_distances , tasks )
399
441
snp_distances = pd .concat (snp_distances )
442
+
443
+ # extract the column "cM_from_prev_snp" from the result and add that to the dataframe
444
+ # of SNPs common to all individuals; now we have the genetic distance between each SNP
400
445
df ["cM_from_prev_snp" ] = snp_distances ["cM_from_prev_snp" ]
401
446
447
+ # now we apply a mask for whether all individuals match on one or two chromosomes...
448
+ # first, set all rows for these columns to True
402
449
df ["one_chrom_match" ] = True
403
450
df ["two_chrom_match" ] = True
404
-
405
- # determine where individuals share an allele on one chromosome
451
+ # determine where individuals share an allele on one chromosome (i.e., set to False when
452
+ # at least one allele doesn't match for all individuals)
406
453
for genotype1 , genotype2 in combinations (cols , 2 ):
407
454
df .loc [
408
455
~ df [genotype1 ].isnull ()
@@ -414,7 +461,8 @@ def find_shared_dna(
414
461
"one_chrom_match" ,
415
462
] = False
416
463
417
- # determine where individuals share alleles on both chromosomes
464
+ # determine where individuals share alleles on two chromosomes (i.e., set to False when
465
+ # two alleles don't match for all individuals)
418
466
for genotype1 , genotype2 in combinations (cols , 2 ):
419
467
df .loc [
420
468
~ df [genotype1 ].isnull ()
@@ -430,12 +478,14 @@ def find_shared_dna(
430
478
# genotype columns are no longer required for calculation
431
479
df = df .drop (cols , axis = 1 )
432
480
481
+ # find shared DNA on one chrom
433
482
one_chrom_shared_dna , one_chrom_discrepant_snps = self ._find_shared_dna_helper (
434
483
df [["chrom" , "pos" , "cM_from_prev_snp" , "one_chrom_match" ]],
435
484
cM_threshold ,
436
485
snp_threshold ,
437
486
one_x_chrom ,
438
487
)
488
+ # find shared DNA on two chroms
439
489
two_chrom_shared_dna , two_chrom_discrepant_snps = self ._find_shared_dna_helper (
440
490
df [["chrom" , "pos" , "cM_from_prev_snp" , "two_chrom_match" ]],
441
491
cM_threshold ,
@@ -646,7 +696,7 @@ def _is_one_individual_male(self, individuals):
646
696
return False
647
697
648
698
def _compute_snp_distances (self , task ):
649
- """ Compute genetic distance between SNPs.
699
+ """ Compute genetic distance in cMs between SNPs.
650
700
651
701
Parameters
652
702
----------
@@ -732,8 +782,8 @@ def _compute_shared_dna(self, task):
732
782
"two_chrom_match" ,
733
783
] = False
734
784
735
- # get consecutive strings of trues
736
- # http://stackoverflow.com/a/17151327
785
+ # get consecutive runs of True values, i.e., where there's a one- or two-chrom match between
786
+ # individuals, depending on the task; http://stackoverflow.com/a/17151327
737
787
a = df .loc [(df ["chrom" ] == chrom )][match_col ].values
738
788
a = np .r_ [a , False ]
739
789
a_rshifted = np .roll (a , 1 )
@@ -744,8 +794,10 @@ def _compute_shared_dna(self, task):
744
794
a_ends = np .nonzero (ends )[0 ]
745
795
a_ends = np .reshape (a_ends , (len (a_ends ), 1 ))
746
796
797
+ # get the matching segments
747
798
matches = np .hstack ((a_starts , a_ends ))
748
799
800
+ # compute total cMs for each matching segment
749
801
c = np .r_ [0 , df .loc [(df ["chrom" ] == chrom )]["cM_from_prev_snp" ].cumsum ()][
750
802
matches
751
803
]
0 commit comments