Merge pull request #94 from apriha/develop

apriha · web-flow · commit e58bd2bdf114 · 2021-04-14T23:46:09.000-07:00
v4.1.1
diff --git a/README.rst b/README.rst
@@ -117,8 +117,8 @@ Loading SNPs('resources/663.23andme.305.txt.gz')
 
 Now we can perform some analysis between the ``User662`` and ``User663`` datasets.
 
-Find Discordant SNPs
-''''''''''''''''''''
+`Find Discordant SNPs <https://lineage.readthedocs.io/en/latest/lineage.html#lineage.Lineage.find_discordant_snps>`_
+''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
 First, let's find discordant SNPs (i.e., SNP data that is not consistent with Mendelian
 inheritance):
 
@@ -136,8 +136,8 @@ the prompt, although the same output is available in the CSV file.
 
 Not counting mtDNA SNPs, there are 37 discordant SNPs between these two datasets.
 
-Find Shared DNA
-'''''''''''''''
+`Find Shared DNA <https://lineage.readthedocs.io/en/latest/lineage.html#lineage.Lineage.find_shared_dna>`_
+''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
 ``lineage`` uses the probabilistic recombination rates throughout the human genome from the
 `International HapMap Project <https://www.genome.gov/10001688/international-hapmap-project/>`_
 and the `1000 Genomes Project <https://www.internationalgenome.org>`_ to compute the shared DNA
@@ -180,8 +180,8 @@ details the shared segments of DNA on one chromosome and a plot that illustrates
 
 .. image:: https://raw.githubusercontent.com/apriha/lineage/master/docs/images/shared_dna_User662_User663_HapMap2.png
 
-Find Shared Genes
-'''''''''''''''''
+`Find Shared Genes <https://lineage.readthedocs.io/en/latest/lineage.html#lineage.Lineage.find_shared_dna>`_
+''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
 The `Central Dogma of Molecular Biology <https://en.wikipedia.org/wiki/Central_dogma_of_molecular_biology>`_
 states that genetic information flows from DNA to mRNA to proteins: DNA is transcribed into
 mRNA, and mRNA is translated into a protein. It's more complicated than this (it's biology
diff --git a/src/lineage/__init__.py b/src/lineage/__init__.py
@@ -290,6 +290,36 @@ def find_shared_dna(
 
         All output is saved to the output directory as `CSV` or `PNG` files.
 
+        Notes
+        -----
+        The code is commented throughout to help describe the algorithm and its operation.
+
+        To summarize, the algorithm first computes the genetic distance in cMs between SNPs
+        common to all individuals using the specified genetic map.
+
+        Then, individuals are compared for whether they share one or two alleles for each SNP in
+        common; in this manner, where all individuals share one chromosome, for example, there
+        will be several SNPs in a row where at least one allele is shared between individuals for
+        each SNP. The ``cM_threshold`` is then applied to each of these "matching segments" to
+        determine whether the segment could be a potential shared DNA segment (i.e., whether each
+        segment has a cM value greater than the threshold).
+
+        The matching segments that passed the ``cM_threshold`` are then checked to see if they
+        are adjacent to another matching segment, and if so, the segments are stitched together,
+        and the single SNP separating the segments is flagged as potentially discrepant. (This
+        means that multiple smaller matching segments passing the ``cM_threshold`` could be
+        stitched, identifying the SNP between each segment as discrepant.)
+
+        Next, the ``snp_threshold`` is applied to each segment to ensure there are enough SNPs in
+        the segment and the segment is not only a few SNPs in a region with a high recombination
+        rate; for each segment that passes this test, we have a segment of shared DNA, and the
+        total cMs for this segment are computed.
+
+        Finally, discrepant SNPs are checked to ensure that only SNPs internal to a shared DNA
+        segment are reported as discrepant (i.e., don't report a discrepant SNP if it was part of a
+        segment that didn't pass the ``snp_threshold``). Currently, no action other than reporting
+        is taken on discrepant SNPs.
+
         Parameters
         ----------
         individuals : iterable of Individuals
@@ -330,15 +360,18 @@ def find_shared_dna(
             two_chrom_discrepant_snps (pandas.Index)
                 discrepant SNPs discovered while finding shared DNA on two chromosomes
         """
+        # initialize all objects to be returned to be empty to start
         one_chrom_shared_dna = pd.DataFrame()
         two_chrom_shared_dna = pd.DataFrame()
         one_chrom_shared_genes = pd.DataFrame()
         two_chrom_shared_genes = pd.DataFrame()
         one_chrom_discrepant_snps = pd.Index([])
         two_chrom_discrepant_snps = pd.Index([])
 
+        # ensure that all individuals have SNPs that are mapped relative to Build 37
         self._remap_snps_to_GRCh37(individuals)
 
+        # return if there aren't enough individuals to compare
         if len(individuals) < 2:
             logger.warning("find_shared_dna requires two or more individuals...")
             return self._find_shared_dna_return_helper(
@@ -350,6 +383,7 @@ def find_shared_dna(
                 two_chrom_discrepant_snps,
             )
 
+        # load the specified genetic map (one genetic map for each chromosome)
         genetic_map_dfs = self._resources.get_genetic_map(genetic_map)
 
         if len(genetic_map_dfs) == 0:
@@ -362,26 +396,34 @@ def find_shared_dna(
                 two_chrom_discrepant_snps,
             )
 
-        cols = ["genotype{}".format(str(i)) for i in range(len(individuals))]
+        # generate a list of dynamically named columns for each individual's genotype
+        # (e.g., genotype0, genotype1, etc).
+        cols = [f"genotype{str(i)}" for i in range(len(individuals))]
 
+        # set the reference SNPs to compare to be that of the first individual
         df = individuals[0].snps
         df = df.rename(columns={"genotype": cols[0]})
 
+        # build-up a dataframe of SNPs that are common to all individuals
         for i, ind in enumerate(individuals[1:]):
             # join SNPs for all individuals
             df = df.join(ind.snps["genotype"], how="inner")
             df = df.rename(columns={"genotype": cols[i + 1]})
 
+        # set a flag for if one individuals is male (i.e., only one chromosome match on the X
+        # chromosome is possible in the non-PAR region)
         one_x_chrom = self._is_one_individual_male(individuals)
 
+        # create tasks to compute the genetic distances (cMs) between each SNP on each chromosome
         tasks = []
-
         chroms_to_drop = []
         for chrom in df["chrom"].unique():
             if chrom not in genetic_map_dfs.keys():
                 chroms_to_drop.append(chrom)
                 continue
 
+            # each task requires the genetic map for the chromosome and the positions of all SNPs
+            # in common on that chromosome
             tasks.append(
                 {
                     "genetic_map": genetic_map_dfs[chrom],
@@ -390,19 +432,24 @@ def find_shared_dna(
                 }
             )
 
-        # drop chromosomes without genetic distance data
+        # drop chromosomes without genetic distance data (e.g., chroms MT, PAR, etc.)
         for chrom in chroms_to_drop:
             df = df.drop(df.loc[df["chrom"] == chrom].index)
 
-        # determine the genetic distance between each SNP using the HapMap Phase II genetic map
+        # determine the genetic distance between each SNP using the specified genetic map
         snp_distances = map(self._compute_snp_distances, tasks)
         snp_distances = pd.concat(snp_distances)
+
+        # extract the column "cM_from_prev_snp" from the result and add that to the dataframe
+        # of SNPs common to all individuals; now we have the genetic distance between each SNP
         df["cM_from_prev_snp"] = snp_distances["cM_from_prev_snp"]
 
+        # now we apply a mask for whether all individuals match on one or two chromosomes...
+        # first, set all rows for these columns to True
         df["one_chrom_match"] = True
         df["two_chrom_match"] = True
-
-        # determine where individuals share an allele on one chromosome
+        # determine where individuals share an allele on one chromosome (i.e., set to False when
+        # at least one allele doesn't match for all individuals)
         for genotype1, genotype2 in combinations(cols, 2):
             df.loc[
                 ~df[genotype1].isnull()
@@ -414,7 +461,8 @@ def find_shared_dna(
                 "one_chrom_match",
             ] = False
 
-        # determine where individuals share alleles on both chromosomes
+        # determine where individuals share alleles on two chromosomes (i.e., set to False when
+        # two alleles don't match for all individuals)
         for genotype1, genotype2 in combinations(cols, 2):
             df.loc[
                 ~df[genotype1].isnull()
@@ -430,12 +478,14 @@ def find_shared_dna(
         # genotype columns are no longer required for calculation
         df = df.drop(cols, axis=1)
 
+        # find shared DNA on one chrom
         one_chrom_shared_dna, one_chrom_discrepant_snps = self._find_shared_dna_helper(
             df[["chrom", "pos", "cM_from_prev_snp", "one_chrom_match"]],
             cM_threshold,
             snp_threshold,
             one_x_chrom,
         )
+        # find shared DNA on two chroms
         two_chrom_shared_dna, two_chrom_discrepant_snps = self._find_shared_dna_helper(
             df[["chrom", "pos", "cM_from_prev_snp", "two_chrom_match"]],
             cM_threshold,
@@ -646,7 +696,7 @@ def _is_one_individual_male(self, individuals):
         return False
 
     def _compute_snp_distances(self, task):
-        """ Compute genetic distance between SNPs.
+        """ Compute genetic distance in cMs between SNPs.
 
         Parameters
         ----------
@@ -732,8 +782,8 @@ def _compute_shared_dna(self, task):
                 "two_chrom_match",
             ] = False
 
-        # get consecutive strings of trues
-        # http://stackoverflow.com/a/17151327
+        # get consecutive strings of Trues, for where there's a one or two chrom match between
+        # individuals, depending on the task; http://stackoverflow.com/a/17151327
         a = df.loc[(df["chrom"] == chrom)][match_col].values
         a = np.r_[a, False]
         a_rshifted = np.roll(a, 1)
@@ -744,8 +794,10 @@ def _compute_shared_dna(self, task):
         a_ends = np.nonzero(ends)[0]
         a_ends = np.reshape(a_ends, (len(a_ends), 1))
 
+        # get the matching segments
         matches = np.hstack((a_starts, a_ends))
 
+        # compute total cMs for each matching segment
         c = np.r_[0, df.loc[(df["chrom"] == chrom)]["cM_from_prev_snp"].cumsum()][
             matches
         ]
diff --git a/src/lineage/resources.py b/src/lineage/resources.py
@@ -133,6 +133,7 @@ def get_genetic_map(self, genetic_map):
 
     def get_genetic_map_HapMapII_GRCh37(self):
         """ Get International HapMap Consortium HapMap Phase II genetic map for Build 37.
+        [#InternationalHapMapConsortium]_ [#Auton2010]_
 
         Returns
         -------
@@ -141,10 +142,10 @@ def get_genetic_map_HapMapII_GRCh37(self):
 
         References
         ----------
-        1. "The International HapMap Consortium (2007).  A second generation human haplotype
-           map of over 3.1 million SNPs.  Nature 449: 851-861."
-        2. "The map was generated by lifting the HapMap Phase II genetic map from build 35 to
-           GRCh37. The original map was generated using LDhat as described in the 2007 HapMap
+        .. [#InternationalHapMapConsortium] "The International HapMap Consortium (2007).  A second generation human
+           haplotype map of over 3.1 million SNPs.  Nature 449: 851-861."
+        .. [#Auton2010] "The map was generated by lifting the HapMap Phase II genetic map from build
+           35 to GRCh37. The original map was generated using LDhat as described in the 2007 HapMap
            paper (Nature, 18th Sept 2007). The conversion from b35 to GRCh37 was achieved using
            the UCSC liftOver tool. Adam Auton, 08/12/2010"
         """
@@ -157,11 +158,12 @@ def get_genetic_map_HapMapII_GRCh37(self):
         return self._genetic_map
 
     def get_genetic_map_1000G_GRCh37(self, pop):
-        """ Get population-specific 1000 Genomes Project genetic map.
+        """ Get population-specific 1000 Genomes Project genetic map. [#Auton2013]_ [#Auton2015]_
 
         Notes
         -----
-        From `README_omni_recombination_20130507 <ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130507_omni_recombination_rates/README_omni_recombination_20130507>`_:
+        From `README_omni_recombination_20130507
+        <ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130507_omni_recombination_rates/README_omni_recombination_20130507>`_ [#Auton2013]_ :
 
             Genetic maps generated from the 1000G phased OMNI data.
 
@@ -185,9 +187,6 @@ def get_genetic_map_1000G_GRCh37(self, pop):
             estimate of 4Ne, allowing the population based rates to be converted to
             per-generation rates.
 
-            Adam Auton (adam.auton@einstein.yu.edu)
-            May 7th, 2013
-
         Returns
         -------
         dict
@@ -196,11 +195,11 @@ def get_genetic_map_1000G_GRCh37(self, pop):
 
         References
         ----------
-        1. Adam Auton (adam.auton@einstein.yu.edu), May 7th, 2013
-        2. The 1000 Genomes Project Consortium., Corresponding authors., Auton, A. et al. A
-           global reference for human genetic variation. Nature 526, 68–74 (2015).
+        .. [#Auton2013] Adam Auton, May 7th, 2013
+        .. [#Auton2015] The 1000 Genomes Project Consortium., Corresponding authors.,
+           Auton, A. et al. A global reference for human genetic variation. Nature 526, 68–74 (2015).
            https://doi.org/10.1038/nature15393
-        3. https://github.com/joepickrell/1000-genomes-genetic-maps
+        .. [#Pickrell] https://github.com/joepickrell/1000-genomes-genetic-maps
         """
         if self._genetic_map_name != pop:
             self._genetic_map = self._load_genetic_map_1000G_GRCh37(
@@ -415,7 +414,7 @@ def _load_cytoBand(filename):
         df : pandas.DataFrame
             cytoBand table
         """
-        # adapted from chromosome plotting code (see [1]_)
+        # adapted from chromosome plotting code (see get_cytoBand_hg19 Ref. 1.)
         df = pd.read_csv(
             filename, names=["chrom", "start", "end", "name", "gie_stain"], sep="\t"
         )
diff --git a/src/lineage/visualization.py b/src/lineage/visualization.py
@@ -2,11 +2,11 @@
 
 Notes
 -----
-Adapted from Ryan Dale's GitHub Gist for plotting chromosome features. [1]_
+Adapted from Ryan Dale's GitHub Gist for plotting chromosome features. [#Dale]_
 
 References
 ----------
-.. [1] Ryan Dale, GitHub Gist,
+.. [#Dale] Ryan Dale, GitHub Gist,
    https://gist.github.com/daler/c98fc410282d7570efc3#file-ideograms-py
 
 """