|
| 1 | +--- |
| 2 | +interact_link: notebooks/coalescent/r.ipynb |
| 3 | +title: 'R' |
| 4 | +permalink: 'chapters/coalescent/r' |
| 5 | +previouschapter: |
| 6 | + url: chapters/coalescent/intro |
| 7 | + title: 'Simple coalescent model' |
| 8 | +nextchapter: |
| 9 | + url: chapters/applications |
| 10 | + title: 'Applications' |
| 11 | +redirect_from: |
| 12 | + - 'chapters/coalescent/r' |
| 13 | +--- |
| 14 | + |
| 15 | +# Kingman coalescent and the Newick format |
| 16 | + |
| 17 | +Authors: |
| 18 | +- Alex Zarebski @aezarebski |
| 19 | +- Gerry Tonkin-Hill @gtonkinhill |
| 20 | + |
| 21 | +Date: 2018-10-03 |
| 22 | + |
| 23 | +In this notebook we implement the Kingman coalescent and implement some functions for working with trees inspired by Newick format. Newick format is a widely used way to represent tree data structures. Having the genealogy in Newick format makes it easy to read into `ape` --- a popular package in R for working with genealogies --- and use the visualisation functionality it provides. |
| 24 | + |
| 25 | + |
| 26 | +{:.input_area} |
| 27 | +```R |
| 28 | +library(ape) |
| 29 | +``` |
| 30 | + |
| 31 | +## Model and implementation |
| 32 | + |
| 33 | +Given we have `k` copies of the gene in a population of size `pop_size` the probability of at least one pair coming from the same parent is *approximately* `0.25 * k * (k - 1) / pop_size`. Using discrete generations would suggest a geometric number of generations until the first coalescence where the probability of coalesence in each generation is this value. We can approximate the geometric distribution with an exponential distribution with this rate. |
| 34 | + |
| 35 | + |
| 36 | +{:.input_area} |
| 37 | +```R |
| 38 | +coalescent_rate <- function(k, pop_size) { |
| 39 | + 0.25 * k * (k - 1) / pop_size |
| 40 | +} |
| 41 | +``` |
| 42 | + |
| 43 | +The following functions, `leaf_node` and `branch_node` are helpers to work with trees. |
| 44 | + |
| 45 | + |
| 46 | +{:.input_area} |
| 47 | +```R |
| 48 | +leaf_node <- function(name, time) { |
| 49 | + list(type = "leaf", name = name, time = time) |
| 50 | +} |
| 51 | +``` |
| 52 | + |
| 53 | + |
| 54 | +{:.input_area} |
| 55 | +```R |
| 56 | +branch_node <- function(name, children, time) { |
| 57 | + list(type = "branch", |
| 58 | + name = name, |
| 59 | + children = children, |
| 60 | + time = time, |
| 61 | + lengths = c(time - children[[1]]$time, |
| 62 | + time - children[[2]]$time)) |
| 63 | +} |
| 64 | +``` |
| 65 | + |
| 66 | +Taking two nodes and linking them as the children of a parent is achieved with the following function. |
| 67 | + |
| 68 | + |
| 69 | +{:.input_area} |
| 70 | +```R |
| 71 | +binary_parent <- function(child1, child2, time) { |
| 72 | + parent_name <- paste(child1$name, child2$name, sep = "-") |
| 73 | + branch_node(parent_name, list(child1, child2), time) |
| 74 | +} |
| 75 | +``` |
| 76 | + |
| 77 | +Start by setting up a little sample population in a larger population to work on |
| 78 | + |
| 79 | + |
| 80 | +{:.input_area} |
| 81 | +```R |
| 82 | +current_time <- 0 |
| 83 | +sampled_population <- list(leaf_node("beth", current_time), |
| 84 | + leaf_node("gerry", current_time), |
| 85 | + leaf_node("morty", current_time), |
| 86 | + leaf_node("summer", current_time)) |
| 87 | +population_size <- 100 |
| 88 | + |
| 89 | +k <- +Inf |
| 90 | +``` |
| 91 | + |
| 92 | +Until the population has multiple individuals who have not coalesed continue to coalese individuals. |
| 93 | + |
| 94 | + |
| 95 | +{:.input_area} |
| 96 | +```R |
| 97 | +while (k > 2) { |
| 98 | + k <- length(sampled_population) |
| 99 | + coalescent_time <- rexp(1, coalescent_rate(k, population_size)) |
| 100 | + current_time <- current_time + coalescent_time |
| 101 | + ixs <- sample.int(k, size = 2) |
| 102 | + parent_node <- binary_parent(sampled_population[[ixs[1]]], sampled_population[[ixs[2]]], current_time) |
| 103 | + if (k > 2) { |
| 104 | + sampled_population <- c(list(parent_node), sampled_population[-ixs]) |
| 105 | + } else { |
| 106 | + sampled_population <- parent_node |
| 107 | + } |
| 108 | +} |
| 109 | + |
| 110 | +``` |
| 111 | + |
| 112 | +The following function recursively constructs a Newick representation of the tree. |
| 113 | + |
| 114 | + |
| 115 | +{:.input_area} |
| 116 | +```R |
| 117 | +newick_helper <- function(node) { |
| 118 | + if (node$type == "leaf") { |
| 119 | + node$name |
| 120 | + } else if (node$type == "branch") { |
| 121 | + sprintf("(%s:%f,%s:%f)%s", |
| 122 | + newick_helper(node$children[[1]]), |
| 123 | + node$lengths[1], |
| 124 | + newick_helper(node$children[[2]]), |
| 125 | + node$lengths[2], |
| 126 | + node$name) |
| 127 | + } |
| 128 | +} |
| 129 | + |
| 130 | +newick <- function(node) { |
| 131 | + sprintf("%s;", newick_helper(node)) |
| 132 | +} |
| 133 | +``` |
| 134 | + |
| 135 | + |
| 136 | +{:.input_area} |
| 137 | +```R |
| 138 | +demo_tree <- read.tree(text = newick(sampled_population)) |
| 139 | +``` |
| 140 | + |
| 141 | + |
| 142 | +{:.input_area} |
| 143 | +```R |
| 144 | +plot(demo_tree) |
| 145 | +``` |
| 146 | + |
| 147 | + |
| 148 | + |
| 149 | + |
0 commit comments