Skip to content

Commit

Permalink
kdetrees: added untared directory for remotes::install_github("vinues…
Browse files Browse the repository at this point in the history
…a/get_phylomarkers/kdetrees" ...)
  • Loading branch information
vinuesa committed Apr 2, 2024
1 parent 07286cf commit c3f06ae
Show file tree
Hide file tree
Showing 62 changed files with 4,657 additions and 0 deletions.
4 changes: 4 additions & 0 deletions kdetrees/ChangeLog
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
0.1.4 --- Bugfixes
- the function kdetrees.complete does not work because “infile” in the arguments is then called “file” in the code.
- function “browser()” in the kdetrees function should be removed. It pauses the execution. I believe it was there for debugging matter?
- in the vignette, top of page 2, “blibrary(ape)” should be “library(ape)”
23 changes: 23 additions & 0 deletions kdetrees/DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
Package: kdetrees
Type: Package
Title: Nonparametric method for identifying discordant phylogenetic
trees
Version: 0.1.5
Date: 2014-05-21
Author: Grady Weyenberg and Ruriko Yoshida
Maintainer: Grady Weyenberg <[email protected]>
Description: A non-parametric method for identifying potential
outlying observations in a collection of phylogenetic trees based
on the methods of Owen and Provan (2011). Such discordant trees
may indicate problems with sequence annotation or tree
reconstruction, or they may represent interesting biological
phenomena, such as horizontal gene transfers.
License: GPL-2
Depends: R (>= 2.15.1)
Imports: ape, distory, ggplot2
URL: http://github.com/grady/kdetrees
LazyData: TRUE
Packaged: 2014-05-21 20:18:16 UTC; grady
NeedsCompilation: no
Repository: CRAN
Date/Publication: 2014-05-30 01:09:11
61 changes: 61 additions & 0 deletions kdetrees/MD5
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
c332010cf6c02c1712b4a37db22acb72 *ChangeLog
280805007b9070fc45652703a024c5aa *DESCRIPTION
0f260d4787b5544e9dcf8fcf67d98af6 *NAMESPACE
a5fd8a2f29349a0f832909ed44a0e765 *R/bw.R
a92fda4ebca62b5d069d918f294c9aa0 *R/dist.diss.R
e0258371d68ff74289607c24232c1a75 *R/kde.R
8c70a373a4bb0f4c407e94ddd0e506b5 *R/kdetrees-package.R
812e2ee45b6b00ee2525067fa07c8323 *R/kernels.R
e886c049a9fd4d2ceead8f68bee51ab9 *R/util.R
c9d8c0a88514fbf82b64099a9716a223 *build/vignette.rds
8bcc286927c1edcd873d1011779f7f79 *data/apicomplexa.rda
bb91457cbc3f6ada07991d8a33456cda *inst/CITATION
ece82cb71a8af3d6944110b2968986c9 *inst/doc/kdetrees.R
8d0c626df1bf0dcce371fd80aa05dfa9 *inst/doc/kdetrees.Rnw
a6f54e7f4fc2f9e12c7ec0d3908d206a *inst/doc/kdetrees.pdf
8e3ef0b808c846ed84f262b92cfc8537 *inst/doc/simulation.R
a0a7cc1738a8f911afb8e83564bb985d *inst/doc/simulation.Rnw
6b339d99f26c1b802c171d2cf9bf6d2c *inst/doc/simulation.pdf
f8995a58826ebfa00dce7df785eb64ea *man/apicomplexa.Rd
e3cdea30668024e1f6737d035cdc4dac *man/as.data.frame.kdetrees.Rd
4a4f37c2b8ed61a20c94eaeaca5940f8 *man/as.matrix.multiPhylo.Rd
10432d11dc6cbc3f4b8d56dc5e3e2015 *man/bw.nn.Rd
b6995382a1fb79d152c6b5ee4ca3e68b *man/dist.diss.Rd
3123ef771f5902ed90081b405607eef4 *man/estimate.Rd
938ad2a0626acf5ac8bc0eea461b5303 *man/hist.kdetrees.Rd
5f326151183979c5b34fd5aa64d477c2 *man/kdetrees-package.Rd
d15f63adbd87a2c12422ee423ddfa36b *man/kdetrees.Rd
1059d2e97d9f4ea6e24d7c78d73a5de4 *man/kdetrees.complete.Rd
10915d55dd9d6a1d923f8b6c3a5572a9 *man/normkern.Rd
68c7bfb0fa33ab5f1348225abc740af2 *man/plot.kdetrees.Rd
40c1cf8ff0847740ae3b7dd3b34ebf04 *man/print.kdetrees.Rd
ea47d5f8fec82002e068b9af9844992d *vignettes/img/bench.pdf
41626af8f158319df365806883254a00 *vignettes/img/roc.pdf
1346734622605adc83e5994fb53b301a *vignettes/img/rocmix.pdf
904bc1b84844c8c09335af344eeb290b *vignettes/img/tpr.pdf
6e32bcc34922f5253680d9f6ca9a61e1 *vignettes/img/tprBW.pdf
8d0c626df1bf0dcce371fd80aa05dfa9 *vignettes/kdetrees.Rnw
6fac019e1ad28167d1e2ae3f13b8538e *vignettes/references.bib
f3b08a54e016addaa313c5859ed7a932 *vignettes/sim/bench.Rda
f8f9ba60975f5511ca035e6e5de5887f *vignettes/sim/coalescent.nex
8e86494b545ac540d2d36120b5c00352 *vignettes/sim/deftprmix.Rda
e9b92d0fd8eb4ae73927253f5637073b *vignettes/sim/genetrees.tre
a11aac5b05b957800a53d0c71e43747c *vignettes/sim/mix2.Rda
7f38da7f58b9d6d8553bd87ea96c49e7 *vignettes/sim/my-pmcoa.R
25597c5f00ddc750978ffc71479ab1d7 *vignettes/sim/result.Rda
558329580b92d577212c7c422663d967 *vignettes/sim/results.Rda
08c36438abe008e0e13bfbf7770b4de4 *vignettes/sim/rocmix.Rda
39b9f571afbf94f7585419d5ce31acfc *vignettes/sim/rocresult.Rda
998f8214be533a533dacdc2af002f4d8 *vignettes/sim/rocsim.Rda
df6e8d158a54d5e16b4a0c2ae2ca6f95 *vignettes/sim/set8-all.pdf
6df0d11aa4eaefe8e2e514ec44e9a671 *vignettes/sim/set8-outliers.pdf
f86ccd56f460b432968a146491b1efeb *vignettes/sim/sims.rda
cb6e43b435a33188a763c16bbc3b5de3 *vignettes/sim/species.nex
7a59a0c62cb79b07d65d9e417437a387 *vignettes/sim/species1k.nex
5d60a029442b74603eba304aef67b914 *vignettes/sim/species2.nex
e7ac7f00cf05c75228e60ae074790bde *vignettes/sim/species5.nex
cc9ea3f6f41202d03775308e1629234a *vignettes/sim/tpr.Rda
3f59478fec860fc284bfe05e365f6782 *vignettes/sim/tprmix.Rda
85455890a491f0aa2dc5a0e6a2648302 *vignettes/sim/tprmix1.Rda
7afa3b19c17d1409616cf2cd2cc65f64 *vignettes/sim/treesim.py
a0a7cc1738a8f911afb8e83564bb985d *vignettes/simulation.Rnw
19 changes: 19 additions & 0 deletions kdetrees/NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Generated by roxygen2 (4.0.0): do not edit by hand

S3method(as.data.frame,kdetrees)
S3method(as.matrix,multiPhylo)
S3method(hist,kdetrees)
S3method(plot,kdetrees)
S3method(print,kdetrees)
export(bw.nn)
export(dist.diss)
export(kdetrees)
export(kdetrees.complete)
import(ggplot2)
importFrom(ape,compute.brlen)
importFrom(ape,cophenetic.phylo)
importFrom(ape,read.tree)
importFrom(ape,root)
importFrom(ape,unroot)
importFrom(ape,write.tree)
importFrom(distory,dist.multiPhylo)
22 changes: 22 additions & 0 deletions kdetrees/R/bw.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
### Copyright (C) 2014 -- Grady Weyenberg ###

##' nearest-neighbor adaptive bandwidth selection
##'
##' For each row in pairwise distance matrix find the distance to the
##' closest prop fraction of trees.
##' @param x pairwise distance matrix
##' @param prop fraction of data to define the local neighborhood
##' @param tol tolerance for zero-bandwidth check
##' @return a vector of bandwidths for each tree (row) in x
##' @author Grady Weyenberg
##' @export
##' @examples
##' dm <- as.matrix(dist.diss(apicomplexa[1:20]))
##' bw.nn(dm)
bw.nn <- function(x,prop=0.2,tol=1e-6){
out <- apply(x,1,function(y) quantile(y,prop))
is.zero <- out < tol
if(sum(is.zero)>0) out[is.zero] <- apply(x[is.zero,],1,function(y) min(y[y>tol]))
out
}

68 changes: 68 additions & 0 deletions kdetrees/R/dist.diss.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
### Copyright (C) 2014 -- Grady Weyenberg ###

##' dissimilarity map tree vectorization
##'
##' Dissimilarity maps convert trees to vectors using tip-to-tip path
##' lengths. Branch length information may be optionally discarded
##' (the default), resulting in vectors based solely on tree
##' topology.
##'
##' @param x an ape::multiPhylo object.
##' @param ... additional options for \code{ape::cophenetic.phylo}
##' @return a row matrix of tree vectors
##' @author Grady Weyenberg
##' @method as.matrix multiPhylo
##' @export
##' @examples
##' as.matrix(apicomplexa[1:5])
as.matrix.multiPhylo <- function(x,...){
tip.labels <- unique(unlist(lapply(x,"[[","tip.label")))

##Unroot the trees (is there any reason not to do this?)
##if(unroot){ x <- lapply(x,unroot) }
##Set branch lengths to 1
#if(!use.blen){ x <- lapply(x, compute.brlen, method = 1) }
##find tip-tip distances for a tree, if tips are missing, add NA
tip2tip <- function(y){
missing.tips <- setdiff(tip.labels,y$tip.label)
o <- cophenetic(y,...)
o <- rbind(o,matrix(nrow=length(missing.tips),ncol=ncol(o),dimnames=list(missing.tips)))
o <- cbind(o,matrix(nrow=nrow(o),ncol=length(missing.tips),dimnames=list(rownames(o),missing.tips)))
o[tip.labels,tip.labels][upper.tri(o)]
}
cnames <- outer(tip.labels, tip.labels, paste, sep="-")
##convert multiPhylo to a row matrix of tip distances
out <- t(sapply(x,tip2tip))
colnames(out) <- cnames[upper.tri(cnames)]
if(is.null(rownames(out))) rownames(out) <- paste("tree", 1:nrow(out), sep="")
out
}

##' Compute pairwise tree distances
##'
##'
##'
##' @param x either a row matrix of tree vectors, or a multiPhylo object
##' @param ... additional arguments passed to as.matrix.multiPhylo
##' @param method option passed to dist
##' @param p option passed to dist
##' @seealso dist
##' @return a dist object with tree-to-tree distances
##' @author Grady Weyenberg
##' @export
##' @examples
##' dist.diss(apicomplexa[1:5])
dist.diss <- function(x,...,method="euclidean",p=2){
##pairwise tree distances
if(inherits(x,"multiPhylo")) d <- as.matrix.multiPhylo(x,...)
##missing tip imputation
if (any(is.na(d))) {
cm <- apply(d,2,median,na.rm=TRUE)
if (any(is.na(cm)))
stop("There are some species which never appear in the same tree: ",
paste(names(cm)[is.na(cm)], collapse=", "))
for (j in 1:ncol(d)) { d[is.na(d[,j]),j] <- cm[j] }
warning("Tip labels were not the same for all trees, missing values have been imputed.")
}
dist(d,method=method,p=p)
}
129 changes: 129 additions & 0 deletions kdetrees/R/kde.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
### Copyright (C) 2014 -- Grady Weyenberg ###

##' Analyze a set of phylogenetic trees and attempt to identify trees
##' which are significantly discordant with other trees in the sample
##' (outlier trees).
##'
##' If bw is a single number, it will be used as a single constant
##' bandwidth. It can also be a vector, in which case it will be used
##' as variable bandwidths for each tree, repectively. Finally, if it
##' is a list (default), the list will be passed as arguments to the bw.nn
##' adaptive bandwith function.
##'
##' ... Is passed to either \code{distory::dist.multiPhylo} or
##' \code{dist.diss}, as appropriate. See the help for these functions
##' for more details.
##'
##' @title Identify discordant trees in a sample
##' @param trees multiPhylo object
##' @param k IQR multiplier for outlier detection
##' @param distance Select "geodesic" or "dissimilarity" distance
##' calculation method
##' @param outgroup if a character, reroot all trees with this species
##' as outgroup. The geodesic distance method requires rooted trees.
##' @param topo.only set all branch lengths to 1 before analyzing?
##' @param bw see Details
##' @param greedy greedy outlier detection?
##' @param ... additional arguments for distance calculation function, see details
##' @return a kdetrees object; list(density,outliers)
##' @author Grady Weyenberg
##' @export
##' @examples
##' kdeobj <- kdetrees(apicomplexa)
##' print(kdeobj)
##' kdeobj$outliers
##'
##' kdetrees(apicomplexa, k=2.0, distance="dissimilarity",topo.only=FALSE)
kdetrees <- function(trees,k=1.5,distance=c("geodesic","dissimilarity"), outgroup=NULL,
topo.only=FALSE,bw=list(),greedy=FALSE,...) {
distance <- match.arg(distance)
if (!inherits(trees,"multiPhylo") && all(sapply(trees,inherits,"phylo"))) class(trees) <- "multiPhylo"

if (topo.only) {
trees <- lapply(trees,compute.brlen, method = 1)
class(trees) <- "multiPhylo"
}

if (is.character(outgroup)) {
trees <- lapply(trees,root,outgroup,resolve.root=TRUE)
trees <- lapply(trees,"[<-","node.label", NULL)
class(trees) <- "multiPhylo"
}

dm <- switch(distance,
geodesic = as.matrix(dist.multiPhylo(trees,...)),
dissimilarity = as.matrix(dist.diss(trees,...)))
dimnames(dm) <- list(names(trees),names(trees))

cutoff <- function(x, c = 1.5){
qs <- quantile(x, c(0.25,0.75))
unname(diff(qs) * -c + qs[1])
}

if(is.list(bw)) bw <- do.call(bw.nn,c(list(dm),bw))
km <- normkern(dm,bw)
x <- estimate(km)
c <- cutoff(x, k)
i <- which( x < c )
if (greedy) {
while(TRUE){
i <- which( x < c )
if (length(i) < 1) break
x <- estimate(km,i)
c2 <- cutoff(x[-i], k)
## if(is.na(c2)) browser()
if(c2 > c) c <- c2 else break
}
}
structure(list(density=x, i=i, outliers=trees[i]), class="kdetrees",
call=match.call(), c=c)
}


##' Performs a complete kdetrees analysis, starting with reading trees
##' from a newick file on disk, and writing result files to the
##' working directory. Names and location of output files may be
##' controlled by optional arguments.
##'
##' @title Complete kdetrees analysis convenience function
##' @param infile newick file with trees
##' @param ... additional parameters for kdetrees
##' @param treeoutfile write outlier trees in newick format to this file
##' @param csvfile write density results to this file
##' @param plotfile print scatterplot of results to this file
##' @param histfile print histogram of density estimates to this file
##' @return result of kdetrees call
##' @author Grady Weyenberg
##' @export
kdetrees.complete <- function(infile,...,treeoutfile="outliers.tre",
csvfile="results.csv",plotfile="plot.png",
histfile="hist.png"){
trees <- read.tree(infile)
if (is.null(names(trees))) names(trees) <- paste("tree",seq_along(trees),sep="")
if (!inherits(trees,"multiPhylo")) stop("Could not read tree file")

res <- kdetrees(trees,...)
if (is.character(plotfile)) ggsave(plotfile,plot(res)) #scatterplot
if (is.character(histfile)) ggsave(histfile,hist(res)) #histogram
if (is.character(csvfile)) write.csv(as.data.frame(res),csvfile) #csv
if (is.character(treeoutfile) && length(res$outliers) > 0)
write.tree(res$outliers, treeoutfile, tree.names=TRUE,digits=5) #out-trees

res
}

##' estimate densities from kernel matrix
##'
##' @param x matrix of kernel contributions
##' @param i vector of columns to exclude from calculation
##' @return vector of density estimates for each tree
##' @author Grady Weyenberg
estimate <- function(x,i=integer()){
if(length(i) > 0)
rowSums(x[,-i]) - (!(1:nrow(x) %in% i)) * diag(x)
else
rowSums(x) - diag(x)
}



20 changes: 20 additions & 0 deletions kdetrees/R/kdetrees-package.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
### Copyright (C) 2014 -- Grady Weyenberg ###

##' kdetrees
##'
##' @docType package
##' @name kdetrees-package
##' @importFrom ape cophenetic.phylo unroot compute.brlen root write.tree read.tree
##' @importFrom distory dist.multiPhylo
##' @import ggplot2
NULL


##' Apicomplexa gene trees sample data set.
##'
##' @name apicomplexa
##' @docType data
##' @format a multiPhylo object with 268 phylogenetic trees
##' @examples
##' kdetrees(apicomplexa)
NULL
24 changes: 24 additions & 0 deletions kdetrees/R/kernels.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
### Copyright (C) 2014 -- Grady Weyenberg ###
### Kernel functions are defined here. These functions have a template
### function(x,bw,...)
### they should be symmetric about zero and should integrate to a constant
### value for all positive bw.

##' Generalized Gaussian kernel
##'
##' The un-normalized Gaussian kernel function: exp(-(abs(x/bw))^delta)/bw
##'
##' The bandwidth parameter may be used in any way that makes sense in
##' the above R expression. In particular, it may be a single value,
##' for a constant bandwidth, or a vector, with each element
##' corresponding the bandwidth of the kernel to be placed at each
##' respective observation.
##'
##' @param x places to evaluate kernel
##' @param bw bandwidth values
##' @param delta shape parameter for kernel
##' @return an object of the same type as x with the kernel evaluations
##' @author Grady Weyenberg
normkern <- function(x, bw=1.0, delta=2L)
exp(-abs(x/bw)^delta) / bw

Loading

0 comments on commit c3f06ae

Please sign in to comment.