distory/0000755000176200001440000000000013647011422011747 5ustar liggesusersdistory/NAMESPACE0000644000176200001440000000064313646353016013200 0ustar liggesusersuseDynLib(distory, .registration = TRUE) export(bethe.tree, bin.multiPhylo, boot.samples.idxs, convert.table.to.idx, dist.multiPhylo, distinct.edges, edge.from.split, get.bipartition, gromov.hyperbolicity, lookup.samples, mcmc.target.seq, orthant.boundary.tree, partition.leaves, phylo.diff, position.leverage) importFrom(graphics, par, plot) importFrom(stats, as.dist, runif) import(ape) distory/LICENSE0000644000176200001440000000013613646553520012765 0ustar liggesusersYEAR: 2010 COPYRIGHT HOLDER: John Chakerian and Susan Holmes ORGANIZATION: Stanford Universitydistory/man/0000755000176200001440000000000013646743247012542 5ustar liggesusersdistory/man/gromov.hyperbolicity.Rd0000644000176200001440000000345613644314000017212 0ustar liggesusers\name{gromov.hyperbolicity} \Rdversion{1.1} \alias{gromov.hyperbolicity} \title{Gromov Hyperbolicity Constant} \description{ Computes the Gromov Hyperbolicity Constant of a distance matrix. } \usage{ gromov.hyperbolicity(d, deltas = FALSE, scale = NA) } \arguments{ \item{d}{ A distance matrix of type \code{dist} or \code{matrix}, or anything that can be coerced into \code{dist} by \code{as.dist}. Must have at least 4 points.} \item{deltas}{ A logical value specifying whether to return the vector of delta values. Default is \code{FALSE}.} \item{scale}{ Specifies a scaling method for each delta. Default is no scaling (NA or "none"). Available methods are "max" which scales deltas by the max of the sums computed, and "perimeter" for the largest perimeter of the four points.} } \details{ This computes a constant that represents the relaxation of a 4-point condition for delta-hyperbolicity. See (Gromov 1987) for details. } \value{ The Gromov hyperbolicity constant of the given distance matrix. } \author{John Chakerian} \seealso{\code{\link{dist.multiPhylo}}} \references{ M. Gromov. \emph{Hyperbolic groups}. In Essays in Group Theory, pages 73--263. Springer, New York, 1987. Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating Phylogenetic and Heirarchical Clustering Trees. arXiv:1006.1015v1. } \examples{ # scale final delta by max distance points <- cbind(runif(100), runif(100)) d <- dist(points) gromov.hyperbolicity(d)/max(d) # scale each delta by max distance for the 4 points points <- cbind(runif(100), runif(100)) d <- dist(points) gromov.hyperbolicity(d, scale="max") # scale each delta by the max perimeter for the 4 points points <- cbind(runif(100), runif(100)) d <- dist(points) gromov.hyperbolicity(d, scale="max") } \keyword{manip} distory/man/dist.multiPhylo.Rd0000644000176200001440000001101313646550142016123 0ustar liggesusers\name{dist.multiPhylo} \Rdversion{1.1} \alias{dist.multiPhylo} \title{Geodesic Distance Between Phylogenetic Trees} \description{ Computes the geodesic distance of a list of phylogenetic trees using a polynomial algorithm. } \usage{ dist.multiPhylo(x, method = "geodesic", force.multi2di = FALSE, outgroup = NULL, convert.multifurcating = FALSE, use.random.resolution = FALSE, scale = NULL, verbose = FALSE) } \arguments{ \item{x}{ A list of ape trees (class 'phylo'). The list does not have to be of class 'multiPhylo'. The function will also accept a list of strings of trees in Newick format, or a single string with trees in Newick format separated by semicolons. All the trees must have the same tip labels.} \item{method}{ Determines which distance method is used. Options are 'geodesic' for the tree space geodesic distance, or 'edgeset' for the number of edges (defined by splits of tips) that are different.} \item{force.multi2di}{ Force conversion of every tree to strict bifurcating through the ape function 'multi2di', using the use.random.resolution as its parameter. This option should not be used in conjunction with specification of an outgroup.} \item{outgroup}{ Specifies an outgroup to root each tree with respect to. This calls the ape function 'root' on every tree in the list.} \item{convert.multifurcating}{ Setting this option will check every tree for multifurcations using the ape function 'is.binary.tree' - if it returns FALSE, the ape function 'multi2di' will be called on it. Note that this does not ensure a tree is strictly binary, since ape considers an unrooted tree binary even if the root node is trifurcating. This option can be used in conjunction with specification of an outgroup.} \item{use.random.resolution}{ Specifies the parameter to 'multi2di' if needed.} \item{scale}{ Specifies a scale to make all trees unformly scaled (that is, the sum of all edges will be uniform)scale to make all trees unformly scaled (that is, the sum of all edge lengths will be uniform). The parameter can either be a tree of class \code{phylo} or a numeric value for the sum of all edge lengths.} \item{verbose}{ Turns on incremental status updates and more warnings. Helpful for large computations.} } \details{ This function computes the geodesic distance according to Billera et. al. using an algorithm based off of the polynomial time algorithm of Owen and Provan. Since it corresponds to a formal definition of tree-space as a space of strictly binary trees, no mulifurcations are allowed, including on the root node. In addition, negative and 0-lengthed edges are clamped to a very small value (DBL_MIN) for technical reasons. The Newick parser supports only a subset of the Newick format. In particular, it does not at the moment allow for internal node labels, only weights. Weights will be automatically set to 1 if not specified. It may be necessary to clean data in ape to make the trees conform to this. } \value{ Returns a distance matrix of class 'dist' representing the pairwise geodesic distances between all input trees. Keep in mind this distance matrix is not Euclidean. N/A values are provided in the case of an error in determining the distance. } \references{ Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating Phylogenetic and Heirarchical Clustering Trees. arXiv:1006.1015v1. Billera, L. J., Holmes, S. P., and Vogtmann, K. (2001) Geometry of the space of phylogenetic trees. \emph{Advances in Applied Mathematics}, \bold{27}, 733--767. Megan Owen and J. Scott Provan (2010) A fast algorithm for computing geodesic distances in tree space. \emph{IEEE/ACM Transactions on Computational Biology and Bioinformatics}, 14 Jan. 2010. } \author{John Chakerian} \seealso{ \code{\link[ape]{dist.dna}}, \code{\link[ape]{boot.phylo}}, \code{\link[stats]{cmdscale}} } \examples{ data(woodmouse) otree <- root(nj(dist.dna(woodmouse)), "No305", resolve.root=TRUE) breps <- 250 trees <- boot.phylo(otree, woodmouse, B=breps, function(x) root(nj(dist.dna(x)), "No305", resolve.root=TRUE), trees = TRUE) combined.trees <- c(list(otree), trees$trees) tree.dists <- dist.multiPhylo(combined.trees) mdres <- cmdscale(tree.dists, k=breps, add=TRUE) plot(mdres$points[,1], mdres$points[,2], col = c("red", rep("black", breps))) text(mdres$points[,1], mdres$points[,2], labels = 1:(breps + 1), cex = 0.7, adj = c(0, 2)) } \keyword{manip} distory/man/orthant.boundary.tree.Rd0000644000176200001440000000314713644314516017264 0ustar liggesusers\name{orthant.boundary.tree} \Rdversion{1.1} \alias{orthant.boundary.tree} \title{Orthant Boundary Tree} \description{ Produces a degenerate tree on the boundary between trees that differ by one split. } \usage{ orthant.boundary.tree(x,y) } \arguments{ \item{x}{The tree in the first orthant.} \item{y}{The tree in the second orthant.} } \details{ The tree found is the tree on the boundary between the two orthants such that it is on the straight line connecting the two trees when one orthant is thought of as being the (-,+) quadrant and the second orthant as being the (+,+) quadrant, where the (0,y) line is the particular boundary in question. } \value{ Returns an object of class 'phylo' representing the boundary tree. } \author{John Chakerian} \references{ Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating Phylogenetic and Heirarchical Clustering Trees. arXiv:1006.1015v1. } \seealso{\code{\link{mcmc.target.seq}}} \examples{ data(woodmouse) otree <- root(fastme.ols(dist.dna(woodmouse)), "No305", resolve.root=TRUE) breps <- 200 trees <- boot.phylo(otree, woodmouse, B=breps, function(x) root(fastme.ols(dist.dna(x)), "No305", resolve.root=TRUE), trees = TRUE) combined.trees <- c(list(otree), trees$trees) binning <- bin.multiPhylo(combined.trees) tree.a <- combined.trees[[match(1, binning)]] i <- 2 max.bin <- max(binning) tree.b <- combined.trees[[match(2, binning)]] while(length(distinct.edges(tree.a,tree.b)) > 1 && i < max.bin) { i = i + 1 tree.b = combined.trees[[match(i, binning)]] } plot(orthant.boundary.tree(tree.a, tree.b)) } \keyword{manip} distory/man/mcmc.target.seq.Rd0000644000176200001440000000700213646502111016002 0ustar liggesusers\name{mcmc.target.seq} \Rdversion{1.1} \alias{mcmc.target.seq} \alias{boot.samples.idxs} \alias{lookup.samples} \alias{convert.table.to.idx} \title{Find MCMC Target Sequence} \description{ \code{mcmc.target.seq} uses MCMC to find a configuration of DNA positions to get as close as possible to a given tree. \code{boot.samples.idxs} bootstraps over indices into a DNA matrix. \code{lookup.samples} goes from an index representation of a configuration of DNA to the actual DNAbin format. \code{convert.table.to.idx} converts a table of counts for positions 1..n into a list of indices corresponding to positions (i.e. goes from the tabled form to a vector whose tabling matches the input). } \usage{ mcmc.target.seq(data, x, F, n) boot.samples.idxs(data, B = 100, block = 1) lookup.samples(data, idxs) convert.table.to.idx(T) } \arguments{ \item{data}{A DNA matrix in DNAbin format.} \item{x}{A tree of class 'phylo' to estimate.} \item{F}{A tree estimation function, accepting a DNA matrix in DNAbin format and returning a tree of class 'phylo.'} \item{n}{The number of MCMC iterations to perform.} \item{B}{ The number of bootstrap replicates.} \item{block}{The block size to use during bootstrapping.} \item{idxs}{A list of numeric vectors of indices to use for lookup.} \item{T}{A table or table-like vector to convert.} } \details{ \code{mcmc.target.seq} performs an MCMC with simulated annealing to locate a configuration of DNA positions from the original matrix that gets as close as possible to a target tree. Propositions for the MCMC replacing one character with another uniformly at random. The remaining functions are intended to be used as support functions. } \value{ \code{mcmc.target.seq} returns a list of 4 elements: a numeric vector of counts of each position in the original matrix, the best estimated tree, the best distance from the estimated tree to the target tree, and a numeric vector of the distances for every iteration of the simulation. \code{boot.samples.idxs} returns a numeric vector representing the bootstrapped idices. \code{lookup.samples} returns a list of objects of class DNAbin corresponding to the DNA sequences generated from indices into the original DNA matrix. \code{convert.table.to.idx} returns a numeric vector of indices based on the table counts. } \author{John Chakerian} \references{ Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating Phylogenetic and Heirarchical Clustering Trees. arXiv:1006.1015v1. } \seealso{ \code{\link{dist.multiPhylo}}, \code{\link{orthant.boundary.tree}} } \examples{ \dontrun{ ## This example has been excluded from checks: ## copy/paste the code to try it data(woodmouse) otree <- root(fastme.ols(dist.dna(woodmouse)), "No305", resolve.root=TRUE) breps <- 200 trees <- boot.phylo(otree, woodmouse, B=breps, function(x) root(fastme.ols(dist.dna(x)), "No305", resolve.root=TRUE), trees = TRUE) combined.trees <- c(list(otree), trees$trees) binning <- bin.multiPhylo(combined.trees) tree.a <- combined.trees[[match(1, binning)]] i <- 2 max.bin <- max(binning) tree.b <- combined.trees[[match(2, binning)]] while(length(distinct.edges(tree.a,tree.b)) > 1 && i < max.bin) { i = i + 1 tree.b = combined.trees[[match(i, binning)]] } bdy.tree <- orthant.boundary.tree(tree.a, tree.b) f.est <- function(x) root(nj(dist.dna(x)), "No305", resolve.root=TRUE) res <- mcmc.target.seq(woodmouse, bdy.tree, f.est, 1000) par(mfrow=c(2,1)) plot(res$tree) plot(res$vals) }} \keyword{manip} distory/man/bin.multiPhylo.Rd0000644000176200001440000000250613644312474015741 0ustar liggesusers\name{bin.multiPhylo} \Rdversion{1.1} \alias{bin.multiPhylo} \title{Bin Trees} \description{ Bins trees according to branching topology. } \usage{ bin.multiPhylo(treelist) } \arguments{ \item{treelist}{A list of trees that can be passed to dist.phylo (see the help for dist.phylo for acceptable formats).} } \details{ Bins trees according to branching topology. Two trees are considered to have the same topology if the same set of partitions of tips are produced by the edges, which corresponds to the same branching up to rearrangement of tips. } \value{ Returns a numeric vector of bin ids. Bin ids are assigned in order of the first tree in that bin, that is, the first k unique trees in the list passed will be assigned bins 1..k in order of appearance. } \author{John Chakerian} \references{ Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating Phylogenetic and Heirarchical Clustering Trees. arXiv:1006.1015v1. } \seealso{\code{\link{dist.multiPhylo}}} \examples{ data(woodmouse) otree <- root(fastme.ols(dist.dna(woodmouse)), "No305", resolve.root=TRUE) breps <- 500 trees <- boot.phylo(otree, woodmouse, B=breps, function(x) root(fastme.ols(dist.dna(x)), "No305", resolve.root=TRUE),trees=TRUE) combined.trees <- c(list(otree), trees$trees) bin.multiPhylo(combined.trees) } \keyword{manip} distory/man/position.leverage.Rd0000644000176200001440000000353713644315142016461 0ustar liggesusers\name{position.leverage} \Rdversion{1.1} \alias{position.leverage} \title{Position Leverage} \description{ Provides a rough heuristic for determining the degree to which each position in the DNA matrix affects the resulting tree. } \usage{ position.leverage(data, F, to = NULL, rep = 50, by = 1) } \arguments{ \item{data}{A DNA matrix in \code{DNAbin} format.} \item{F}{ A tree estimation function, accepting a DNA matrix of class \code{DNAbin} and returning a tree of class \code{phylo}.} \item{to}{ The tree with which distances are measured in respect to, or \code{NULL} to indicate the tree estimated by \code{F} for the starting DNA matrix.} \item{rep}{ The number of times to replicate the position in question.} \item{by}{ The function will perform the calculation on every \code{by}-th position (that is, on \code{seq(1, N, by)}).}} \details{ This function takes a DNA matrix and, for every \code{by}-th position, replicates that position \code{rep} times, randomly removing \code{rep} other positions to keep all sequences the same length other positions to keep all sequences the same length. For each new DNA matrix created in this way, \code{F} is used to estimate the corresponding tree, and the distance to tree \code{to} is computed and stored. This distance can be thought of as somewhat analogous to the leverage of that position. } \value{ Returns a numeric vector of distances from tree \code{to} for each position sampled. } \author{John Chakerian} \references{ Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating Phylogenetic and Heirarchical Clustering Trees. arXiv:1006.1015v1. } \seealso{\code{\link{dist.multiPhylo}}} \examples{ data(woodmouse) f.est <- function(x) root(nj(dist.dna(x)), "No305", resolve.root = TRUE) position.leverage(woodmouse, f.est, by = 10) } \keyword{manip} distory/man/phylo.diff.Rd0000644000176200001440000000465013644316231015063 0ustar liggesusers\name{phylo.diff} \Rdversion{1.1} \alias{phylo.diff} \alias{distinct.edges} \alias{edge.from.split} \alias{get.bipartition} \alias{partition.leaves} \title{Differences Between Phylogenetic Trees} \description{ A family of functions for determining and plotting the differences between two trees. \code{phylo.diff} plots two trees side by side, highlighting edges unique to each tree in red. \code{distinct.edges} finds the edges present in the first argument not in the second. \code{edge.from.split} locates the edge id from a given split. \code{get.bipartition} gets the bipartition of tips formed by a single edge. \code{partition.leaves} returns the set of all bipartitions from all edges. } \usage{ phylo.diff(x, y, \dots) distinct.edges(x, y) edge.from.split(x, split) get.bipartition(x, e) partition.leaves(x) } \arguments{ \item{x}{The first (or only) tree.} \item{y}{The second tree, for the functions that accept two trees.} \item{split}{A list of bipartitions, probably from \code{partition.leaves}.} \item{e}{An edge for a particular tree, given as an id.} \item{\dots}{Additional arguments to pass to the \code{plot.phylo} function.} } \details{ \code{phylo.diff} uses the ape tree plotting function. The other functions are mostly meant as support functions. } \value{ \code{phylo.diff} returns invisible. \code{distinct.edges} returns a numeric vector of edge ids for the first tree. \code{edge.from.split} returns an edge id for a particular tree corresponding to a given bipartition and NA if none such edge exists. \code{get.bipartition} returns a character vector of the tips below that edge in the given tree. \code{partition.leaves} returns a list of partitions (themselves character vectors) of the given tree. } \author{John Chakerian} \references{ Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating Phylogenetic and Heirarchical Clustering Trees. arXiv:1006.1015v1. } \seealso{\code{\link{dist.multiPhylo}}} \examples{ data(woodmouse) otree <- root(fastme.ols(dist.dna(woodmouse)), "No305", resolve.root=TRUE) breps <- 10 trees <- boot.phylo(otree, woodmouse, B=breps, function(x) root(fastme.ols(dist.dna(x)), "No305", resolve.root=TRUE), trees = TRUE) combined.trees <- c(list(otree), trees$trees) binning <- bin.multiPhylo(combined.trees) phylo.diff(combined.trees[[match(1, binning)]], combined.trees[[match(2, binning)]]) } \keyword{manip} distory/man/distory-package.Rd0000644000176200001440000000232513644313577016116 0ustar liggesusers\name{distory-package} \Rdversion{1.1} \alias{distory-package} \alias{distory} \docType{package} \title{Distance Between Phylogenetic Histories} \description{ The \pkg{distory} package provides functions for computing geodesic distances between phylogenetic trees, as well as functions which use this distance. Methods for computing Gromov delta-hyperbolicity, Markov Chain Monte Carlo routines in tree space, and per-position leverage for DNA sequences are included. } \details{ A description of the algorithm used for the distance computation can be found in \code{\link{dist.multiPhylo}}. } \author{ John Chakerian and Susan Holmes } \references{ Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating Phylogenetic and Heirarchical Clustering Trees. arXiv:1006.1015v1. Billera, L. J., Holmes, S. P., and Vogtmann, K. (2001) Geometry of the space of phylogenetic trees. \emph{Advances in Applied Mathematics}, \bold{27}, 733_-767. Megan Owen and J. Scott Provan (2010) A fast algorithm for computing geodesic distances in tree space. \emph{IEEE/ACM Transactions on Computational Biology and Bioinformatics}, 14 Jan. 2010. } \keyword{package} distory/man/bethe.tree.Rd0000644000176200001440000000216513644312557015054 0ustar liggesusers\name{bethe.tree} \Rdversion{1.1} \alias{bethe.tree} \title{Bethe Tree} \description{ Generates a Bethe tree with given tips, inner edge lengths, and outgroup. } \usage{ bethe.tree(tips, level.lengths = NULL, outgroup="O", outgroup.dist=1) } \arguments{ \item{tips}{ A list of tip names as a character vector. Should be a power of 2. All tip names must be distinct.} \item{level.lengths}{ Edge lengths for each level, counted from the bottom up. NULL means a default of 1. If the vector isn't long enough, the last value will be repeated as necessary.} \item{outgroup}{The tip label for the outgroup.} \item{outgroup.dist}{The distance of the outgroup from the root.} } \details{ Generates a Bethe tree with specified internal edge lengths. } \value{ A class of type \code{phylo} representing the tree. } \author{John Chakerian} \references{ Chakerian, J. and Holmes, S. P. Computational Tools for Evaluating Phylogenetic and Heirarchical Clustering Trees. arXiv:1006.1015v1. } \seealso{\code{\link{dist.multiPhylo}}} \examples{ plot(bethe.tree(as.character(1:16), 1:4, "17", 14)) } \keyword{datagen} distory/DESCRIPTION0000644000176200001440000000122713647011422013457 0ustar liggesusersPackage: distory Version: 1.4.4 Date: 2020-04-19 Title: Distance Between Phylogenetic Histories Author: John Chakerian [aut, cre], Susan Holmes [aut, cre] Maintainer: Emmanuel Paradis Depends: ape (>= 5.0) Imports: graphics, stats Description: Geodesic distance between phylogenetic trees and associated functions. The theoretical background of 'distory' is published in Billera et al. (2001) "Geometry of the space of phylogenetic trees." . License: BSD_3_clause + file LICENSE NeedsCompilation: yes Packaged: 2020-04-19 03:33:59 UTC; paradis Repository: CRAN Date/Publication: 2020-04-19 09:00:02 UTC distory/src/0000755000176200001440000000000013646743247012556 5ustar liggesusersdistory/src/phydist_r.cpp0000644000176200001440000002166313062442436015263 0ustar liggesusers/* * Polynomial Time Distance Algorithm for Binary Rooted Phylogenetic Trees * * Note: Algorithm based on presentation by Scott Provan / Megan Owen, * "Computing Geodesic Distances in Tree Space in Polynomial Time" * * John Chakerian * chakj@stanford.edu * * Last edited: Feb 28 2010 * */ #include #include #include #include #include "treedist.h" #include #include #include // for fmax2 using namespace std; int compute_phylo_distance_matrix(vector trees_in, bool verbose, double *m); void build_tree_list(std::vector &trees_in, std::vector &trees, bool verbose); double gromov_graycode(double *m, size_t n, double* deltas, int scale); inline bool edge_less_than(const PhyEdge &a, const PhyEdge &b) { for(unsigned int i = 0; i < a.split.size(); i++) { if(a.split[i] < b.split[i]) return true; if(b.split[i] < a.split[i]) return false; } // if we're here, they're equal, so a is NOT less than b. return false; } inline int edgeset_difference(const PhyEdgeSet &a, const PhyEdgeSet &b) { int sim = 0; for(unsigned int i = 0; i < a.size(); i++) { for(unsigned int j = 0; j < a.size(); j++) { if(a[i] == b[j]) { sim++; break; } } } return a.size() - sim; } extern "C" { // returns a distance matrix of doubles SEXP phycpp_compute_tree_distance_set(SEXP trees, SEXP verbose) { SEXP distmatrix; bool be_verbose = asLogical(verbose); // get number of trees int len = length(trees); vector treevec(len); // convert 'trees' to a vector of tree strings for(int i = 0; i < len; i++) treevec[i] = CHAR(STRING_ELT(VECTOR_ELT(trees,i),0)); PROTECT(distmatrix = allocMatrix(REALSXP,len, len)); compute_phylo_distance_matrix(treevec, be_verbose, REAL(distmatrix)); // replace -1 values with R N/A values for(int i = 0; i < len*len; i++) if(REAL(distmatrix)[i] == -1) REAL(distmatrix)[i] = R_NaReal; UNPROTECT(1); return distmatrix; } SEXP multiset_diff_integer(SEXP set1, SEXP set2) { unsigned len1 = length(set1); int *s1 = INTEGER(set1); unsigned len2 = length(set2); int *s2 = INTEGER(set2); SEXP newset; PROTECT(newset = allocVector(INTSXP, len1)); int *a = INTEGER(newset); unsigned p=0; for(unsigned i = 0; i < len1; i++) { bool keep = true; for(unsigned j = 0; j < len2; j++) { if(s1[i] == s2[j]) { keep = false; break; } } if(keep) a[p++] = s1[i]; } for(unsigned i = p; i < len1; i++) { a[i] = NA_INTEGER; } UNPROTECT(1); return newset; } // returns edge differences SEXP phycpp_bin_trees(SEXP treelist) { // get number of trees int len = length(treelist); vector treevec(len); // convert 'trees' to a vector of tree strings for(int i = 0; i < len; i++) treevec[i] = CHAR(STRING_ELT(VECTOR_ELT(treelist,i),0)); std::vector trees; build_tree_list(treevec, trees, false); SEXP distmatrix; PROTECT(distmatrix = allocMatrix(REALSXP,len,len)); double *d = REAL(distmatrix); unsigned int sz = trees.size(); // zero out diagonal for(unsigned int i = 0; i < trees.size(); i++) d[sz*i + i] = 0; for(unsigned int i = 0; i < trees.size(); i++) { for(unsigned int j = i; j < trees.size(); j++) { int sim = edgeset_difference(trees[i], trees[j]); d[sz*i + j] = sim; d[sz*j + i] = sim; } } UNPROTECT(1); return distmatrix; } SEXP gromov_distmatrix(SEXP distmatrix, SEXP bDeltas, SEXP scale_method) { bool list_deltas = asLogical(bDeltas); int scaleM = asInteger(scale_method); unsigned len = length(distmatrix); unsigned n = sqrt(static_cast(len)); double *d = REAL(distmatrix); SEXP g; if(list_deltas) { PROTECT(g = allocVector(REALSXP, (n*(n-1)*(n-2)*(n-3))/(4*3*2))); gromov_graycode(d, n, REAL(g), scaleM); UNPROTECT(1); } else { PROTECT(g = allocVector(REALSXP, 1)); REAL(g)[0] = gromov_graycode(d, n, NULL, scaleM); UNPROTECT(1); } return g; } } void build_tree_list(std::vector &trees_in, std::vector &trees, bool verbose) { string t; // build the leaf label LUT t = trees_in[0]; map strtbl = AssignLeafLabels(t); for(unsigned int treeno = 0; treeno < trees_in.size(); treeno++) { t = trees_in[treeno]; if(verbose) Rprintf("Parsing tree %d\n", treeno); PhyEdgeSet tr = NewickParse(t, strtbl); ClampNegativeWeights(tr); trees.push_back(tr); } } int compute_phylo_distance_matrix(vector trees_in, bool verbose, double *distance_matrix) { std::vector trees; build_tree_list(trees_in, trees, verbose); int ctr = 0; int tot = 0.5 * (trees.size() * (trees.size() - 1)); // figure out how big the trees are int k = -1; while(trees[++k].size() == 0) ; stl_bool *incompatibility_buffer = new stl_bool[trees[k].size()*trees[k].size()]; for(unsigned int j = 0; j < trees.size(); j++) // cols { for(unsigned int i = 0; i < j; i++) // rows { ctr++; if(verbose) Rprintf("%d/%d\t\t[%3.2f%%]\n", ctr,tot, (ctr/(double)tot)*100.0); if(trees[i].size() == 0 || trees[j].size() == 0) // mark invalid trees with -1 so we can mark them as N/A later { distance_matrix[i*trees.size()+j] = -1; distance_matrix[j*trees.size()+i] = -1; } else { double d = TreeDistance(trees[i],trees[j], incompatibility_buffer); distance_matrix[i*trees.size()+j] = d; distance_matrix[j*trees.size()+i] = d; } } } delete [] incompatibility_buffer; // fill in the diagonal for(unsigned int i = 0; i < trees.size(); i++) { distance_matrix[i*trees.size()+i] = 0; } return 0; } double gromov_graycode(double *m, size_t n, double *deltas, int scale) { /* implements Knuth 7.2.1.3 Alg R (Revolving Door) */ unsigned c[] = {(unsigned)-1,0,1,2,3,(unsigned)n}; unsigned t = 4; unsigned j; double s[3]; double max = 0; double a,b; double raw_delta; unsigned i = 0; R2: s[0] = m[c[1]*n + c[2]] + m[c[3]*n + c[4]]; s[1] = m[c[1]*n + c[3]] + m[c[2]*n + c[4]]; s[2] = m[c[1]*n + c[4]] + m[c[2]*n + c[3]]; /* get the two largest */ if(s[0] >= s[1]) { a = s[0]; if(s[1] >= s[2]) b = s[1]; else b = s[1]; } else { a = s[1]; if(s[2] >= s[0]) b = s[2]; else b = s[1]; } raw_delta = fabs(a-b); switch(scale) { case 2: raw_delta /= fmax2(a,b); break; case 3: double dd[4]; dd[0] = m[c[1]*n + c[4]] + m[c[1]*n + c[3]] + m[c[3]*n + c[4]]; dd[1] = m[c[1]*n + c[4]] + m[c[1]*n + c[2]] + m[c[2]*n + c[4]]; dd[2] = m[c[2]*n + c[3]] + m[c[3]*n + c[4]] + m[c[2]*n + c[4]]; dd[3] = m[c[1]*n + c[2]] + m[c[1]*n + c[3]] + m[c[2]*n + c[3]]; if(dd[0] >= dd[1] && dd[0] >= dd[2] && dd[0] >= dd[3]) raw_delta /= dd[0]; else if(dd[1] >= dd[0] && dd[1] >= dd[2] && dd[1] >= dd[3]) raw_delta /= dd[1]; else if(dd[2] >= dd[0] && dd[2] >= dd[1] && dd[2] >= dd[3]) raw_delta /= dd[2]; else raw_delta /= dd[3]; break; } if(deltas) { deltas[i] = raw_delta; } if(raw_delta > max) { max = raw_delta; } i++; //R3: /* N.B. assumes t is even */ if(c[1] > 0) { c[1]--; goto R2; } else { j = 2; goto R5; } R4: if(c[j] >= j) { c[j] = c[j-1]; c[j-1] = j - 2; goto R2; } else { j++; } R5: if(c[j] + 1 < c[j+1]) { c[j-1] = c[j]; c[j]++; goto R2; } else { j++; if(j <= t) { goto R4; } } return max/2.0; } distory/src/phylo.h0000644000176200001440000000502513062442437014051 0ustar liggesusers#ifndef _PHYLO_H_ #define _PHYLO_H_ #include #include #include #include #include using namespace std; typedef unsigned char stl_bool; // to prevent bit packing, which slows access time down considerably due to non-addressability of bits struct PhyEdge { double weight; unsigned int id; // used so we can identify edges without doing the set comparisons vector split; // downward == false, upward == true PhyEdge(unsigned short leafcount) { /* this is a hack to add in the root node implicitly. it is added as an * upward edge that is never assigned to. */ split.resize(leafcount+1, true); } bool operator==(const PhyEdge &rhs) const // for inter-tree comparisons { return (split == rhs.split); /* two edges are the same if their splits of leaves are the same */ } bool operator<(const PhyEdge &rhs) const // FOR ORDERING WITHIN A SINGLE TREE ONLY { return id < rhs.id; } inline unsigned int SubsetRemainder(const PhyEdge &rhs) const { bool subset = true; // check left subset for(unsigned int i = 0; i < rhs.split.size(); i++) { if(rhs.split[i] == false && split[i] != false) { subset = false; break; } } if(subset) { // count # of lefts in this int nl = 0; for(unsigned int i = 0; i < split.size(); i++) if(split[i] == false) nl++; // count # of lefts in RHS int nr = 0; for(unsigned int i = 0; i < rhs.split.size(); i++) if(rhs.split[i] == false) nr++; return nl - nr; } // check right subset subset = true; // reset flag for(unsigned int i = 0; i < rhs.split.size(); i++) { if(rhs.split[i] == true && split[i] != false) { subset = false; break; } } if(subset) { // count # of lefts in this int nl = 0; for(unsigned int i = 0; i < split.size(); i++) if(split[i] == false) nl++; // count # of lefts in RHS int nr = 0; for(unsigned int i = 0; i < rhs.split.size(); i++) if(rhs.split[i] == true) nr++; return nl - nr; } return UINT_MAX; } }; typedef vector PhyEdgeSet; #endif /* _PHYLO_H_ */ distory/src/treedist.h0000644000176200001440000000151513062442437014541 0ustar liggesusers/* * Polynomial Time Distance Algorithm for Binary Rooted Phylogenetic Trees * * Note: Algorithm based on presentation by Scott Provan / Megan Owen, * "Computing Geodesic Distances in Tree Space in Polynomial Time" * * John Chakerian * chakj@stanford.edu * */ #ifndef __TREEDIST_H_ #define __TREEDIST_H_ #include #include "phylo.h" #include "newick.h" double ConeDistance(PhyEdgeSet &a, PhyEdgeSet &b); double TreeDistance(PhyEdgeSet &a, PhyEdgeSet &b, stl_bool* incompatible); double DisjointTreeDistance(PhyEdgeSet &a, PhyEdgeSet &b, stl_bool* incompatible, unsigned int max_id); void ClampNegativeWeights(PhyEdgeSet &a); enum maxflow_enum { LP, EK, EKF, KOL, PR }; extern maxflow_enum mf_method; enum metric_enum { GEODESIC, CONEPATH }; extern metric_enum metric_method; string tolower_str(string in); #endif distory/src/distory.c0000644000176200001440000000120413646352561014407 0ustar liggesusers#include #include SEXP phycpp_bin_trees(SEXP treelist); SEXP phycpp_compute_tree_distance_set(SEXP trees, SEXP verbose); SEXP gromov_distmatrix(SEXP distmatrix, SEXP bDeltas, SEXP scale_method); static R_CallMethodDef Call_entries[] = { {"phycpp_bin_trees", (DL_FUNC) &phycpp_bin_trees, 1}, {"phycpp_compute_tree_distance_set", (DL_FUNC) &phycpp_compute_tree_distance_set, 2}, {"gromov_distmatrix", (DL_FUNC) &gromov_distmatrix, 3}, {NULL, NULL, 0} }; void R_init_distory(DllInfo *info) { R_registerRoutines(info, NULL, Call_entries, NULL, NULL); R_useDynamicSymbols(info, FALSE); } distory/src/newick.h0000644000176200001440000000066213062442436014177 0ustar liggesusers#ifndef _NEWICK_H_ #define _NEWICK_H_ #include #include #include "phylo.h" using namespace std; /* Parse a single tree in Newick format */ PhyEdgeSet NewickParse(const string &str, map &name_to_pos); /* Use a single tree to assign each leaf label an integer ID to be used for all * future parsing */ map AssignLeafLabels(const string &str); #endif /* _NEWICK_H_ */ distory/src/treedist-stripped.cpp0000644000176200001440000004045613062442437016733 0ustar liggesusers/* * Polynomial Time Distance Algorithm for Binary Rooted Phylogenetic Trees * * Note: Algorithm based on presentation by Scott Provan / Megan Owen, * "Computing Geodesic Distances in Tree Space in Polynomial Time" * * John Chakerian * chakj@stanford.edu * * Last edited: Feb 28 2010 * * Note that this is a stripped version because it includes only a built-in * Edmonds-Karp network flow implementation; linear programming and Boost.Graph * implementations are not provided, since they would introduce library * dependencies. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include "treedist.h" #include "phylo.h" #include "newick.h" using namespace std; // Callback for clamping 0 or negative weights at a Very Small Value (that can // still be squared) void ClampNegativeWeights(PhyEdgeSet &a) { for(unsigned int i = 0; i < a.size(); i++) if(a[i].weight < sqrt(DBL_MIN)) a[i].weight = sqrt(DBL_MIN); } // Two edges are compatible if there is an intersection between the splits // that is empty. bool EdgesCompatible(PhyEdge &e1, PhyEdge &e2) { // n.b. - left == false, right == true bool n1 = true; bool n2 = true; bool n3 = true; bool n4 = true; for(unsigned int i = 0; i < e1.split.size(); i++) { // left/left if(e1.split[i] == false && e2.split[i] == false) n1 = false; // right/right if(e1.split[i] == true && e2.split[i] == true) n2 = false; // right/left if(e1.split[i] == true && e2.split[i] == false) n3 = false; // left/right if(e1.split[i] == false && e2.split[i] == true) n4 = false; } return (n1 || n2 || n3 || n4); } double TreeDistance(PhyEdgeSet &a, PhyEdgeSet &b, stl_bool* incompatible) { // 1. Separate out into shared edges & unique edges: // a) foreach edge e in a: // i) search b for any edge e' with a leaf set equivalent to e // ii) if found, 'remove' e and e' and store as a pair in // shared_edges // b) all remaining edges are unique to each assert(a.size() == b.size()); std::vector > shared_edges; shared_edges.reserve(a.size()); vector a_shared_idxs; a_shared_idxs.reserve(a.size()); vector b_shared_idxs; b_shared_idxs.reserve(b.size()); for(unsigned int i = 0; i < a.size(); i++) { for(unsigned int j = 0; j < b.size(); j++) { if(a[i] == b[j]) { shared_edges.push_back( make_pair(a[i], b[j]) ); a_shared_idxs.push_back(i); b_shared_idxs.push_back(j); break; // stop checking for this i } } } // The not-shared edges PhyEdgeSet p,q; for(unsigned int i = 0; i < a.size(); i++) if(find(a_shared_idxs.begin(), a_shared_idxs.end(), i) == a_shared_idxs.end()) p.push_back(a[i]); for(unsigned int j = 0; j < b.size(); j++) if(find(b_shared_idxs.begin(), b_shared_idxs.end(), j) == b_shared_idxs.end()) q.push_back(b[j]); std::vector > bins(shared_edges.size()+1); // last one is generic bin // 1.5. For every unique edge, classify under the tightest shared edge, // Classify left-tree edges for(unsigned int i = 0; i < p.size(); i++) { unsigned int smallest_subset = UINT_MAX; unsigned int edge_id = UINT_MAX; for(unsigned int j = 0; j < shared_edges.size(); j++) { unsigned int r = shared_edges[j].first.SubsetRemainder(p[i]); if(r < smallest_subset) { smallest_subset = r; edge_id = j; if(smallest_subset == 1) break; } } if(edge_id == UINT_MAX) // dump it in the generic bin bins[shared_edges.size()].first.push_back(p[i]); else bins[edge_id].first.push_back(p[i]); } // Classify right-tree edges for(unsigned int i = 0; i < q.size(); i++) { unsigned int smallest_subset = UINT_MAX; unsigned int edge_id = UINT_MAX; for(unsigned int j = 0; j < shared_edges.size(); j++) { unsigned int r = shared_edges[j].second.SubsetRemainder(q[i]); if(r < smallest_subset) { smallest_subset = r; edge_id = j; if(smallest_subset == 1) break; } } if(edge_id == UINT_MAX) // dump it in the generic bin bins[shared_edges.size()].second.push_back(q[i]); else bins[edge_id].second.push_back(q[i]); } // Precache all incompatibilities in a 'huge' logical array // (this could be made into a bitmatrix later, if space is in demand) // (this could also be upper-triangular, but again, space isn't a big deal // at the moment) // Keep in mind here that the IDs are _only_ valid within a certain tree. // Thus the ordering always must be (left-hand tree edge , right-hand tree // edge), and the matrix is NOT symmetric unsigned int max_id = a.size(); for(unsigned int binid = 0; binid < bins.size(); binid++) for(unsigned int i = 0; i < bins[binid].first.size(); i++) for(unsigned int j = 0; j < bins[binid].second.size(); j++) incompatible[bins[binid].first[i].id*max_id + bins[binid].second[j].id] = EdgesCompatible(bins[binid].first[i], bins[binid].second[j]); // 2. For every shared edge bin, compute distance for all unique edges under // it. double unique_edge_factor = 0; for(unsigned int i = 0; i < bins.size(); i++) if(bins[i].first.size() > 0) unique_edge_factor += DisjointTreeDistance(bins[i].first, bins[i].second, incompatible, max_id); // 3. Calculate the final distance from both shared & unique edges // a) the shared edges have Euclidean distance: double shared_edge_factor = 0; for(unsigned int i = 0; i < shared_edges.size(); i++) { double t = shared_edges[i].first.weight - shared_edges[i].second.weight; shared_edge_factor += t*t; } // c) combine them together and return return sqrt(unique_edge_factor + shared_edge_factor); } struct NetworkFlowResult { double flow; PhyEdgeSet A_i, B_i, A_i_prime, B_i_prime; }; NetworkFlowResult EKNetworkFlow(PhyEdgeSet &A_i, PhyEdgeSet& B_i, stl_bool* incompatible, unsigned int max_id) { // make these convenient, since they won't be changing unsigned int ac = A_i.size(); unsigned int bc = B_i.size(); // compute square sums of A_i and B_i double sumsq_A_i = 0; for(unsigned int i = 0; i < ac; i++) sumsq_A_i += A_i[i].weight*A_i[i].weight; double sumsq_B_i = 0; for(unsigned int j = 0; j < bc; j++) sumsq_B_i += B_i[j].weight*B_i[j].weight; // A_i first, then B_i, then source, then sink unsigned int N = ac + bc + 2; // for offsets unsigned int s = ac+bc; // source index unsigned int t = ac+bc+1; // sink index // set up the variables for Edmonds-Karp // total flow double f = 0; double *F = new double[N*N]; // flow double *C = new double[N*N]; // capacity // set all the flow to zero fill(F,F + N*N,0); fill(C,C + N*N,0); // necessary? // add capacities from source to A_i for(unsigned int i = 0; i < A_i.size(); i++) // for (s,i) { C[N*s + i] = (A_i[i].weight*A_i[i].weight)/sumsq_A_i; C[N*i + s] = 0; } // add capacities from B_i to sink for(unsigned int j = 0; j < B_i.size(); j++) // for (j, t) { C[N*(ac+j) + t] = (B_i[j].weight*B_i[j].weight)/sumsq_B_i; C[N*t + (ac+j)] = 0; } // add in incompatible edges vector > incompatible_edge_pairs; for(unsigned int i = 0; i < A_i.size(); i++) { for(unsigned int j = 0; j < B_i.size(); j++) { if(incompatible[A_i[i].id*max_id + B_i[j].id ]) // add (i,j) and (j,i) { C[N*i+(ac+j)] = DBL_MAX; C[N*(ac+j)+i] = 0; } } } int *P = new int[N]; // path (reused) double *M = new double[N]; // capacity to node while(true) { // clean up old values, init the path fill(P, P+N, -1); fill(M, M+N, DBL_MAX); P[s] = -2; // do BFS to get a new path std::queue tc; // to-check tc.push(s); bool res = false; // make sure we get a result while(!tc.empty()) { unsigned int u = tc.front(); tc.pop(); // we need to get OUT-edges // decide if we're on a A_i, B_i, s, or t vector E; if(u == s) { // add the nodes in A_i E.resize(ac,0); for(unsigned int i = 0; i < ac; i++) E[i] = i; } else if(u == t) { // add the nodes in B_i E.resize(bc,0); for(unsigned int i = 0; i < bc; i++) E[i] = ac + i; } else if(u < ac) // then it is in A_i { // add the nodes in A_i for(unsigned int j = 0; j < bc; j++) if(incompatible[A_i[u].id*max_id + B_i[j].id ]) E.push_back(ac + j); // add the source - don't think we actually need to do this E.push_back(s); } else // then it is in B_i { // add the nodes in A_i for(unsigned int j = 0; j < ac; j++) if(incompatible[A_i[j].id*max_id + B_i[u-ac].id ]) E.push_back(j); // add the sink E.push_back(t); } // now iterate through E for(unsigned int i = 0; i < E.size(); i++) { if(C[ N*u+E[i] ] - F[ N*u+E[i] ] > 0 && P[E[i]] == -1) { P[E[i]] = u; M[E[i]] = min(M[u],C[ N*u+E[i] ] - F[ N*u+E[i] ]); if(E[i] != t) tc.push(E[i]); else { // we're done with the BFS - empty the queue out while(!tc.empty()) tc.pop(); res = true; } } } } double m = 0; if(res) m = M[t]; // check capacity of the path, break if 0 (no new path found) if(m == 0) break; // add capacity of the path to flow f += m; // travel from sink to source along the path, updating edges with +/- cap unsigned int v = t; while(v != s) { unsigned int u = P[v]; F[N*u + v] += m; F[N*v + u] -= m; v = u; } } NetworkFlowResult r; r.flow = f; if(f < 1 - DBL_EPSILON*100) { // do a BFS to get splits // reset P fill(P, P+N, -1); std::queue Q; Q.push(s); while(!Q.empty()) { unsigned int u = Q.front(); Q.pop(); // we need to get OUT-edges // decide if we're on a A_i, B_i, s, or t vector E; if(u == s) { // add the nodes in A_i E.resize(ac,0); for(unsigned int i = 0; i < ac; i++) E[i] = i; } else if(u == t) { // add the nodes in B_i E.resize(bc,0); for(unsigned int i = 0; i < bc; i++) E[i] = ac + i; } else if(u < ac) // then it is in A_i { // add the nodes in A_i for(unsigned int j = 0; j < bc; j++) if(incompatible[ A_i[u].id*max_id + B_i[j].id ]) E.push_back(ac + j); // add the source - don't think we actually need to do this E.push_back(s); } else // then it is in B_i { // add the nodes in A_i for(unsigned int j = 0; j < ac; j++) if(incompatible[A_i[j].id*max_id + B_i[u-ac].id ]) E.push_back(j); // add the sink E.push_back(t); } // now iterate through E for(unsigned int i = 0; i < E.size(); i++) { if(C[ N*u+E[i] ] - F[ N*u+E[i] ] > 0 && P[E[i]] == -1) { P[E[i]] = 1; Q.push(E[i]); } } } // form our split vectors for(unsigned int i = 0; i < N-2; i++) // ignore the last 2, since source/sink { if(i < ac) // in A_i { if(P[i] == 1) r.A_i_prime.push_back(A_i[i]); else r.A_i.push_back(A_i[i]); } else // in B_i { if(P[i] == 1) r.B_i_prime.push_back(B_i[i-ac]); else r.B_i.push_back(B_i[i-ac]); } } } delete [] C; delete [] F; delete [] P; delete [] M; return r; } double DisjointTreeDistance(PhyEdgeSet &a, PhyEdgeSet &b, stl_bool *incompatible, unsigned int max_id) { // 2. For unique edges: // a) initialize A_1 to all unique edges in a, B_1 to all unique edges in b // b) for each edge e in A_i find all incompatible edges in b // c) construct a bipartite graph based on these incompatible edges // d) run max flow // e) do BFS to find the min-weight vertex cover // f) if min-weight vertex cover has weight < 1, split based on // min-weight vertex cover and push blocks into the right places, else move to next A_i stack > unchecked_blocks; std::vector > finished_blocks; // NB blocks in reverse order pair p; for(unsigned int i = 0; i < a.size(); i++) p.first.push_back(a[i]); for(unsigned int j = 0; j < b.size(); j++) p.second.push_back(b[j]); unchecked_blocks.push(p); while(!unchecked_blocks.empty()) { PhyEdgeSet A_i = unchecked_blocks.top().first; PhyEdgeSet B_i = unchecked_blocks.top().second; unchecked_blocks.pop(); // Some sanity checks if(A_i.size() == 0 || B_i.size() == 0) { finished_blocks.push_back( make_pair(A_i, B_i) ); continue; } if(A_i.size() == 1 && B_i.size() == 1) { finished_blocks.push_back( make_pair(A_i, B_i) ); continue; } NetworkFlowResult r = EKNetworkFlow(A_i, B_i, incompatible, max_id); if(r.flow < 1 - DBL_EPSILON*100) { unchecked_blocks.push( make_pair(r.A_i_prime, r.B_i_prime) ); unchecked_blocks.push( make_pair(r.A_i, r.B_i) ); } else { finished_blocks.push_back( make_pair(A_i, B_i) ); } } // the unique edges have distance sqrt( sum_i (||A_i|| + ||B_i||)^2 ) double unique_edge_factor = 0; for(unsigned int i = 0; i < finished_blocks.size(); i++) { double accum = 0; PhyEdgeSet a = finished_blocks[i].first; PhyEdgeSet b = finished_blocks[i].second; // do the A_i block double t = 0; for(unsigned j = 0; j < a.size(); j++) t += a[j].weight*a[j].weight; t = sqrt(t); accum += t; // do the B_i block t = 0; for(unsigned j = 0; j < b.size(); j++) t += b[j].weight*b[j].weight; t = sqrt(t); accum += t; unique_edge_factor += accum*accum; } return unique_edge_factor; // n.b., not square rooted } distory/src/newick.cpp0000644000176200001440000001663113062442436014535 0ustar liggesusers/* * Polynomial Time Distance Algorithm for Binary Rooted Phylogenetic Trees * * Note: Algorithm based on presentation by Scott Provan / Megan Owen, * "Computing Geodesic Distances in Tree Space in Polynomial Time" * * John Chakerian * chakj@stanford.edu * * Last edited: Apr 22 2010 * */ #include #include #include #include #include #include #include #include #include #include "newick.h" // Optionally parse a floating point value after a ':' to indicate edge weight. // Sets the weight to 1 if no ':' is found. double ParseWeight(const string &str, unsigned int pos, unsigned int *off) { double w = 1; if(str[pos] == ':') { char* e; string s = str.substr(pos+1); const char* cs = s.c_str(); w = strtod(cs, &e); if(cs == e) { w = 1; } else { pos += (e - cs)+1; } } if(off) *off = pos; return w; } map AssignLeafLabels(const string &str) { unsigned int ctr = 0; map ret; bool rec = false; string t = ""; for(unsigned int i = 0; i < str.size(); i++) { if(str[i] == ' ') continue; if(str[i] == '(' || str[i] == ',') { rec = true; continue; } if(rec) { if(isalpha(str[i]) || isdigit(str[i]) || str[i] == '_' || str[i] == '-') t += str[i]; else { ret[t] = ctr; t = ""; ctr++; rec = false; } } } return ret; } // We'll use a simple state machine to parse over the trees. Technically this // isn't the best solution - a PDA would be more appropriate, however we're not // interested in a full parse, only the partitions caused by each edge. Because // of this, we can avoid recursion or any sort of tree data structure and just // go right to the partitions. PhyEdgeSet NewickParse(const string &str, map &name_to_pos) { unsigned int pos = 0; // position in string unsigned int ctr = 0; // nesting level unsigned int id_ctr = 0; // for assigning each a unique id string errstr; vector< set > in_leaves(30); stack commas; // for ensuring trees are bifurcating vector edges; if(str.length() == 0) { return vector(); } // move past leading whitespace while(str[pos] == ' ' || str[pos] == '\t') pos++; // we can bail out early here if we have an empty string or a ; character if(str[pos] == ';') { return vector(); } // if the last non-whitespace character isn't a ;, bail out size_t tpos = str.length()-1; while(tpos > 0 && isspace(str[tpos])) tpos--; if(tpos == 0) // empty string { return vector(); } if(tpos > 0 && str[tpos] != ';') { errstr = "Tree not terminated with ';'"; goto error; } goto start_state; start_state: if(str[pos] == ';') goto finish; else if(str[pos] == '(') goto new_nesting; else { errstr = "Parse error at first character. A tree must have at least 2 taxa."; goto error; } new_nesting: ctr++; in_leaves.push_back(set()); commas.push(0); pos++; // move to first character of new nesting // move past whitespace while(str[pos] == ' ' || str[pos] == '\t') pos++; if(str[pos] == '(') goto new_nesting; // start the process over again else if(isalpha(str[pos]) || isdigit(str[pos]) || str[pos] == '_' || str[pos] == '-') goto leaf_entry; else if(str[pos] == ':') { errstr = "Leaf nodes must have string labels."; goto error; } else if(str[pos] == ')') { errstr = "Empty blocks of form () are not allowed."; goto error; } else { errstr = "Parse error after '('"; goto error; } leaf_entry: // move past whitespace while(str[pos] == ' ' || str[pos] == '\t') pos++; // add the edge to the leaf { // get the leaf name unsigned int spos = pos; while(isalpha(str[pos]) || isdigit(str[pos]) || str[pos] == '_' || str[pos] == '-') { pos++; } string leaflabel = str.substr(spos,pos-spos); PhyEdge e(name_to_pos.size()); e.id = id_ctr++; double w = ParseWeight(str,pos,&pos); e.weight = w; e.split[ name_to_pos[leaflabel] ] = false; edges.push_back(e); for(unsigned int i = 0; i < in_leaves.size(); i++) { in_leaves[i].insert(leaflabel); } } // move past extra whitespace while(str[pos] == ' ' || str[pos] == '\t') pos++; // decide what to do next if(str[pos] == ',') { commas.top()++; if(commas.top() > 1) { errstr = "More than one comma detected in a nesting."; goto error; } pos++; // move past whitespace while(str[pos] == ' ' || str[pos] == '\t') pos++; if(str[pos] == '(') goto new_nesting; else if(isdigit(str[pos]) || isalpha(str[pos]) || str[pos] == '_' || str[pos] == '-') goto leaf_entry; else { errstr = "Couldn't figure out what to do when moving past a ','."; goto error; } } else if(str[pos] == ')') goto close_nesting; else { errstr = "Couldn't figure out what to do after a first leaf entry "; goto error; } close_nesting: if(str[pos] != ')') { errstr = "Close-nesting expected but no end-bracket found."; goto error; } pos++; if(str[pos] == ';') goto finish; // add the edge { PhyEdge e(name_to_pos.size()); e.id = id_ctr++; e.weight = ParseWeight(str,pos,&pos); for(set::iterator it = in_leaves[in_leaves.size()-1].begin(); it != in_leaves[in_leaves.size()-1].end(); ++it) e.split[ name_to_pos[*it] ] = false; edges.push_back(e); } in_leaves.pop_back(); commas.pop(); if(str[pos] == ';') goto finish; // decide what to do next if(str[pos] == ',') { commas.top()++; if(commas.top() > 1) { errstr = "More than one comma detected in a nesting. Trees MUST be binary."; goto error; } pos++; // move past whitespace while(str[pos] == ' ' || str[pos] == '\t') pos++; if(str[pos] == '(') goto new_nesting; else if(isdigit(str[pos]) || isalpha(str[pos]) || str[pos] == '_' || str[pos] == '-') goto leaf_entry; else { errstr = "Couldn't figure out what to do when moving past a ','."; goto error; } } else if(str[pos] == ')') goto close_nesting; else { errstr = "Couldn't figure out what to do after a second leaf entry"; goto error; } error: if(errstr == "") errstr = "Parser ran off the edge."; error("An error was encountered in parsing near position %d: %s\n", pos, errstr.c_str()); return vector(); finish: return edges; } distory/NEWS0000644000176200001440000000034413646322402012451 0ustar liggesusers CHANGES IN DISTORY VERSION 1.4.4 NEW FEATURES o Calls to is.binary.tree() have replaced by calls to is.binary.phylo(). o The file NAMESPACE has been updated. o EP is taking over as maintainer of distory.distory/R/0000755000176200001440000000000013646743247012170 5ustar liggesusersdistory/R/zzz.R0000644000176200001440000000012112102722032013111 0ustar liggesusers.onUnload <- function(libpath) { library.dynam.unload("distory", libpath) } distory/R/position.leverage.R0000644000176200001440000000123511364503012015724 0ustar liggesusersposition.leverage <- function(data, F, to = NULL, rep = 50, by = 1) { N = ncol(data) D = rep(1, N) leverage.vals = c() if(is.null(to)) { to = F(lookup.samples(data, list(convert.table.to.idx(D)))[[1]]) } for(i in seq(1,N,by)) { Dp = D; S = sample(N, N/rep); while(i %in% S) { S = sample(N,N/rep); } Dp[S] = 0; Dp[i] = N/rep; new.tree = F(lookup.samples(data, list(convert.table.to.idx(Dp)))[[1]]) leverage.vals = c(leverage.vals, as.matrix(dist.multiPhylo(list(to, new.tree)))[1,2]) } leverage.vals } distory/R/bethe.tree.R0000644000176200001440000000160311367360306014325 0ustar liggesusersbethe.tree <- function(tips, level.lengths = NULL, outgroup="O", outgroup.dist=1) { if(length(tips) != length(unique(tips))) { stop("Not all tips are unique.\n") } if(!is.null(level.lengths)) { if(length(level.lengths) == 1) { level.lengths = rep(level.lengths, 2) } } if(is.null(level.lengths) || length(level.lengths) == 0) { level.lengths = 1 } groupings <- as.numeric(sapply(1:(length(tips)/2), function(x) rep(x,2))) d <- level.lengths[1] level.lengths <- level.lengths[-1] tips <- lapply(split(tips,groupings), function(x) { sprintf("(%s:%d,%s:%d)",x[1],d,x[2],d)} ) if(length(tips) == 1) { read.tree(text=sprintf("(%s:%d, %s:%d);", tips[[1]], d, outgroup, outgroup.dist)) } else { bethe.tree(tips, level.lengths, outgroup, outgroup.dist) } } distory/R/orthant.boundary.tree.R0000644000176200001440000000126311364501143016532 0ustar liggesusersorthant.boundary.tree <- function(x, y) { e1 <- distinct.edges(x,y) e2 <- distinct.edges(y,x) if(length(e1) != 1) stop("Trees must differ by only one edge") length.e1 = x$edge.length[e1] length.e2 = y$edge.length[e2] lambda = length.e1 / ( length.e1 + length.e2) bdy.tree <- x partitions <- partition.leaves(x) bdy.tree$edge.length <- lapply(partitions, function(e) { a.e <- x$edge.length[edge.from.split(x, e)] b.e <- y$edge.length[edge.from.split(y, e)] a.e*(1 - lambda) + b.e * lambda } ) bdy.tree$edge.length[e1] = 0; bdy.tree } distory/R/gromov.hyperbolicity.R0000644000176200001440000000124113646351430016474 0ustar liggesusersgromov.hyperbolicity <- function(d, deltas = FALSE, scale = NA) { d <- as.matrix(as.dist(d)) a = dim(d); if(a[1] != a[2]) { stop("The parameter could not be coerced into a square distance matrix.") } if(a[1] < 4) { stop("At least 4 points must be used to compute the Gromov hyperbolicity constant.") } if(is.na(scale)) { scale = "none" } scaleV = pmatch(scale, c("none", "max", "perimeter")) if(length(scaleV) > 1 || is.na(scaleV)) { stop("Unknown or ambiguous scale method."); } .Call(gromov_distmatrix, as.double(d), as.logical(deltas), as.integer(scaleV)) } distory/R/dist.multiPhylo.R0000644000176200001440000001404313646351043015412 0ustar liggesusersdist.multiPhylo <- function(x, method="geodesic", force.multi2di = FALSE, outgroup = NULL, convert.multifurcating = FALSE, use.random.resolution = FALSE, scale = NULL, verbose = FALSE) { if(length(x) < 2) return(matrix()) if(class(x) == "multiPhylo") # ideally, we will have this { # run checks if appropriate # separate it out into a vector of strings if(!is.null(outgroup)) { x <- lapply(x, function(k) { if(class(k) == "phylo") { if(!is.rooted(k)) root(k, outgroup, resolve.root = TRUE) else k } else NA }) } if(force.multi2di == TRUE) { x <- lapply(x, function(k) { if(class(k) == "phylo") { multi2di(k,random=use.random.resolution) } else NA }) } else if(convert.multifurcating == TRUE) # won't resolve multifurcations at the root { x <- lapply(x, function(k) { if(class(k) == "phylo") { if(!is.binary.phylo(k)) multi2di(k,random=use.random.resolution) else k } else NA }) } if(!is.null(scale)) { if(class(scale) == "phylo") { T <- sum(scale$edge.length) x <- lapply(x, function(k) { if(class(k) == "phylo") { k$edge.length = k$edge.length * (T / sum(k$edge.length)) k } else NA }) } else if(class(scale) == "numeric") { x <- lapply(x, function(k) { if(class(k) == "phylo") { k$edge.length = k$edge.length * (scale / sum(k$edge.length)) k } else NA }) } else { stop("Scale parameter not understood.\n") } } # do some sanity checks before we start out r <- lapply(x, function(k) { if(class(k) == "phylo") { is.rooted(k) } else NA }) if(!all(as.logical(r), na.rm=TRUE)) { stop("Some trees are not rooted. Specify an outgroup to fix this problem. All trees must be rooted.\n") } r <- lapply(x, function(k) { if(class(k) == "phylo") is.binary.phylo(k) else NA }) if(!all(as.logical(r), na.rm=TRUE)) { stop("Some trees are not binary. All input trees must be strictly binary.\n") } # check to make sure all trees have the same tip labels tips = x[[1]]$tip.label r <- lapply(x, function(k) { if(class(k) == "phylo") setequal(k$tip.label, tips) else NA}) if(!all(as.logical(r), na.rm=TRUE)) { stop("Not all trees have the same tips.\n") } # convert our list of class phylo to a list of strings treestrs <- lapply(x, function(k) { if(class(k) == "phylo") write.tree(k) else "" }) method=tolower(method) method.id = pmatch(method, c("edgeset", "geodesic")) # call the C interface function and return the value automatically if(method.id == 1) { rv <- .Call(phycpp_bin_trees, treestrs) } else if(method.id == 2) { rv <- .Call(phycpp_compute_tree_distance_set, treestrs, as.logical(verbose)) } else { stop("Specified method is not valid") } as.dist(rv) } else if(typeof(x) == "list") { if(class(x[[1]]) == "phylo") # a list of phylo's that for some reason is not classed as multiPhylo { class(x) <- "multiPhylo" # it already is basically a multiPhylo anyways - we'll mark it as such dist.multiPhylo(x, method=method, force.multi2di=force.multi2di, outgroup=outgroup, convert.multifurcating=convert.multifurcating, use.random.resolution=use.random.resolution, scale=scale, verbose=verbose) } else if(class(x[[1]]) == "character") # a list of strings, presuming one tree each, properly terminated { # read with /ape/, run checks, dump back t <- paste(x, sep="", collapse="") k <- read.tree(text=t) dist.multiPhylo(x, method=method, force.multi2di=force.multi2di, outgroup=outgroup, convert.multifurcating=convert.multifurcating, use.random.resolution=use.random.resolution, scale=scale, verbose=verbose) } } else if(class(x) == "character") # this is for one string containing multiple trees { # read with ape and dump back to a vector of strings k <- read.tree(text=x) # call this to process it properly dist.multiPhylo(x, method=method, force.multi2di=force.multi2di, outgroup=outgroup, convert.multifurcating=convert.multifurcating, use.random.resolution=use.random.resolution, scale=scale, verbose=verbose) } else { stop("Cannot coerce the argument into a usable type.") } } distory/R/bin.multiPhylo.R0000644000176200001440000000067711364500765015231 0ustar liggesusersbin.multiPhylo <- function(treelist) { edge.dists <- dist.multiPhylo(treelist, method="edgeset") edge.dists <- as.matrix(edge.dists) bin.id = 1 binning = rep(NA,length(treelist)) while(any(is.na(binning))) { # find the next unbinned tree & find all matching trees next.tr = min(which(is.na(binning))) binning[edge.dists[,next.tr] == 0] = bin.id bin.id = bin.id + 1 } binning } distory/R/mcmc.target.seq.R0000644000176200001440000000602711367372615015306 0ustar liggesusersmcmc.target.seq <- function(data, x, F, n) { N = ncol(data) D = rep(1, N) likvals = c() a = runif(n) old.tree = F(lookup.samples(data, list(convert.table.to.idx(D)))[[1]]) old.dist = as.matrix(dist.multiPhylo(list(x, old.tree)))[1,2] best.tree = old.tree best.dist = old.dist for(i in 1:n) { Dp = D good = FALSE r = sample.int(N, 2) while(D[r[1]] == N || D[r[2]] == 0) { r = sample.int(N, 2) } # propose the swap Dp[r[1]] = Dp[r[1]] + 1; Dp[r[2]] = Dp[r[2]] - 1; new.tree = F(lookup.samples(data, list(convert.table.to.idx(Dp)))[[1]]) new.dist = as.matrix(dist.multiPhylo(list(x, new.tree)))[1,2] v = old.dist/new.dist if(v >= 1) { v = 1 best.tree = new.tree best.dist = new.dist } if(v^(100-exp(1/i)) > a[i]) { D = Dp old.tree = new.tree old.dist = new.dist } likvals = c(likvals, old.dist) } list(data = D, tree = best.tree, distance = best.dist, vals = likvals) } # modified from ape code for boot.phylo boot.samples.idxs <- function(data, B = 100, block = 1) { if (is.list(data) && !is.data.frame(data)) { if (inherits(data, "DNAbin")) data <- as.matrix(data) else { nm <- names(data) n <- length(data) data <- unlist(data) nL <- length(data) data <- matrix(data, n, nL/n, byrow = TRUE) rownames(data) <- nm } } boot.smpls <- vector("list", B) for (i in 1:B) { if (block > 1) { y <- seq(block, ncol(data), block) boot.i <- sample(y, replace = TRUE) boot.samp <- numeric(ncol(data)) boot.samp[y] <- boot.i for (j in 1:(block - 1)) boot.samp[y - j] <- boot.i - j } else boot.samp <- sample(ncol(data), replace = TRUE) boot.smpls[[i]] <- boot.samp } boot.smpls } lookup.samples <- function(data, idxs) { if (is.list(data) && !is.data.frame(data)) { if (inherits(data, "DNAbin")) data <- as.matrix(data) else { nm <- names(data) n <- length(data) data <- unlist(data) nL <- length(data) data <- matrix(data, n, nL/n, byrow = TRUE) rownames(data) <- nm } } lapply(idxs, function(i) { data[,i] } ) } phylo.diff <- function(x, y, ...) { uniqT1 <- distinct.edges(x, y) uniqT2 <- distinct.edges(y, x) treeA.cs <- rep("black", dim(x$edge)[1]) treeA.cs[uniqT1] = "red" treeB.cs <- rep("black", dim(y$edge)[1]) treeB.cs[uniqT2] = "red" par(mfrow=c(1,2)) plot(x, edge.color=treeA.cs, ...) plot(y, edge.color=treeB.cs, ...) invisible() } convert.table.to.idx <- function(T) { dat = c() for(i in 1:length(T)) { dat = c(dat, rep(i, T[i])) } dat } distory/R/phylo.diff.R0000644000176200001440000000262611367133506014350 0ustar liggesusersphylo.diff <- function(x, y, ...) { uniqT1 <- distinct.edges(x, y) uniqT2 <- distinct.edges(y, x) treeA.cs <- rep("black", dim(x$edge)[1]) treeA.cs[uniqT1] = "red" treeB.cs <- rep("black", dim(y$edge)[1]) treeB.cs[uniqT2] = "red" par(mfrow=c(1,2)) plot(x, edge.color=treeA.cs, ...) plot(y, edge.color=treeB.cs, ...) invisible() } distinct.edges <- function(x, y) # all edges in x not in y { bp1 <- partition.leaves(x) bp1 <- lapply(bp1, sort) bp2 <- partition.leaves(y) bp2 <- lapply(bp2, sort) p = c() for(i in 1:length(bp1)) { if(!(list(bp1[[i]]) %in% bp2)) { p <- append(p, i) } } p } edge.from.split <- function(x, split) { splits <- partition.leaves(x) splits <- lapply(splits, sort) split <- sort(split) match(list(split), splits) } get.bipartition <- function(x, e) # of leaves from edge { acc = vector() inc = FALSE; for(i in 1:dim(x$edge)[1]) { if(x$edge[i,1] == e) { acc <- append(acc, get.bipartition(x, x$edge[i,2])); inc = TRUE; } } if(!inc) { acc = x$tip.label[e]; } acc } partition.leaves <- function(x) # get all bipartitoins { r = list() for(i in 1:dim(x$edge)[1]) { t <- get.bipartition(x, x$edge[i,2]); r <- append(r, list(t)); } r } distory/MD50000644000176200001440000000275113647011422012264 0ustar liggesusers29304ea5f131e14a9b05128c0096d072 *DESCRIPTION 22cf79908ff979b1ccf61d093167990c *LICENSE 659dae8baf93188f3cb225c3f2be88a6 *NAMESPACE a1a7b911567b21803f323b78805704a9 *NEWS 850d5d746dff4c1e7e623fe7cd35b03a *R/bethe.tree.R a2ed747f17ee32505c78a67a2ab03945 *R/bin.multiPhylo.R 21578a173088fd458f97e92d8d856072 *R/dist.multiPhylo.R 0c4a3430217babff21aa779cda40589f *R/gromov.hyperbolicity.R 4606b7200fc81f8a2c8204b170d8e19a *R/mcmc.target.seq.R 056d8a5ee03dc5dd336ed03dd90357e7 *R/orthant.boundary.tree.R 626d600d681b8f4cc1b842759126e3ee *R/phylo.diff.R b9984556b72472e92c3b298ddcd06df5 *R/position.leverage.R b645bf9091131f9aafecc84714c99776 *R/zzz.R b52d734c21851961c9dbd937f1a5bb1d *man/bethe.tree.Rd 7b411270bd4a9d3bfc6222b9547173c7 *man/bin.multiPhylo.Rd c1cad01c56ee9e476006294b16672b4c *man/dist.multiPhylo.Rd 30d355dcb7a5b0afb2b4db8172025672 *man/distory-package.Rd bfb9d55ab817f217405415650a9fca2c *man/gromov.hyperbolicity.Rd 11026a8b477487e8562a24e6e801c44b *man/mcmc.target.seq.Rd 083f25bd19060c82556e7263b8a51fb1 *man/orthant.boundary.tree.Rd 62fa8a5cf68c312c16948510063013a7 *man/phylo.diff.Rd 87291ab9b8b2bdad9687a1198920fd0d *man/position.leverage.Rd 7cac7222e5dbfec64aaa99579562258b *src/distory.c f4d2b55dddfbaca90b778b3e1f45e536 *src/newick.cpp 76960301ce979b7a7595211c7f3df496 *src/newick.h 5f84e1edcdbd8fde7662fc282747fed1 *src/phydist_r.cpp ab78adf7f6f16d7cbc09a1265f36b4e1 *src/phylo.h 32800a06d5f51921d0ce2c1ba960bff1 *src/treedist-stripped.cpp 16f8de5df9c6f8798a3e6f57495ec821 *src/treedist.h