energy/0000755000176200001440000000000014351017352011544 5ustar liggesusersenergy/NAMESPACE0000644000176200001440000000165414300430160012757 0ustar liggesusersuseDynLib(energy, .registration=TRUE) importFrom(Rcpp, evalCpp) importFrom("stats", "as.dist", "dist", "dnorm", "hclust", "model.matrix", "pnorm", "ppois", "pt", "rnorm", "rpois", "sd", "var") importFrom(boot, boot) importFrom(gsl, hyperg_1F1) export( bcdcor, calc_dist, D_center, Dcenter, dcor, dcor2d, DCOR, dcor.t, dcor.test, dcor.ttest, dcorT, dcorT.test, dcov, dcov2d, dcov.test, dcovU, dcovU_stats, disco, disco.between, edist, energy.hclust, eqdist.e, eqdist.etest, indep.test, is.dmatrix, kgroups, ksample.e, mutualIndep.test, mvI, mvI.test, mvnorm.e, mvnorm.etest, mvnorm.test, normal.e, normal.test, pdcor, pdcor.test, pdcov, pdcov.test, poisson.e, poisson.etest, poisson.m, poisson.mtest, poisson.tests, sortrank, U_center, U_product, Ucenter ) S3method(print, disco) S3method(print, kgroups) S3method(fitted, kgroups) energy/README.md0000644000176200001440000000116014005374454013026 0ustar liggesusers# energy energy package for R The energy package for R implements several methods in multivariate analysis and multivariate inference based on the energy distance, which characterizes equality of distributions. Distance correlation (multivariate independence), disco (nonparametric extension of ANOVA), and goodness-of-fit tests are examples of some of the methods included. energy is named based on the analogy with potential energy in physics. See the references in the manual for more details. 
[![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/energy)]https://cran.r-project.org/package=energy)energy/data/0000755000176200001440000000000014005374454012462 5ustar liggesusersenergy/data/EVnormal.rda0000644000176200001440000000777714005374454014717 0ustar liggesusersBZh91AY&SYrB) jt: Q@jdF2 m!zmOSiha#=OS#@@42 M4MiFb44ѠڞQMz&iFhETi`ѣGLQaOP3hhi2=@H'=& 4 =#jz'=OQ24ƈbi1 L@ MLLiE50@4 dd&hAM4hh UTFCLbz&a1bdBz4LAL0F xjcSeOOMii2f&CC@Ԯ<%UUUUUUUUUUUUUUUUUUUU !%')+-/1356789:;<=>? !"#$%&'()*ڡ!apȋlMpx}F (s"#dDD[իVCDAgc`&O_x!A=0$.ԿC;4!dQ  LAYbh5u=Bm$ܹۗ['bc4=(̻QlOFrpM+ ՘g pC~ 2FUS̙Kn3iwY(鼝CS 7e \&_L7A͌lumCB&/^TFՏ+q1yH(8v7^C7 iF#j WD'`)|tEP#C0:I \)>F5܇Q73pYrCuxD,5+,M. iطqz#nԳy< m^*-YU_P_MoBV3CbC˃7.d|. NU48pCs>լq"85ވSјVX-,Na.lK)ZZpR|R̋Gƭy`fsgҲ,닎N&tԌS3BXX*YC'[.)Q`G2rA\]{rv%ye7Y=L!J(I_ ')lGNK"gJa]:['HDYAVfb9a2tdۑvri229*3H!ؒI$I$sJI$(J(J(J(Jm#2|R;+8BlPY^ MW<"#) i(O%~Z ݪLS0e'j=&5 R\$aM /}ˊY9j)PvHlaf֭Oh^(u/KkOV 'B)rG`'gOkf#X'luMO0Aԣ3iԮ<: L ܡfى1#J/b")F.Rr-Sj+wJHCTkRFs-46cո8s Eo {WF ЙhD:KXim[a i$C8Oq f dPב&w0M{b@KH43 V7/y+*Bn@ZGX? 
bSFDd)!xٍh| gmѠ́n]È ;1_UhVeބ!4 sd*TY yu,:5,Be7HaD%&0d#C‚zPќw52ÅaߵR,"Fh(Vck/ϞqVqJ+YdÚ뒻 A*ɾ{z.OV#;-j&oҟJ\e"KÞyNKgvi#dPBY2֔8>3)(`xt)nZUCa aOxPR ,bDw hjJkC/q͙}!>&~k4fǃX╪ H3?8dDt --q`i?2z.٩G L `R 0>N b~6snGф$X}u&v'{4ù/sd.0eg4j:ḙR)&!Sʞ%OgŸ ǬqHt>#o<*_Ai.0yEe)f~iI?q!fJmqr\| R;=G`.du>+n}\'o^D[F .k\2".pkӮ(*/I(7u4W/,sIqxCѓ٘!"DÚW?9`InY8:9SPb͗,tdhF×p#9 %;Z#%"\V|a e<甕:ο ##x>%v<%OgqO سnP-.nCv& YͣtBQ -E xQl*A@~}-fꙑ´1F;#e}P@A Di$q8Urt1clP;(c R@ 5.vV}´"\T$X6v"APx"IqtEiv e"d.YB7EL\QaBbu~G\w2rX?jqyw!ascm "Zڛ?ҹaQl&࡞Ȟ$ Y 50 n <- 100 x <- rnorm(100) y <- rnorm(100) all.equal(dcov(x, y)^2, dcov2d(x, y), check.attributes = FALSE) all.equal(bcdcor(x, y), dcor2d(x, y, "U"), check.attributes = FALSE) x <- rlnorm(400) y <- rexp(400) dcov.test(x, y, R=199) #permutation test dcor.test(x, y, R=199) } } \keyword{ htest } \keyword{ nonparametric } \concept{ independence } \concept{ distance correlation } \concept{ distance covariance } \concept{ energy statistics } energy/man/dcorT.Rd0000644000176200001440000000521414227267445013677 0ustar liggesusers\name{dcorT} \alias{dcorT.test} \alias{dcorT} \title{ Distance Correlation t-Test} \description{ Distance correlation t-test of multivariate independence for high dimension.} \usage{ dcorT.test(x, y) dcorT(x, y) } \arguments{ \item{x}{ data or distances of first sample} \item{y}{ data or distances of second sample} } \details{ \code{dcorT.test} performs a nonparametric t-test of multivariate independence in high dimension (dimension is close to or larger than sample size). As dimension goes to infinity, the asymptotic distribution of the test statistic is approximately Student t with \eqn{n(n-3)/2-1} degrees of freedom and for \eqn{n \geq 10} the statistic is approximately distributed as standard normal. The sample sizes (number of rows) of the two samples must agree, and samples must not contain missing values. The t statistic (dcorT) is a transformation of a bias corrected version of distance correlation (see SR 2013 for details). 
Large values (upper tail) of the dcorT statistic are significant. } \note{ \code{dcor.t} and \code{dcor.ttest} are deprecated. } \value{ \code{dcorT} returns the dcor t statistic, and \code{dcorT.test} returns a list with class \code{htest} containing \item{ method}{ description of test} \item{ statistic}{ observed value of the test statistic} \item{ parameter}{ degrees of freedom} \item{ estimate}{ (bias corrected) squared dCor(x,y)} \item{ p.value}{ p-value of the t-test} \item{ data.name}{ description of data} } \seealso{ \code{\link{bcdcor}} \code{\link{dcov.test}} \code{\link{dcor}} \code{\link{DCOR}} } \references{ Szekely, G.J. and Rizzo, M.L. (2013). The distance correlation t-test of independence in high dimension. \emph{Journal of Multivariate Analysis}, Volume 117, pp. 193-213. \cr \doi{10.1016/j.jmva.2013.02.012} Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007), Measuring and Testing Dependence by Correlation of Distances, \emph{Annals of Statistics}, Vol. 35 No. 6, pp. 2769-2794. \cr \doi{10.1214/009053607000000505} Szekely, G.J. and Rizzo, M.L. (2009), Brownian Distance Covariance, \emph{Annals of Applied Statistics}, Vol. 3, No. 4, 1236-1265. \cr \doi{10.1214/09-AOAS312} } \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and Gabor J. Szekely } \examples{ x <- matrix(rnorm(100), 10, 10) y <- matrix(runif(100), 10, 10) dcorT(x, y) dcorT.test(x, y) } \keyword{ htest } \keyword{ multivariate } \keyword{ nonparametric } \concept{ independence } \concept{ multivariate } \concept{ distance correlation } \concept{ distance covariance } \concept{ energy statistics } energy/man/eigen.Rd0000644000176200001440000000202214014532550013667 0ustar liggesusers\name{EVnormal} \docType{data} \alias{EVnormal} \alias{eigenvalues} \title{Eigenvalues for the energy Test of Univariate Normality} \description{ Pre-computed eigenvalues corresponding to the asymptotic sampling distribution of the energy test statistic for univariate normality, under the null hypothesis. 
Four Cases are computed: \enumerate{ \item Simple hypothesis, known parameters. \item Estimated mean, known variance. \item Known mean, estimated variance. \item Composite hypothesis, estimated parameters. } Case 4 eigenvalues are used in the test function \code{normal.test} when \code{method=="limit"}. } \usage{data(EVnormal)} \format{Numeric matrix with 125 rows and 5 columns; column 1 is the index, and columns 2-5 are the eigenvalues of Cases 1-4.} \source{Computed} \references{ Szekely, G. J. and Rizzo, M. L. (2005) A New Test for Multivariate Normality, \emph{Journal of Multivariate Analysis}, 93/1, 58-80, \doi{10.1016/j.jmva.2003.12.002}. } energy/man/pdcor.Rd0000644000176200001440000000412514251252003013710 0ustar liggesusers\name{pdcor} \alias{pdcor} \alias{pdcov} \alias{pdcor.test} \alias{pdcov.test} \title{ Partial distance correlation and covariance } \description{Partial distance correlation pdcor, pdcov, and tests.} \usage{ pdcov.test(x, y, z, R) pdcor.test(x, y, z, R) pdcor(x, y, z) pdcov(x, y, z) } \arguments{ \item{x}{ data or dist object of first sample} \item{y}{ data or dist object of second sample} \item{z}{ data or dist object of third sample} \item{R}{ replicates for permutation test} } \details{ \code{pdcor(x, y, z)} and \code{pdcov(x, y, z)} compute the partial distance correlation and partial distance covariance, respectively, of x and y removing z. A test for zero partial distance correlation (or zero partial distance covariance) is implemented in \code{pdcor.test}, and \code{pdcov.test}. Argument types supported are numeric data matrix, data.frame, tibble, numeric vector, class "dist" object, or factor. For unordered factors a 0-1 distance matrix is computed. } \value{ Each test returns an object of class \code{htest}. } \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and Gabor J. Szekely } \references{ Szekely, G.J. and Rizzo, M.L. (2014), Partial Distance Correlation with Methods for Dissimilarities. \emph{Annals of Statistics}, Vol. 
42 No. 6, 2382-2412. } \examples{ n = 30 R <- 199 ## mutually independent standard normal vectors x <- rnorm(n) y <- rnorm(n) z <- rnorm(n) pdcor(x, y, z) pdcov(x, y, z) set.seed(1) pdcov.test(x, y, z, R=R) set.seed(1) pdcor.test(x, y, z, R=R) \donttest{ if (require(MASS)) { p = 4 mu <- rep(0, p) Sigma <- diag(p) ## linear dependence y <- mvrnorm(n, mu, Sigma) + x print(pdcov.test(x, y, z, R=R)) ## non-linear dependence y <- mvrnorm(n, mu, Sigma) * x print(pdcov.test(x, y, z, R=R)) } } } \keyword{ htest } \keyword{ multivariate } \keyword{ nonparametric } \concept{ independence } \concept{ multivariate } \concept{ distance correlation } \concept{ distance covariance } \concept{ energy statistics } energy/man/energy.hclust.Rd0000644000176200001440000001274214227266735015423 0ustar liggesusers\name{energy.hclust} \alias{energy.hclust} \title{ Hierarchical Clustering by Minimum (Energy) E-distance } \description{ Performs hierarchical clustering by minimum (energy) E-distance method. } \usage{ energy.hclust(dst, alpha = 1) } \arguments{ \item{dst}{\code{dist} object} \item{alpha}{distance exponent} } \details{ Dissimilarities are \eqn{d(x,y) = \|x-y\|^\alpha}{||x-y||^a}, where the exponent \eqn{\alpha}{a} is in the interval (0,2]. This function performs agglomerative hierarchical clustering. Initially, each of the n singletons is a cluster. At each of n-1 steps, the procedure merges the pair of clusters with minimum e-distance. The e-distance between two clusters \eqn{C_i, C_j} of sizes \eqn{n_i, n_j} is given by \deqn{e(C_i, C_j)=\frac{n_i n_j}{n_i+n_j}[2M_{ij}-M_{ii}-M_{jj}], } where \deqn{M_{ij}=\frac{1}{n_i n_j}\sum_{p=1}^{n_i} \sum_{q=1}^{n_j} \|X_{ip}-X_{jq}\|^\alpha,}{ M_{ij} = 1/(n_i n_j) sum[1:n_i, 1:n_j] ||X_(ip) - X_(jq)||^a,} \eqn{\|\cdot\|}{|| ||} denotes Euclidean norm, and \eqn{X_{ip}}{ X_(ip)} denotes the p-th observation in the i-th cluster. 
The return value is an object of class \code{hclust}, so \code{hclust} methods such as print or plot methods, \code{plclust}, and \code{cutree} are available. See the documentation for \code{hclust}. The e-distance measures both the heterogeneity between clusters and the homogeneity within clusters. \eqn{\mathcal E}{E}-clustering (\eqn{\alpha=1}{a=1}) is particularly effective in high dimension, and is more effective than some standard hierarchical methods when clusters have equal means (see example below). For other advantages see the references. \code{edist} computes the energy distances for the result (or any partition) and returns the cluster distances in a \code{dist} object. See the \code{edist} examples. } \value{ An object of class \code{hclust} which describes the tree produced by the clustering process. The object is a list with components: \item{merge:}{ an n-1 by 2 matrix, where row i of \code{merge} describes the merging of clusters at step i of the clustering. If an element j in the row is negative, then observation -j was merged at this stage. If j is positive then the merge was with the cluster formed at the (earlier) stage j of the algorithm.} \item{height:}{the clustering height: a vector of n-1 non-decreasing real numbers (the e-distance between merging clusters)} \item{order:}{ a vector giving a permutation of the indices of original observations suitable for plotting, in the sense that a cluster plot using this ordering and matrix \code{merge} will not have crossings of the branches.} \item{labels:}{ labels for each of the objects being clustered.} \item{call:}{ the call which produced the result.} \item{method:}{ the cluster method that has been used (e-distance).} \item{dist.method:}{ the distance that has been used to create \code{dst}.} } \note{ Currently \code{stats::hclust} implements Ward's method by \code{method="ward.D2"}, which applies the squared distances. That method was previously \code{"ward"}. 
Because both \code{hclust} and energy use the same type of Lance-Williams recursive formula to update cluster distances, now with the additional option \code{method="ward.D"} in \code{hclust}, the energy distance method is easily implemented by \code{hclust}. (Some "Ward" algorithms do not use Lance-Williams, however). Energy clustering (with \code{alpha=1}) and "ward.D" now return the same result, except that the cluster heights of energy hierarchical clustering with \code{alpha=1} are two times the heights from \code{hclust}. In order to ensure compatibility with hclust methods, \code{energy.hclust} now passes arguments through to \code{hclust} after possibly applying the optional exponent to distance. } \references{ Szekely, G. J. and Rizzo, M. L. (2005) Hierarchical Clustering via Joint Between-Within Distances: Extending Ward's Minimum Variance Method, \emph{Journal of Classification} 22(2) 151-183. \cr \doi{10.1007/s00357-005-0012-9} Szekely, G. J. and Rizzo, M. L. (2004) Testing for Equal Distributions in High Dimension, \emph{InterStat}, November (5). Szekely, G. J. (2000) Technical Report 03-05: \eqn{\mathcal{E}}{E}-statistics: Energy of Statistical Samples, Department of Mathematics and Statistics, Bowling Green State University. } \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and Gabor J. 
Szekely } \seealso{ \code{\link{edist}} \code{\link{ksample.e}} \code{\link{eqdist.etest}} \code{hclust}} \examples{ \dontrun{ library(cluster) data(animals) plot(energy.hclust(dist(animals))) data(USArrests) ecl <- energy.hclust(dist(USArrests)) print(ecl) plot(ecl) cutree(ecl, k=3) cutree(ecl, h=150) ## compare performance of e-clustering, Ward's method, group average method ## when sampled populations have equal means: n=200, d=5, two groups z <- rbind(matrix(rnorm(1000), nrow=200), matrix(rnorm(1000, 0, 5), nrow=200)) g <- c(rep(1, 200), rep(2, 200)) d <- dist(z) e <- energy.hclust(d) a <- hclust(d, method="average") w <- hclust(d^2, method="ward.D2") list("E" = table(cutree(e, k=2) == g), "Ward" = table(cutree(w, k=2) == g), "Avg" = table(cutree(a, k=2) == g)) } } \keyword{ multivariate } \keyword{ cluster } \concept{ energy statistics } energy/man/poisson.Rd0000644000176200001440000001032314227267103014302 0ustar liggesusers\name{Poisson Tests} \alias{poisson.tests} \alias{poisson.e} \alias{poisson.etest} \alias{poisson.m} \alias{poisson.mtest} \title{ Goodness-of-Fit Tests for Poisson Distribution} \description{ Performs the mean distance goodness-of-fit test and the energy goodness-of-fit test of Poisson distribution with unknown parameter. } \usage{ poisson.e(x) poisson.m(x) poisson.etest(x, R) poisson.mtest(x, R) poisson.tests(x, R, test="all") } \arguments{ \item{x}{ vector of nonnegative integers, the sample data } \item{R}{ number of bootstrap replicates } \item{test}{ name of test(s) } } \details{ Two distance-based tests of Poissonity are applied in \code{poisson.tests}, "M" and "E". The default is to do all tests and return results in a data frame. Valid choices for \code{test} are "M", "E", or "all" with default "all". If "all" tests, all tests are performed by a single parametric bootstrap computing all test statistics on each sample. The "M" choice is two tests, one based on a Cramer-von Mises distance and the other an Anderson-Darling distance. 
The "E" choice is the energy goodness-of-fit test. \code{R} must be a positive integer for a test. If \code{R} is missing or 0, a warning is printed but test statistics are computed (without testing). The mean distance test of Poissonity (M-test) is based on the result that the sequence of expected values E|X-j|, j=0,1,2,... characterizes the distribution of the random variable X. As an application of this characterization one can get an estimator \eqn{\hat F(j)} of the CDF. The test statistic (see \code{\link{poisson.m}}) is a Cramer-von Mises type of distance, with M-estimates replacing the usual EDF estimates of the CDF: \deqn{M_n = n\sum_{j=0}^\infty (\hat F(j) - F(j\;; \hat \lambda))^2 f(j\;; \hat \lambda).}{M_n = n sum [j>=0] (\hat F(j) - F(j; \hat \lambda))^2 f(j; \hat \lambda).} In \code{poisson.tests}, an Anderson-Darling type of weight is also applied when \code{test="M"} or \code{test="all"}. The tests are implemented by parametric bootstrap with \code{R} replicates. An energy goodness-of-fit test (E) is based on the test statistic \deqn{Q_n = n (\frac{2}{n} \sum_{i=1}^n E|x_i - X| - E|X-X'| - \frac{1}{n^2} \sum_{i,j=1}^n |x_i - x_j|), }{Q_n = n((2/n) sum[1:n] E|x_i-X| - E|X-X'| - (1/n^2) sum[1:n,1:n] |x_i-x_j|),} where X and X' are iid with the hypothesized null distribution. For a test of H: X ~ Poisson(\eqn{\lambda}), we can express E|X-X'| in terms of Bessel functions, and E|x_i - X| in terms of the CDF of Poisson(\eqn{\lambda}). If test=="all" or not specified, all tests are run with a single parametric bootstrap. \code{poisson.mtest} implements only the Poisson M-test with Cramer-von Mises type distance. \code{poisson.etest} implements only the Poisson energy test. } \value{ The functions \code{poisson.m} and \code{poisson.e} return the test statistics. 
The function \code{poisson.mtest} or \code{poisson.etest} return an \code{htest} object containing \item{method}{Description of test} \item{statistic}{observed value of the test statistic} \item{p.value}{approximate p-value of the test} \item{data.name}{replicates R} \item{estimate}{sample mean} \code{poisson.tests} returns "M-CvM test", "M-AD test" and "Energy test" results in a data frame with columns \item{estimate}{sample mean} \item{statistic}{observed value of the test statistic} \item{p.value}{approximate p-value of the test} \item{method}{Description of test} which can be coerced to a \code{tibble}. } \note{The running time of the M test is much faster than the E-test.} \references{ Szekely, G. J. and Rizzo, M. L. (2004) Mean Distance Test of Poisson Distribution, \emph{Statistics and Probability Letters}, 67/3, 241-247. \doi{10.1016/j.spl.2004.01.005}. Szekely, G. J. and Rizzo, M. L. (2005) A New Test for Multivariate Normality, \emph{Journal of Multivariate Analysis}, 93/1, 58-80, \doi{10.1016/j.jmva.2003.12.002}. } \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and Gabor J. Szekely } \examples{ x <- rpois(50, 2) poisson.m(x) poisson.e(x) \donttest{ poisson.etest(x, R=199) poisson.mtest(x, R=199) poisson.tests(x, R=199) } } \keyword{ htest } \keyword{ energy } energy/man/disco.Rd0000644000176200001440000001200714350623777013724 0ustar liggesusers\name{disco} \alias{disco} \alias{disco.between} \alias{print.disco} \title{ distance components (DISCO)} \description{ E-statistics DIStance COmponents and tests, analogous to variance components and anova. 
} \usage{ disco(x, factors, distance, index=1.0, R, method=c("disco","discoB","discoF")) disco.between(x, factors, distance, index=1.0, R) } \arguments{ \item{x}{ data matrix or distance matrix or dist object} \item{factors}{ matrix or data frame of factor labels or integers (not design matrix)} \item{distance}{ logical, TRUE if x is distance matrix} \item{index}{ exponent on Euclidean distance in (0,2]} \item{R}{ number of replicates for a permutation test} \item{method}{ test statistic } } \details{ \code{disco} calculates the distance components decomposition of total dispersion and if R > 0 tests for significance using the test statistic disco "F" ratio (default \code{method="disco"}), or using the between component statistic (\code{method="discoB"}), each implemented by permutation test. If \code{x} is a \code{dist} object, argument \code{distance} is ignored. If \code{x} is a distance matrix, set \code{distance=TRUE}. In the current release \code{disco} computes the decomposition for one-way models only. } \value{ When \code{method="discoF"}, \code{disco} returns a list similar to the return value from \code{anova.lm}, and the \code{print.disco} method is provided to format the output into a similar table. 
Details: \code{disco} returns a class \code{disco} object, which is a list containing \item{call}{call} \item{method}{method} \item{statistic}{vector of observed statistics} \item{p.value}{vector of p-values} \item{k}{number of factors} \item{N}{number of observations} \item{between}{between-sample distance components} \item{withins}{one-way within-sample distance components} \item{within}{within-sample distance component} \item{total}{total dispersion} \item{Df.trt}{degrees of freedom for treatments} \item{Df.e}{degrees of freedom for error} \item{index}{index (exponent on distance)} \item{factor.names}{factor names} \item{factor.levels}{factor levels} \item{sample.sizes}{sample sizes} \item{stats}{matrix containing decomposition} When \code{method="discoB"}, \code{disco} passes the arguments to \code{disco.between}, which returns a class \code{htest} object. \code{disco.between} returns a class \code{htest} object, where the test statistic is the between-sample statistic (proportional to the numerator of the F ratio of the \code{disco} test). } \references{ M. L. Rizzo and G. J. Szekely (2010). DISCO Analysis: A Nonparametric Extension of Analysis of Variance, Annals of Applied Statistics, Vol. 4, No. 2, 1034-1055. \cr \doi{10.1214/09-AOAS245} } \note{ The current version does all calculations via matrix arithmetic and boot function. Support for more general additive models and a formula interface is under development. \code{disco} methods have been added to the cluster distance summary function \code{edist}, and energy tests for equality of distribution (see \code{eqdist.etest}). } \seealso{ \code{ \link{edist} } \code{ \link{eqdist.e} } \code{ \link{eqdist.etest} } \code{ \link{ksample.e} } } \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and Gabor J. 
Szekely } \examples{ ## warpbreaks one-way decompositions data(warpbreaks) attach(warpbreaks) disco(breaks, factors=wool, R=99) ## warpbreaks two-way wool+tension disco(breaks, factors=data.frame(wool, tension), R=0) ## warpbreaks two-way wool*tension disco(breaks, factors=data.frame(wool, tension, wool:tension), R=0) ## When index=2 for univariate data, we get ANOVA decomposition disco(breaks, factors=tension, index=2.0, R=99) aov(breaks ~ tension) ## Multivariate response ## Example on producing plastic film from Krzanowski (1998, p. 381) tear <- c(6.5, 6.2, 5.8, 6.5, 6.5, 6.9, 7.2, 6.9, 6.1, 6.3, 6.7, 6.6, 7.2, 7.1, 6.8, 7.1, 7.0, 7.2, 7.5, 7.6) gloss <- c(9.5, 9.9, 9.6, 9.6, 9.2, 9.1, 10.0, 9.9, 9.5, 9.4, 9.1, 9.3, 8.3, 8.4, 8.5, 9.2, 8.8, 9.7, 10.1, 9.2) opacity <- c(4.4, 6.4, 3.0, 4.1, 0.8, 5.7, 2.0, 3.9, 1.9, 5.7, 2.8, 4.1, 3.8, 1.6, 3.4, 8.4, 5.2, 6.9, 2.7, 1.9) Y <- cbind(tear, gloss, opacity) rate <- factor(gl(2,10), labels=c("Low", "High")) ## test for equal distributions by rate disco(Y, factors=rate, R=99) disco(Y, factors=rate, R=99, method="discoB") ## Just extract the decomposition table disco(Y, factors=rate, R=0)$stats ## Compare eqdist.e methods for rate ## disco between stat is half of original when sample sizes equal eqdist.e(Y, sizes=c(10, 10), method="original") eqdist.e(Y, sizes=c(10, 10), method="discoB") ## The between-sample distance component disco.between(Y, factors=rate, R=0) } \keyword{ htest } \keyword{ multivariate } energy/man/kgroups.Rd0000644000176200001440000000730014005374454014305 0ustar liggesusers\name{kgroups} \alias{kgroups} \title{ K-Groups Clustering } \description{ Perform k-groups clustering by energy distance. 
} \usage{ kgroups(x, k, iter.max = 10, nstart = 1, cluster = NULL) } \arguments{ \item{x}{Data frame or data matrix or distance object} \item{k}{number of clusters} \item{iter.max}{maximum number of iterations} \item{nstart}{number of restarts} \item{cluster}{initial clustering vector} } \details{ K-groups is based on the multisample energy distance for comparing distributions. Based on the disco decomposition of total dispersion (a Gini type mean distance) the objective function should either maximize the total between cluster energy distance, or equivalently, minimize the total within cluster energy distance. It is more computationally efficient to minimize within distances, and that makes it possible to use a modified version of the Hartigan-Wong algorithm (1979) to implement K-groups clustering. The within cluster Gini mean distance is \deqn{G(C_j) = \frac{1}{n_j^2} \sum_{i,m=1}^{n_j} |x_{i,j} - x_{m,j}|} and the K-groups within cluster distance is \deqn{W_j = \frac{n_j}{2}G(C_j) = \frac{1}{2 n_j} \sum_{i,m=1}^{n_j} |x_{i,j} - x_{m,j}|.} If z is the data matrix for cluster \eqn{C_j}, then \eqn{W_j} could be computed as \code{sum(dist(z)) / nrow(z)}. If cluster is not NULL, the clusters are initialized by this vector (can be a factor or integer vector). Otherwise clusters are initialized with random labels in k approximately equal size clusters. If \code{x} is not a distance object (class(x) == "dist") then \code{x} is converted to a data matrix for analysis. Run up to \code{iter.max} complete passes through the data set until a local min is reached. If \code{nstart > 1}, on second and later starts, clusters are initialized at random, and the best result is returned. 
} \value{ An object of class \code{kgroups} containing the components \item{call}{the function call} \item{cluster}{vector of cluster indices} \item{sizes}{cluster sizes} \item{within}{vector of Gini within cluster distances} \item{W}{sum of within cluster distances} \item{count}{number of moves} \item{iterations}{number of iterations} \item{k}{number of clusters} \code{cluster} is a vector containing the group labels, 1 to k. \code{print.kgroups} prints some of the components of the kgroups object. Expect that count is 0 if the algorithm converged to a local min (that is, 0 moves happened on the last iteration). If iterations equals iter.max and count is positive, then the algorithm did not converge to a local min. } \author{ Maria Rizzo and Songzi Li } \references{ Li, Songzi (2015). "K-groups: A Generalization of K-means by Energy Distance." Ph.D. thesis, Bowling Green State University. Li, S. and Rizzo, M. L. (2017). "K-groups: A Generalization of K-means Clustering". ArXiv e-print 1711.04359. https://arxiv.org/abs/1711.04359 Szekely, G. J., and M. L. Rizzo. "Testing for equal distributions in high dimension." InterStat 5, no. 16.10 (2004). Rizzo, M. L., and G. J. Szekely. "Disco analysis: A nonparametric extension of analysis of variance." The Annals of Applied Statistics (2010): 1034-1055. Hartigan, J. A. and Wong, M. A. (1979). "Algorithm AS 136: A K-means clustering algorithm." Applied Statistics, 28, 100-108. doi: 10.2307/2346830. 
} \examples{ x <- as.matrix(iris[ ,1:4]) set.seed(123) kg <- kgroups(x, k = 3, iter.max = 5, nstart = 2) kg fitted(kg) \donttest{ d <- dist(x) set.seed(123) kg <- kgroups(d, k = 3, iter.max = 5, nstart = 2) kg kg$cluster fitted(kg) fitted(kg, method = "groups") } } \keyword{ cluster } \keyword{ multivariate } energy/man/mutualIndep.Rd0000644000176200001440000000400014266252605015076 0ustar liggesusers\name{mutual independence} \alias{mutualIndep.test} \title{ Energy Test of Mutual Independence} \description{ The test statistic is the sum of d-1 bias-corrected squared dcor statistics where the number of variables is d. Implementation is by permuation test. } \usage{ mutualIndep.test(x, R) } \arguments{ \item{x}{ data matrix or data frame} \item{R}{ number of permutation replicates} } \details{ A population coefficient for mutual independence of d random variables, \eqn{d \geq 2}, is \deqn{ \sum_{k=1}^{d-1} \mathcal R^2(X_k, [X_{k+1},\dots,X_d]). } which is non-negative and equals zero iff mutual independence holds. For example, if d=4 the population coefficient is \deqn{ \mathcal R^2(X_1, [X_2,X_3,X_4]) + \mathcal R^2(X_2, [X_3,X_4]) + \mathcal R^2(X_3, X_4), } A permutation test is implemented based on the corresponding sample coefficient. To test mutual independence of \deqn{X_1,\dots,X_d} the test statistic is the sum of the d-1 statistics (bias-corrected \eqn{dcor^2} statistics): \deqn{\sum_{k=1}^{d-1} \mathcal R_n^*(X_k, [X_{k+1},\dots,X_d])}. } \value{ \code{mutualIndep.test} returns an object of class \code{power.htest}. } \note{ See Szekely and Rizzo (2014) for details on unbiased \eqn{dCov^2} and bias-corrected \eqn{dCor^2} (\code{bcdcor}) statistics. } \seealso{ \code{\link{bcdcor}}, \code{\link{dcovU_stats}} } \references{ Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007), Measuring and Testing Dependence by Correlation of Distances, \emph{Annals of Statistics}, Vol. 35 No. 6, pp. 2769-2794. \cr \doi{10.1214/009053607000000505} Szekely, G.J. 
and Rizzo, M.L. (2014), Partial Distance Correlation with Methods for Dissimilarities. \emph{Annals of Statistics}, Vol. 42 No. 6, 2382-2412. } \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and Gabor J. Szekely } \examples{ x <- matrix(rnorm(100), nrow=20, ncol=5) mutualIndep.test(x, 199) } \keyword{ multivariate } \concept{ independence } \concept{ distance correlation } \concept{ distance covariance } \concept{ energy statistics } energy/man/dmatrix.Rd0000644000176200001440000000332214300431253014250 0ustar liggesusers\name{Distance Matrix} \alias{is.dmatrix} \alias{calc_dist} \title{ Distance Matrices } \description{ Utilities for working with distance matrices. \code{is.dmatrix} is a utility that checks whether the argument is a distance or dissimilarity matrix; is it square symmetric, non-negative, with zero diagonal? \code{calc_dist} computes a distance matrix directly from a data matrix. } \usage{ is.dmatrix(x, tol = 100 * .Machine$double.eps) calc_dist(x) } \arguments{ \item{x}{ numeric matrix} \item{tol}{ tolerance for checking required conditions} } \details{ Energy functions work with the distance matrices of samples. The \code{is.dmatrix} function is used internally when converting arguments to distance matrices. The default \code{tol} is the same as default tolerance of \code{isSymmetric}. \code{calc_dist} is an exported Rcpp function that returns a Euclidean distance matrix from the input data matrix. } \value{ \code{is.dmatrix} returns TRUE if (within tolerance) \code{x} is a distance/dissimilarity matrix; otherwise FALSE. It will return FALSE if \code{x} is a class \code{dist} object. \code{calc_dist} returns the Euclidean distance matrix for the data matrix \code{x}, which has observations in rows. } \note{ In practice, if \code{dist(x)} is not yet computed, \code{calc_dist(x)} will be faster than \code{as.matrix(dist(x))}. On working with non-Euclidean dissimilarities, see the references. 
} \examples{ x <- matrix(rnorm(20), 10, 2) D <- calc_dist(x) is.dmatrix(D) is.dmatrix(cov(x)) } \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} } \references{ Szekely, G.J. and Rizzo, M.L. (2014), Partial Distance Correlation with Methods for Dissimilarities. \emph{Annals of Statistics}, Vol. 42 No. 6, 2382-2412. } energy/man/dcovu.Rd0000644000176200001440000000461414251434573013742 0ustar liggesusers\name{Unbiased distance covariance} \alias{bcdcor} \alias{dcovU} \title{Unbiased dcov and bias-corrected dcor statistics} \description{ These functions compute unbiased estimators of squared distance covariance and a bias-corrected estimator of (squared) distance correlation. } \usage{ bcdcor(x, y) dcovU(x, y) } \arguments{ \item{x}{ data or dist object of first sample} \item{y}{ data or dist object of second sample} } \details{ The unbiased (squared) dcov is inner product definition of dCov, in the Hilbert space of U-centered distance matrices. The sample sizes (number of rows) of the two samples must agree, and samples must not contain missing values. Argument types supported are numeric data matrix, data.frame, or tibble, with observations in rows; numeric vector; ordered or unordered factors. In case of unordered factors a 0-1 distance matrix is computed. } \value{ \code{dcovU} returns the unbiased estimator of squared dcov. \code{bcdcor} returns a bias-corrected estimator of squared dcor. } \note{ Unbiased distance covariance (SR2014) corresponds to the biased (original) \eqn{\mathrm{dCov^2}}{dCov^2}. Since \code{dcovU} is an unbiased statistic, it is signed and we do not take the square root. For the original distance covariance test of independence (SRB2007, SR2009), the distance covariance test statistic is the V-statistic \eqn{\mathrm{n\, dCov^2} = n \mathcal{V}_n^2}{n V_n^2} (not dCov). Similarly, \code{bcdcor} is bias-corrected, so we do not take the square root as with dCor. } \references{ Szekely, G.J. and Rizzo, M.L. 
(2014), Partial Distance Correlation with Methods for Dissimilarities. \emph{Annals of Statistics}, Vol. 42 No. 6, 2382-2412. Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007), Measuring and Testing Dependence by Correlation of Distances, \emph{Annals of Statistics}, Vol. 35 No. 6, pp. 2769-2794. \cr \doi{10.1214/009053607000000505} Szekely, G.J. and Rizzo, M.L. (2009), Brownian Distance Covariance, \emph{Annals of Applied Statistics}, Vol. 3, No. 4, 1236-1265. \cr \doi{10.1214/09-AOAS312} } \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and Gabor J. Szekely } \examples{ x <- iris[1:50, 1:4] y <- iris[51:100, 1:4] dcovU(x, y) bcdcor(x, y) } \keyword{ multivariate } \keyword{ nonparametric } \concept{ independence } \concept{ multivariate } \concept{ distance correlation } \concept{ distance covariance } \concept{ energy statistics } energy/man/dcov.test.Rd0000644000176200001440000001237214250476236014534 0ustar liggesusers\name{dcov.test} \alias{distance covariance} \alias{dcov.test} \alias{dcor.test} \title{ Distance Covariance Test and Distance Correlation test} \description{ Distance covariance test and distance correlation test of multivariate independence. Distance covariance and distance correlation are multivariate measures of dependence.} \usage{ dcov.test(x, y, index = 1.0, R = NULL) dcor.test(x, y, index = 1.0, R) } \arguments{ \item{x}{ data or distances of first sample} \item{y}{ data or distances of second sample} \item{R}{ number of replicates} \item{index}{ exponent on Euclidean distance, in (0,2]} } \details{ \code{dcov.test} and \code{dcor.test} are nonparametric tests of multivariate independence. The test decision is obtained via permutation bootstrap, with \code{R} replicates. The sample sizes (number of rows) of the two samples must agree, and samples must not contain missing values. The \code{index} is an optional exponent on Euclidean distance. Valid exponents for energy are in (0, 2) excluding 2. 
Argument types supported are numeric data matrix, data.frame, or tibble, with observations in rows; numeric vector; ordered or unordered factors. In case of unordered factors a 0-1 distance matrix is computed. Optionally pre-computed distances can be input as class "dist" objects or as distance matrices. For data types of arguments, distance matrices are computed internally. The \code{dcov} test statistic is \eqn{n \mathcal V_n^2}{nV_n^2} where \eqn{\mathcal V_n(x,y)}{V_n(x,y)} = dcov(x,y), which is based on interpoint Euclidean distances \eqn{\|x_{i}-x_{j}\|}{||x_{i}-x_{j}||}. The \code{index} is an optional exponent on Euclidean distance. Similarly, the \code{dcor} test statistic is based on the normalized coefficient, the distance correlation. (See the manual page for \code{dcor}.) Distance correlation is a new measure of dependence between random vectors introduced by Szekely, Rizzo, and Bakirov (2007). For all distributions with finite first moments, distance correlation \eqn{\mathcal R}{R} generalizes the idea of correlation in two fundamental ways: (1) \eqn{\mathcal R(X,Y)}{R(X,Y)} is defined for \eqn{X} and \eqn{Y} in arbitrary dimension. (2) \eqn{\mathcal R(X,Y)=0}{R(X,Y)=0} characterizes independence of \eqn{X} and \eqn{Y}. 
Characterization (2) also holds for powers of Euclidean distance \eqn{\|x_i-x_j\|^s}{|x_i-x_j|^s}, where \eqn{0= 0.12.6), stats, boot, gsl LinkingTo: Rcpp Suggests: MASS, CompQuadForm Depends: R (>= 3.1) URL: https://github.com/mariarizzo/energy License: GPL (>= 2) LazyData: true NeedsCompilation: yes Repository: CRAN Packaged: 2022-12-21 16:56:06 UTC; maria Author: Maria Rizzo [aut, cre], Gabor Szekely [aut] Maintainer: Maria Rizzo Date/Publication: 2022-12-22 09:10:02 UTC energy/build/0000755000176200001440000000000014350635242012646 5ustar liggesusersenergy/build/partial.rdb0000644000176200001440000000007514350635242014775 0ustar liggesusersb```b`abb`b1 H020piּb C".X7energy/src/0000755000176200001440000000000014350635242012336 5ustar liggesusersenergy/src/U-product.cpp0000644000176200001440000000072014246144107014722 0ustar liggesusers#include using namespace Rcpp; /* Author: Maria L. Rizzo energy package github.com/mariarizzo/energy */ // [[Rcpp::export]] double U_product(NumericMatrix U, NumericMatrix V) { // U and V are U-centered dissimilarity matrices of the two samples int n = U.nrow(); int i, j; double sums = 0.0; for (i = 0; i < n; i++) for (j=0; j #include void ksampleEtest(double *x, int *byrow, int *nsamples, int *sizes, int *dim, int *R, double *e0, double *e, double *pval); void E2sample(double *x, int *sizes, int *dim, double *stat); double edist(double **D, int m, int n); double multisampleE(double **D, int nsamples, int *sizes, int *perm); double twosampleE(double **D, int m, int n, int *xrows, int *yrows); double E2(double **x, int *sizes, int *start, int ncol, int *perm); double Eksample(double *x, int *byrow, int r, int d, int K, int *sizes, int *ix); void distance(double **bxy, double **D, int N, int d); /* utilities.c */ extern double **alloc_matrix(int r, int c); extern int **alloc_int_matrix(int r, int c); extern void free_matrix(double **matrix, int r, int c); extern void free_int_matrix(int **matrix, int r, int c); extern void 
permute(int *J, int n); extern void roworder(double *x, int *byrow, int r, int c); extern void vector2matrix(double *x, double **y, int N, int d, int isroworder); extern void distance(double **bxy, double **D, int N, int d); extern void Euclidean_distance(double *x, double **Dx, int n, int d); extern void index_distance(double *x, double **Dx, int n, int d, double index); extern void sumdist(double *x, int *byrow, int *nrow, int *ncol, double *lowersum); void E2sample(double *x, int *sizes, int *dim, double *stat) { /* compute test statistic *stat for testing H:F=G does not store distance matrix x must be in row order: x=as.double(t(x)) where x is pooled sample in matrix sum(en) by dim */ int m=sizes[0], n=sizes[1], d=(*dim); int i, j, k, p, q; double dif, dsum, sumxx, sumxy, sumyy, w; sumxy = 0.0; for (i=0; i 0) { data = alloc_matrix(N, d); /* sample matrix */ vector2matrix(x, data, N, d, *byrow); distance(data, D, N, d); free_matrix(data, N, d); } else vector2matrix(x, D, N, N, *byrow); *e0 = multisampleE(D, K, sizes, perm); /* bootstrap */ if (B > 0) { ek = 0; GetRNGstate(); for (b=0; b using namespace Rcpp; /* Author: Maria L. Rizzo energy package github.com/mariarizzo/energy */ // [[Rcpp::export]] NumericMatrix calc_dist(NumericMatrix x) { int n = x.nrow(), d = x.ncol(), i, j, k; double dsum, dk; NumericMatrix Dx(n, n); for (i = 0; i < n; i++) { for (j = i; j < n; j++) { if (i == j) { Dx(i, i) = 0.0; } else { dsum = 0.0; for (k = 0; k < d; k++) { dk = x(i,k) - x(j,k); dsum += dk * dk; } Dx(i, j) = sqrt(dsum); Dx(j, i) = Dx(i, j); } } } return Dx; } energy/src/energy_init.c0000644000176200001440000000500514251416071015013 0ustar liggesusers#include #include #include // for NULL #include /* Author: Maria L. 
Rizzo energy package github.com/mariarizzo/energy */ /* declarations to register native routines in this package */ /* .C calls */ extern void dCOV(void *, void *, void *, void *); extern void dCOVtest(void *, void *, void *, void *, void *, void *, void *); extern void indepE(void *, void *, void *, void *, void *); extern void indepEtest(void *, void *, void *, void *, void *, void *, void *); extern void ksampleEtest(void *, void *, void *, void *, void *, void *, void *, void *, void *); extern void permute_check(void *, void *); /* .Call calls */ extern SEXP _energy_D_center(SEXP); extern SEXP _energy_dcovU_stats(SEXP, SEXP); extern SEXP _energy_partial_dcor(SEXP, SEXP, SEXP); extern SEXP _energy_partial_dcov(SEXP, SEXP, SEXP); extern SEXP _energy_poisMstat(SEXP); extern SEXP _energy_projection(SEXP, SEXP); extern SEXP _energy_U_center(SEXP); extern SEXP _energy_U_product(SEXP, SEXP); extern SEXP _energy_Btree_sum(SEXP, SEXP); extern SEXP _energy_kgroups_start(SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP _energy_calc_dist(SEXP); extern SEXP _energy_dCov2(SEXP, SEXP, SEXP); extern SEXP _energy_dCov2stats(SEXP, SEXP, SEXP); static const R_CMethodDef CEntries[] = { {"dCOV", (DL_FUNC) &dCOV, 4}, {"dCOVtest", (DL_FUNC) &dCOVtest, 7}, {"indepE", (DL_FUNC) &indepE, 5}, {"indepEtest", (DL_FUNC) &indepEtest, 7}, {"ksampleEtest", (DL_FUNC) &ksampleEtest, 9}, {"permute_check",(DL_FUNC) &permute_check,2}, {NULL, NULL, 0} }; static const R_CallMethodDef CallEntries[] = { {"_energy_D_center", (DL_FUNC) &_energy_D_center, 1}, {"_energy_dcovU_stats", (DL_FUNC) &_energy_dcovU_stats, 2}, {"_energy_partial_dcor", (DL_FUNC) &_energy_partial_dcor, 3}, {"_energy_partial_dcov", (DL_FUNC) &_energy_partial_dcov, 3}, {"_energy_poisMstat", (DL_FUNC) &_energy_poisMstat, 1}, {"_energy_projection", (DL_FUNC) &_energy_projection, 2}, {"_energy_U_center", (DL_FUNC) &_energy_U_center, 1}, {"_energy_U_product", (DL_FUNC) &_energy_U_product, 2}, {"_energy_Btree_sum", (DL_FUNC) &_energy_Btree_sum, 
2}, {"_energy_kgroups_start", (DL_FUNC) &_energy_kgroups_start, 5}, {"_energy_calc_dist", (DL_FUNC) &_energy_calc_dist, 1}, {NULL, NULL, 0} }; void R_init_energy(DllInfo *dll) { R_registerRoutines(dll, CEntries, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); } energy/src/dcovU.cpp0000644000176200001440000000200514246144104014113 0ustar liggesusers#include using namespace Rcpp; // Author: Maria L. Rizzo // energy package // github.com/mariarizzo/energy NumericMatrix U_center(NumericMatrix); //[[Rcpp::export]] NumericVector dcovU_stats(NumericMatrix Dx, NumericMatrix Dy) { // x and y must be square distance matrices NumericMatrix A = U_center(Dx); NumericMatrix B = U_center(Dy); double ab = 0.0, aa = 0.0, bb = 0.0; double V, dcorU = 0.0; double eps = std::numeric_limits::epsilon(); //machine epsilon int n = Dx.nrow(); int n2 = n * (n - 3); for (int i=0; i eps) dcorU = ab / sqrt(V); return NumericVector::create( _["dCovU"] = ab, _["bcdcor"] = dcorU, _["dVarXU"] = aa, _["dVarYU"] = bb ); } energy/src/dcov.c0000644000176200001440000001242314251415574013443 0ustar liggesusers/* dcov.c: distance correlation and covariance statistics and dCov test for multivariate independence Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007) "Measuring and testing dependence by correlation of distances" Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794. Author: Maria L. 
Rizzo energy package github.com/mariarizzo/energy */ #include #include void dCOVtest(double *x, double *y, int *nrow, int *nreps, double *reps, double *DCOV, double *pval); void dCOV(double *x, double *y, int *nrow, double *DCOV); double Akl(double **akl, double **A, int n); /* functions in utilities.c */ extern double **alloc_matrix(int r, int c); extern int **alloc_int_matrix(int r, int c); extern void free_matrix(double **matrix, int r, int c); extern void free_int_matrix(int **matrix, int r, int c); extern void permute(int *J, int n); extern void roworder(double *x, int *byrow, int r, int c); extern void Euclidean_distance(double *x, double **Dx, int n, int d); extern void index_distance(double **Dx, int n, double index); extern void vector2matrix(double *x, double **y, int N, int d, int isroworder); void dCOVtest(double *x, double *y, int *nrow, int *nreps, double *reps, double *DCOV, double *pval) { /* input vectors must expand to distance matrices any exponent must be pre-computed in R computes dCov(x,y), dCor(x,y), dVar(x), dVar(y) V-statistic is n*dCov^2 where n*dCov^2 --> Q DCOV : vector [dCov, dCor, dVar(x), dVar(y), mean(A), mean(B)] */ int i, j, k, r, J, K, M; int n = nrow[0], R = nreps[0]; int* perm; double **Dx, **Dy, **A, **B; double dcov, V; double n2 = (double) n * n; Dx = alloc_matrix(n, n); Dy = alloc_matrix(n, n); vector2matrix(x, Dx, n, n, 1); vector2matrix(y, Dy, n, n, 1); A = alloc_matrix(n, n); B = alloc_matrix(n, n); Akl(Dx, A, n); Akl(Dy, B, n); free_matrix(Dx, n, n); free_matrix(Dy, n, n); /* compute dCov(x,y), dVar(x), dVar(y) */ for (k=0; k<4; k++) DCOV[k] = 0.0; for (k=0; k 0) DCOV[k] = sqrt(DCOV[k]); else DCOV[k] = 0.0; } /* compute dCor(x, y) */ V = DCOV[2]*DCOV[3]; if (V > DBL_EPSILON) DCOV[1] = DCOV[0] / sqrt(V); else DCOV[1] = 0.0; if (R > 0) { /* compute the replicates */ if (DCOV[1] > 0.0) { perm = Calloc(n, int); M = 0; for (i=0; i= DCOV[0]) M++; } *pval = (double) (M+1) / (double) (R+1); PutRNGstate(); Free(perm); } else { 
*pval = 1.0; } } free_matrix(A, n, n); free_matrix(B, n, n); return; } void dCOV(double *x, double *y, int *nrow, double *DCOV) { /* input vectors must expand to distance matrices any exponent must be pre-computed in R computes dCov(x,y), dCor(x,y), dVar(x), dVar(y) V-statistic is n*dCov^2 where n*dCov^2 --> Q DCOV : vector [dCov, dCor, dVar(x), dVar(y)] */ int j, k, n = nrow[0]; double **Dx, **Dy, **A, **B; double V, n2 = (double) n * n; Dx = alloc_matrix(n, n); Dy = alloc_matrix(n, n); vector2matrix(x, Dx, n, n, 1); vector2matrix(y, Dy, n, n, 1); A = alloc_matrix(n, n); B = alloc_matrix(n, n); Akl(Dx, A, n); Akl(Dy, B, n); free_matrix(Dx, n, n); free_matrix(Dy, n, n); n2 = ((double) n) * n; /* compute dCov(x,y), dVar(x), dVar(y) */ for (k=0; k<4; k++) DCOV[k] = 0.0; for (k=0; k 0) DCOV[k] = sqrt(DCOV[k]); else DCOV[k] = 0.0; } /* compute dCor(x, y) */ V = DCOV[2]*DCOV[3]; if (V > DBL_EPSILON) DCOV[1] = DCOV[0] / sqrt(V); else DCOV[1] = 0.0; free_matrix(A, n, n); free_matrix(B, n, n); return; } double Akl(double **akl, double **A, int n) { /* -computes the A_{kl} or B_{kl} distances from the distance matrix (a_{kl}) or (b_{kl}) for dCov, dCor, dVar dCov = mean(Akl*Bkl), dVar(X) = mean(Akl^2), etc. */ int j, k; double *akbar; double abar; akbar = Calloc(n, double); abar = 0.0; for (k=0; k using namespace Rcpp; /* Author: Maria L. Rizzo energy package github.com/mariarizzo/energy */ NumericMatrix U_center(NumericMatrix); double U_product(NumericMatrix, NumericMatrix); // [[Rcpp::export]] NumericMatrix projection(NumericMatrix Dx, NumericMatrix Dz) { /* returns the projection of A(x) distance matrix Dx onto the orthogonal complement of C(z) distance matrix; both Dx and Dz are n by n distance or dissimilarity matrices the projection is an n by n matrix */ int n = Dx.nrow(); int i, j; NumericMatrix A(n, n), C(n, n), P(n, n); double AC, CC, c1; double eps = std::numeric_limits::epsilon(); //machine epsilon A = U_center(Dx); // U-centering to get A^U etc. 
C = U_center(Dz); AC = U_product(A, C); // (A,C) = dcov^U CC = U_product(C, C); c1 = 0.0; // if (C,C)==0 then C==0 so c1=(A,C)=0 if (fabs(CC) > eps) c1 = AC / CC; for (i=0; i using namespace Rcpp; /* Author: Maria L. Rizzo energy package github.com/mariarizzo/energy */ int kgroups_update(NumericMatrix x, int k, IntegerVector clus, IntegerVector sizes, NumericVector within, bool distance); List kgroups_start(NumericMatrix x, int k, IntegerVector clus, int iter_max, bool distance); int kgroups_update(NumericMatrix x, int k, IntegerVector clus, IntegerVector sizes, NumericVector w, bool distance) { /* * k-groups one pass through sample moving one point at a time * x: data matrix or distance * k: number of clusters * clus: clustering vector clus(i)==j ==> x_i is in cluster j * sizes: cluster sizes * within: vector of within cluster dispersions * distance: true if x is distance matrix * update clus, sizes, and withins * return count = number of points moved */ int n = x.nrow(), d = x.ncol(); int i, j, I, J, ix, nI, nJ; NumericVector rowdst(k), e(k); int best, count = 0; double dsum, dif; for (ix = 0; ix < n; ix++) { I = clus(ix); nI = sizes(I); if (nI > 1) { // calculate the E-distances of this point to each cluster rowdst.fill(0.0); for (i = 0; i < n; i++) { J = clus(i); if (distance == true) { rowdst(J) += x(ix, i); } else { dsum = 0.0; for (j = 0; j < d; j++) { dif = x(ix, j) - x(i, j); dsum += dif * dif; } rowdst(J) += sqrt(dsum); } } for (J = 0; J < k; J++) { nJ = sizes(J); e(J) = (2.0 / (double) nJ) * (rowdst(J) - w(J)); } best = Rcpp::which_min(e); if (best != I) { // move this point and update nI = sizes(I); nJ = sizes(best); w(best) = (((double) nJ) * w(best) + rowdst(best)) / ((double) (nJ + 1)); w(I) = (((double) nI) * w(I) - rowdst(I)) / ((double) (nI - 1)); clus(ix) = best; sizes(I) = nI - 1; sizes(best) = nJ + 1; count ++; // number of moves } } } return count; } // [[Rcpp::export]] List kgroups_start(NumericMatrix x, int k, IntegerVector clus, int 
iter_max, bool distance) { // k-groups clustering with initial clustering vector clus // up to iter_max iterations of n possible moves each // distance: true if x is distance matrix NumericVector within(k, 0.0); IntegerVector sizes(k, 0); double dif, dsum; int I, J, h, i, j; int n = x.nrow(), d = x.ncol(); for (i = 0; i < n; i++) { I = clus(i); sizes(I)++; for (j = 0; j < i; j++) { J = clus(j); if (I == J) { if (distance == true) { within(I) += x(i, j); } else { dsum = 0.0; for (h = 0; h < d; h++) { dif = x(i, h) - x(j, h); dsum += dif * dif; } within(I) += sqrt(dsum); } } } } for (I = 0; I < k; I++) within(I) /= ((double) sizes(I)); int it = 1, count = 1; count = kgroups_update(x, k, clus, sizes, within, distance); while (it < iter_max && count > 0) { count = kgroups_update(x, k, clus, sizes, within, distance); it++; } double W = Rcpp::sum(within); return List::create( _["within"] = within, _["W"] = W, _["sizes"] = sizes, _["cluster"] = clus, _["iterations"] = it, _["count"] = count); } energy/src/centering.cpp0000644000176200001440000000334514246144104015021 0ustar liggesusers// double centering utilities for the energy package // // Author: Maria L. Rizzo // energy package // github.com/mariarizzo/energy #include using namespace Rcpp; NumericMatrix D_center(NumericMatrix Dx); NumericMatrix U_center(NumericMatrix Dx); // [[Rcpp::export]] NumericMatrix D_center(NumericMatrix Dx) { /* computes the double centered distance matrix for distance matrix Dx for dCov, dCor, etc. a_{ij} - a_{i.}/n - a_{.j}/n + a_{..}/n^2, all i, j */ int j, k; int n = Dx.nrow(); NumericVector akbar(n); NumericMatrix A(n, n); double abar = 0.0; for (k=0; k using namespace Rcpp; // Author: Maria L. 
Rizzo // energy package // github.com/mariarizzo/energy // compute partial sum using binary search algorithm like AVL // pre-compute powers of two to save repeated calculations IntegerVector containerNodes (int y, IntegerVector pwr2, IntegerVector psum); NumericVector gamma1_direct(IntegerVector y, NumericVector z); IntegerVector p2sum(IntegerVector pwr2); IntegerVector powers2 (int L); NumericVector rowsumsDist(NumericVector x, NumericVector sorted, IntegerVector ranks); IntegerVector subNodes (int y, IntegerVector pwr2, IntegerVector psum); // [[Rcpp::export]] NumericVector Btree_sum (IntegerVector y, NumericVector z) { // // y is a permutation of the integers 1:n // z is a numeric vector of length n // compute gamma1(i) = sum(j 0) gamma1(i) += sums(node); } } return gamma1; } IntegerVector containerNodes (int y, IntegerVector pwr2, IntegerVector psum) { /* * get the indices of all nodes of binary tree whose closed * intervals contain integer y */ int i, L = pwr2.length(); IntegerVector nodes(L); nodes(0) = y; for (i = 0; i < L-1; i++) { nodes(i+1) = ceil((double) y / pwr2(i)) + psum(i); } return nodes; } IntegerVector subNodes (int y, IntegerVector pwr2, IntegerVector psum) { /* * get indices of nodes whose intervals disjoint union is 1:y */ int L = psum.length(); int idx, k, level, p2; IntegerVector nodes(L); std::fill(nodes.begin(), nodes.end(), -1L); k = y; for (level = L - 1; level > 0; level --) { p2 = pwr2(level - 1); if (k >= p2) { // at index of left node plus an offset idx = psum(level - 1) + (y / p2); nodes(L - level - 1) = idx; k -= p2; } } if (k > 0) nodes(L - 1) = y; return nodes; } IntegerVector powers2 (int L) { // (2, 4, 8, ..., 2^L, 2^(L+1)) int k; IntegerVector pwr2(L); pwr2(0) = 2; for (k = 1; k < L; k++) pwr2(k) = pwr2(k-1) * 2; return pwr2; } IntegerVector p2sum(IntegerVector pwr2) { // computes the cumsum of 2^L, 2^(L-1), ..., 2^2, 2 int i, L = pwr2.length(); IntegerVector psum(L); std::fill(psum.begin(), psum.end(), pwr2(L-1)); for (i = 1; 
i < L; i++) psum(i) = psum(i-1) + pwr2(L-i-1); return psum; } NumericVector gamma1_direct(IntegerVector y, NumericVector z) { // utility: direct computation of the sum gamm1 // for the purpose of testing and benchmarks int n = y.length(); int i, j; NumericVector gamma1(n); for (i = 1; i < n; i++) { for (j = 0; j < i; j++) { if (y(j) < y(i)) { gamma1(i) += z(j); } } } return gamma1; } energy/src/partial-dcor.cpp0000644000176200001440000000534414246144104015425 0ustar liggesusers#include using namespace Rcpp; /* Author: Maria L. Rizzo energy package github.com/mariarizzo/energy */ NumericVector partial_dcor(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz); double partial_dcov(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz); NumericMatrix U_center(NumericMatrix); double U_product(NumericMatrix U, NumericMatrix V); NumericMatrix projection(NumericMatrix Dx, NumericMatrix Dz); // [[Rcpp::export]] NumericVector partial_dcor(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz) { /* partial distance correlation, second formulation Dx, Dy, Dz are symmetric distance or dissimilarity matrices with zero diagonals partial_dcor : vector length 4, partial_dcor[0] is pdcor partial_dcor returns vector [Rxyz, Rxy, Rxz, Ryz] starred versions */ int n = Dx.nrow(); NumericMatrix A(n, n), B(n, n), C(n, n); double Rxy=0.0, Rxz=0.0, Ryz=0.0, Rxyz=0.0, den; double AB, AC, BC, AA, BB, CC, pDCOV; double eps = std::numeric_limits::epsilon(); //machine epsilon A = U_center(Dx); /* U-centering to get A^U etc. 
*/ B = U_center(Dy); C = U_center(Dz); AB = U_product(A, B); AC = U_product(A, C); BC = U_product(B, C); AA = U_product(A, A); BB = U_product(B, B); CC = U_product(C, C); pDCOV = U_product(projection(Dx, Dz), projection(Dy, Dz)); den = sqrt(AA*BB); if (den > eps) Rxy = AB / den; den = sqrt(AA*CC); if (den > eps) Rxz = AC / den; den = sqrt(BB*CC); if (den > eps) Ryz = BC / den; den = sqrt(1 - Rxz*Rxz) * sqrt(1 - Ryz * Ryz); if (den > eps) Rxyz = (Rxy - Rxz * Ryz) / den; else { Rxyz = 0.0; } return NumericVector::create( _["pdcor"] = Rxyz, _["pdcov"] = pDCOV, _["Rxy"] = Rxy, _["Rxz"] = Rxz, _["Ryz"] = Ryz ); } //[[Rcpp::export]] double partial_dcov(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz) { /* pdcov following the definition via projections Dx, Dy, Dz are symmetric distance or dissimilarity matrices with zero diagonals returns pdcov sample coefficient */ int n = Dx.nrow(); int i, j; NumericMatrix A(n, n), B(n, n), C(n, n), Pxz(n, n), Pyz(n, n); double AC, BC, CC, c1, c2; double eps = std::numeric_limits::epsilon(); //machine epsilon A = U_center(Dx); /* U-centering to get A^U etc. 
*/ B = U_center(Dy); C = U_center(Dz); AC = U_product(A, C); BC = U_product(B, C); CC = U_product(C, C); c1 = c2 = 0.0; // if (C,C)==0 then C=0 and both (A,C)=0 and (B,C)=0 if (fabs(CC) > eps) { c1 = AC / CC; c2 = BC / CC; } for (i=0; i #include void indepE(double *x, double *y, int *byrow, int *dims, double *Istat); void indepEtest(double *x, double *y, int *byrow, int *dims, double *Istat, double *reps, double *pval); void squared_distance(double *x, double **D, int n, int d); extern double **alloc_matrix(int r, int c); extern int **alloc_int_matrix(int r, int c); extern void free_matrix(double **matrix, int r, int c); extern void free_int_matrix(int **matrix, int r, int c); extern void permute(int *J, int n); extern void roworder(double *x, int *byrow, int r, int c); extern void Euclidean_distance(double *x, double **D, int n, int d); void indepE(double *x, double *y, int *byrow, int *dims, double *Istat) { /* E statistic for multiv. indep. of X in R^p and Y in R^q statistic returned is I_n^2 [nI_n^2 has a limit dist under indep] dims[0] = n (sample size) dims[1] = p (dimension of X) dims[2] = q (dimension of Y) Istat : the statistic I_n (normalized) */ int i, j, k, m, n, p, q; double Cx, Cy, Cz, C3, C4, n2, n3, n4, v; double **D2x, **D2y; n = dims[0]; p = dims[1]; q = dims[2]; if (*byrow == FALSE) { /* avoid this step: use as.double(t(x)) in R */ roworder(x, byrow, n, p); *byrow = FALSE; /* false for y */ roworder(y, byrow, n, q); } D2x = alloc_matrix(n, n); D2y = alloc_matrix(n, n); Euclidean_distance(x, D2x, n, p); Euclidean_distance(y, D2y, n, q); Cx = Cy = Cz = C3 = C4 = 0.0; n2 = ((double) n) * n; n3 = n2 * n; n4 = n2 * n2; /* compute observed test statistic */ for (i=0; i Q dims[0] = n (sample size) dims[1] = p (dimension of X) dims[2] = q (dimension of Y) dims[3] = B (number of replicates, dimension of reps) Istat : the statistic I_n (normalized) */ int b, i, j, k, m, n, p, q, B, M; int *perm; double Cx, Cy, Cz, C3, C4, n2, n3, n4, v; double **D2x, 
**D2y; n = dims[0]; p = dims[1]; q = dims[2]; B = dims[3]; if (*byrow == FALSE) { /* avoid this step: use as.double(t(x)) in R */ roworder(x, byrow, n, p); *byrow = FALSE; /* false for y */ roworder(y, byrow, n, q); } D2x = alloc_matrix(n, n); D2y = alloc_matrix(n, n); squared_distance(x, D2x, n, p); squared_distance(y, D2y, n, q); Cx = Cy = Cz = C3 = C4 = 0.0; n2 = ((double) n) * n; n3 = n2 * n; n4 = n2 * n2; /* compute observed test statistic */ for (i=0; i 0) { GetRNGstate(); perm = Calloc(n, int); for (i=0; i= (*Istat)) M++; } *pval = (double) M / (double) B; PutRNGstate(); Free(perm); } free_matrix(D2x, n, n); free_matrix(D2y, n, n); return; } void squared_distance(double *x, double **D2, int n, int d) { /* interpret x as an n by d matrix, in row order (n vectors in R^d) compute the squared distance matrix D2 */ int i, j, k, p, q; double dsum, dif; for (i=1; i using namespace Rcpp; /* Author: Maria L. Rizzo energy package github.com/mariarizzo/energy */ // [[Rcpp::export(.poisMstat)]] NumericVector poisMstat(IntegerVector x) { /* computes the Poisson mean distance statistic */ int i, j, k, n=x.size(); double eps=1.0e-10; double ad, cvm, d, lambda, m, q; double Mcdf1, Mcdf0, Mpdf1, cdf1, cdf0; NumericVector stats(2); lambda = mean(x); q = R::qpois(1.0-eps, lambda, TRUE, FALSE) + 1; m = 0.0; for (j=0; j 1) Mcdf1 = 1.0; cdf1 = R::ppois(i, lambda, TRUE, FALSE); /* MLE of F(i) */ d = Mcdf1 - cdf1; cvm += d * d * (cdf1 - cdf0); ad += d * d * (cdf1 - cdf0) / (cdf1 * (1-cdf1)); cdf0 = cdf1; Mcdf0 = Mcdf1; } cvm *= n; ad *= n; stats(0) = cvm; stats(1) = ad; return stats; } energy/src/utilities.c0000644000176200001440000001374314250370717014527 0ustar liggesusers/* utilities.c: some utilities for the energy package Author: Maria L. 
Rizzo github.com/mariarizzo/energy alloc_matrix, alloc_int_matrix, free_matrix, free_int_matrix: use R (Calloc, Free) instead of C (calloc, free) for memory management permute permutes the first n elements of an integer vector row_order converts arg from column order to row order vector2matrix copies double* arg into double** arg distance computes Euclidean distance matrix from double** Euclidean_distance computes Euclidean distance matrix from double* index_distance computes Euclidean distance matrix D then D^index sumdist sums the distance matrix without creating the matrix Notes: 1. index_distance (declaration and body of the function) revised in energy 1.3-0, 2/2011. */ #include #include double **alloc_matrix(int r, int c); int **alloc_int_matrix(int r, int c); void free_matrix(double **matrix, int r, int c); void free_int_matrix(int **matrix, int r, int c); void permute(int *J, int n); void permute_check(int *J, int *N); void roworder(double *x, int *byrow, int r, int c); void vector2matrix(double *x, double **y, int N, int d, int isroworder); void distance(double **bxy, double **D, int N, int d); void Euclidean_distance(double *x, double **Dx, int n, int d); void index_distance(double **Dx, int n, double index); void sumdist(double *x, int *byrow, int *nrow, int *ncol, double *lowersum); double **alloc_matrix(int r, int c) { /* allocate a matrix with r rows and c columns */ int i; double **matrix; matrix = Calloc(r, double *); for (i = 0; i < r; i++) matrix[i] = Calloc(c, double); return matrix; } int **alloc_int_matrix(int r, int c) { /* allocate an integer matrix with r rows and c columns */ int i; int **matrix; matrix = Calloc(r, int *); for (i = 0; i < r; i++) matrix[i] = Calloc(c, int); return matrix; } void free_matrix(double **matrix, int r, int c) { /* free a matrix with r rows and c columns */ int i; for (i = 0; i < r; i++) Free(matrix[i]); Free(matrix); } void free_int_matrix(int **matrix, int r, int c) { /* free an integer matrix with r rows and c 
columns */ int i; for (i = 0; i < r; i++) Free(matrix[i]); Free(matrix); } void permute(int *J, int n) { /* permute the first n integers of J if n is length(J), returns a permutation vector equal to rev(Rcpp::sample(n, n, false)) */ int i, j, j0, m=n; for (i=0; i DBL_EPSILON) { for (i=0; i do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #include using namespace Rcpp; #ifdef RCPP_USE_GLOBAL_ROSTREAM Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // Btree_sum NumericVector Btree_sum(IntegerVector y, NumericVector z); RcppExport SEXP _energy_Btree_sum(SEXP ySEXP, SEXP zSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< IntegerVector >::type y(ySEXP); Rcpp::traits::input_parameter< NumericVector >::type z(zSEXP); rcpp_result_gen = Rcpp::wrap(Btree_sum(y, z)); return rcpp_result_gen; END_RCPP } // calc_dist NumericMatrix calc_dist(NumericMatrix x); RcppExport SEXP _energy_calc_dist(SEXP xSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type x(xSEXP); rcpp_result_gen = Rcpp::wrap(calc_dist(x)); return rcpp_result_gen; END_RCPP } // U_product double U_product(NumericMatrix U, NumericMatrix V); RcppExport SEXP _energy_U_product(SEXP USEXP, SEXP VSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type U(USEXP); Rcpp::traits::input_parameter< NumericMatrix >::type V(VSEXP); rcpp_result_gen = Rcpp::wrap(U_product(U, V)); return rcpp_result_gen; END_RCPP } // D_center NumericMatrix D_center(NumericMatrix Dx); RcppExport SEXP _energy_D_center(SEXP DxSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); rcpp_result_gen = Rcpp::wrap(D_center(Dx)); return 
rcpp_result_gen; END_RCPP } // U_center NumericMatrix U_center(NumericMatrix Dx); RcppExport SEXP _energy_U_center(SEXP DxSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); rcpp_result_gen = Rcpp::wrap(U_center(Dx)); return rcpp_result_gen; END_RCPP } // dcovU_stats NumericVector dcovU_stats(NumericMatrix Dx, NumericMatrix Dy); RcppExport SEXP _energy_dcovU_stats(SEXP DxSEXP, SEXP DySEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dy(DySEXP); rcpp_result_gen = Rcpp::wrap(dcovU_stats(Dx, Dy)); return rcpp_result_gen; END_RCPP } // kgroups_start List kgroups_start(NumericMatrix x, int k, IntegerVector clus, int iter_max, bool distance); RcppExport SEXP _energy_kgroups_start(SEXP xSEXP, SEXP kSEXP, SEXP clusSEXP, SEXP iter_maxSEXP, SEXP distanceSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type x(xSEXP); Rcpp::traits::input_parameter< int >::type k(kSEXP); Rcpp::traits::input_parameter< IntegerVector >::type clus(clusSEXP); Rcpp::traits::input_parameter< int >::type iter_max(iter_maxSEXP); Rcpp::traits::input_parameter< bool >::type distance(distanceSEXP); rcpp_result_gen = Rcpp::wrap(kgroups_start(x, k, clus, iter_max, distance)); return rcpp_result_gen; END_RCPP } // partial_dcor NumericVector partial_dcor(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz); RcppExport SEXP _energy_partial_dcor(SEXP DxSEXP, SEXP DySEXP, SEXP DzSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dy(DySEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dz(DzSEXP); rcpp_result_gen = 
Rcpp::wrap(partial_dcor(Dx, Dy, Dz)); return rcpp_result_gen; END_RCPP } // partial_dcov double partial_dcov(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz); RcppExport SEXP _energy_partial_dcov(SEXP DxSEXP, SEXP DySEXP, SEXP DzSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dy(DySEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dz(DzSEXP); rcpp_result_gen = Rcpp::wrap(partial_dcov(Dx, Dy, Dz)); return rcpp_result_gen; END_RCPP } // poisMstat NumericVector poisMstat(IntegerVector x); RcppExport SEXP _energy_poisMstat(SEXP xSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< IntegerVector >::type x(xSEXP); rcpp_result_gen = Rcpp::wrap(poisMstat(x)); return rcpp_result_gen; END_RCPP } // projection NumericMatrix projection(NumericMatrix Dx, NumericMatrix Dz); RcppExport SEXP _energy_projection(SEXP DxSEXP, SEXP DzSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dz(DzSEXP); rcpp_result_gen = Rcpp::wrap(projection(Dx, Dz)); return rcpp_result_gen; END_RCPP } energy/R/0000755000176200001440000000000014350635222011746 5ustar liggesusersenergy/R/centering.R0000644000176200001440000000127614005374454014061 0ustar liggesusers## use the Rcpp exported function U_center or D_center ## the utilities in this file are provided for reference and historical reasons Dcenter <- function(x) { ## x is a dist object or data matrix if (!inherits(x, "dist")) x <- dist(x) d <- as.matrix(x) n <- nrow(d) m <- rowSums(d) M <- sum(m) / n^2 m <- m / n a <- sweep(d, 1, m) b <- sweep(a, 2, m) B <- b + M } Ucenter <- function(x) { ## x is a dist object or data matrix if (!inherits(x, "dist")) x <- dist(x) d <- 
as.matrix(x) n <- nrow(d) m <- rowSums(d) M <- sum(m) / ((n-1)*(n-2)) m <- m / (n-2) a <- sweep(d, 1, m) b <- sweep(a, 2, m) B <- b + M diag(B) <- 0 B } energy/R/dcorT.R0000644000176200001440000000432014005374454013147 0ustar liggesusers### dcorT.R ### implementation of the distance correlation t-test ### for high dimension Astar <- function(d) { ## d is a distance matrix or distance object ## modified or corrected doubly centered distance matrices ## denoted A* (or B*) in JMVA t-test paper (2013) if (inherits(d, "dist")) d <- as.matrix(d) n <- nrow(d) if (n != ncol(d)) stop("Argument d should be distance") m <- rowMeans(d) M <- mean(d) a <- sweep(d, 1, m) b <- sweep(a, 2, m) A <- b + M #same as plain A #correction to get A^* A <- A - d/n diag(A) <- m - M (n / (n-1)) * A } BCDCOR <- function(x, y) { ## compute bias corrected distance correlation ## internal function not in NAMESPACE (external: use bcdcor) ## revised version from v. 1.7-7 if (!inherits(x, "dist")) { x <- as.matrix(dist(x)) } else { x <- as.matrix(x) } if (!inherits(y, "dist")) { y <- as.matrix(dist(y)) } else { y <- as.matrix(y) } n <- NROW(x) AA <- Astar(x) BB <- Astar(y) XY <- sum(AA*BB) - (n/(n-2)) * sum(diag(AA*BB)) XX <- sum(AA*AA) - (n/(n-2)) * sum(diag(AA*AA)) YY <- sum(BB*BB) - (n/(n-2)) * sum(diag(BB*BB)) list(bcR=XY / sqrt(XX*YY), XY=XY/n^2, XX=XX/n^2, YY=YY/n^2, n=n) } dcorT <- function(x, y) { # computes the t statistic for corrected high-dim dCor # should be approximately student T # x and y are observed samples or distance objects r <- BCDCOR(x, y) Cn <- r$bcR n <- r$n M <- n*(n-3)/2 sqrt(M-1) * Cn / sqrt(1-Cn^2) } dcorT.test <- function(x, y) { # x and y are observed samples or distance objects dname <- paste(deparse(substitute(x)),"and", deparse(substitute(y))) stats <- BCDCOR(x, y) bcR <- stats$bcR n <- stats$n M <- n * (n-3) / 2 df <- M - 1 names(df) <- "df" tstat <- sqrt(M-1) * bcR / sqrt(1-bcR^2) names(tstat) <- "T" estimate <- bcR names(estimate) <- "Bias corrected dcor" pval <- 
1 - pt(tstat, df=df) method <- "dcor t-test of independence for high dimension" rval <- list(statistic = tstat, parameter = df, p.value = pval, estimate=estimate, method=method, data.name=dname) class(rval) <- "htest" return(rval) } energy/R/dcovu.R0000644000176200001440000000164314251434432013215 0ustar liggesusers## dcovu.R ## unbiased dcov^2 and bias-corrected dcor^2 ## bcdcor <- function(x, y) { ## compute bias corrected distance correlation dcorU(x, y) } dcovU <- function(x, y) { ## unbiased dcov^2 if (!inherits(x, "dist")) x <- dist(x) if (!inherits(y, "dist")) y <- dist(y) x <- as.matrix(x) y <- as.matrix(y) n <- nrow(x) m <- nrow(y) if (n != m) stop("sample sizes must agree") if (! (all(is.finite(c(x, y))))) stop("data contains missing or infinite values") estimates <- dcovU_stats(x, y) #RcppExports return (estimates[1]) } dcorU <- function(x, y) { ## unbiased dcov^2 x <- .arg2dist.matrix(x) y <- .arg2dist.matrix(y) n <- nrow(x) m <- nrow(y) if (n != m) stop("sample sizes must agree") if (! (all(is.finite(c(x, y))))) stop("data contains missing or infinite values") estimates <- dcovU_stats(x, y) #RcppExports return (estimates[2]) } energy/R/dcov2d.R0000644000176200001440000001200014300165200013230 0ustar liggesusersdcor2d<- function(x, y, type = c("V", "U")) { ## computes dcor^2 or bias-corrected dcor^2 by O(n log n) algorithm ## bivariate data only: (x,y) in R^2 ## should be faster than direct calc. 
for big n type <- match.arg(type) ## argument checking in dcov2d stat <- dcov2d(x, y, type, all.stats=TRUE) dvarX <- stat[2] dvarY <- stat[3] R2 <- 0.0 if (abs(dvarX*dvarY > 10*.Machine$double.eps)) R2 <- stat[1] / sqrt(dvarX*dvarY) return (R2) } dcov2d<- function(x, y, type=c("V", "U"), all.stats=FALSE) { ## O(n log n) computation of dcovU or dcov^2 (V^2) for (x, y) in R^2 only type <- match.arg(type) if (!is.vector(x) || !is.vector(y)) { if (NCOL(x) > 1 || NCOL(y) > 1) stop("this method is only for univariate x and y") } x <- as.vector(x) y <- as.vector(y) n <- length(x) if (n != length(y)) stop("sample sizes must agree") Sums <- .dcovSums2d(x, y, all.sums=all.stats) if (type =="V") { d1 <- n^2 d2 <- n^3 d3 <- n^4 } else { d1 <- n * (n - 3) d2 <- d1 * (n - 2) d3 <- d2 * (n - 1) } dCov2d <- Sums$S1/d1 - 2*Sums$S2/d2 + Sums$S3/d3 if (all.stats) { dvarX <- Sums$S1a/d1 - 2*Sums$S2a/d2 + Sums$S3a/d3 dvarY <- Sums$S1b/d1 - 2*Sums$S2b/d2 + Sums$S3b/d3 } rval <- ifelse(type=="V", c(V=dCov2d), c(U=dCov2d)) if (all.stats) rval <- c(rval, dvarX=dvarX, dvarY=dvarY) return (rval) } .dcovSums2d <- function(x, y, all.sums = FALSE) { ## compute the sums S1, S2, S3 of distances for dcov^2 ## dCov^2 <- S1/d1 - 2 * S2/d2 + S3/d3 ## denominators differ for U-statistic, V-statisic ## if all.sums==TRUE, also return sums for dVar and kernel if (is.matrix(x) || is.matrix(y)) { if (ncol(x) > 1 || ncol(y) > 1) stop("Found multivariate (x,y) in .dcovSums2d, expecting bivariate") } n <- length(x) SRx <- sortrank(x) SRy <- sortrank(y) ## compute the rowSums of the distance matrices a. <- .rowSumsDist1(x, SRx) b. <- .rowSumsDist1(y, SRy) S2 <- sum(a. * b.) a.. <- sum(a.) b.. <- sum(b.) S3 <- sum(a.) * sum(b.) 
## also need order and rank for y[order(x)] in gamma1() x1 <- SRx$x y1 <- y[SRx$ix] SRy1 <- sortrank(y1) ones <- rep(1, n) g_1 <- .gamma1(x1=x1, y1=y1, z1=ones, SRx=SRx, SRy1=SRy1) g_x <- .gamma1(x1=x1, y1=y1, z1=x1, SRx=SRx, SRy1=SRy1) g_y <- .gamma1(x1=x1, y1=y1, z1=y1, SRx=SRx, SRy1=SRy1) g_xy <- .gamma1(x1=x1, y1=y1, z1=x1*y1, SRx=SRx, SRy1=SRy1) S1 <- sum(x * y * g_1 + g_xy - x * g_y - y * g_x) L <- list(S1=S1, S2=S2, S3=S3, S1a=NA, S1b=NA, S2a=NA, S2b=NA, S3a=NA, S3b=NA, rowsumsA=NA, rowsumsB=NA, sumA=NA, sumB=NA) if (all.sums) { L$S1a <- 2 * n * (n-1) * var(x) L$S1b <- 2 * n * (n-1) * var(y) L$S2a <- sum(a.^2) L$S2b <- sum(b.^2) L$S3a <- a..^2 L$S3b <- b..^2 L$rowsumsA <- a. L$rowsumsB <- b. L$sumA <- a.. L$sumB <- b.. } return (L); } .dvarU2 <- function(x, SRx = NULL) { ## O(n log n) computation of dvarU for univariate x only ## this is an internal function that will do a stand-alone dVar calc. ## but it is not faster than dcovU2(x, x) unless we supply ## the precomputed sort + rank results in SRx n <- length(x) ## compute the rowSums of the distance matrices if (is.null(SRx)) SRx <- sortrank(x) a. <- .rowSumsDist1(x, SRx) S2 <- sum(a. * a.) 
S3 <- sum(a.)^2 ## also need order and rank for y[order(x)] in gamma1() x1 <- SRx$x x2 <- x1 SRx1 <- sortrank(x1) ones <- rep(1, n) g_1 <- .gamma1(x1=x1, y1=x2, z1=ones, SRx, SRx1) g_x <- .gamma1(x1=x1, y1=x2, z1=x1, SRx, SRx1) g_xx <- .gamma1(x1=x1, y1=x2, z1=x1*x2, SRx, SRx1) S1 <- sum(x^2 * g_1 + g_xx - 2 * x * g_x) d1 <- n * (n - 3) d2 <- d1 * (n - 2) d3 <- d2 * (n - 1) dVar <- S1/d1 - 2 * S2/d2 + S3/d3 return(dVar) } .gamma1 <- function(x1, y1, z1, SRx, SRy1) { # computes the terms of the sum (ab) in dcovU # original sample (x_i, y_i, z_i) # triples (x1_i, y1_i, z1_i) are sorted by ix=order(x) # SRx is the result of sortrank(x), original order # SRy1 is the result of sortrank(y1), y1=y[order(x)] # pre-compute SRx, SRy1 to avoid repeated sort and rank # n <- length(x1) ix <- SRx$ix #order(x) rankx <- SRx$r #ranks of original sample x ## ranks and order vector for this permutation of sample y1 iy1 <- SRy1$ix #order(y1) ranky1 <- SRy1$r #rank(y1) ## the partial sums in the formula g_1 psumsy1 <- (cumsum(as.numeric(z1[iy1])) - z1[iy1])[ranky1] psumsx1 <- cumsum(as.numeric(z1)) - z1 gamma1 <- Btree_sum(y=ranky1, z=z1) #y1 replaced by rank(y1) g <- sum(z1) - z1 - 2 * psumsx1 - 2 * psumsy1 + 4 * gamma1 g <- g[rankx] } .rowSumsDist1 <- function(x, Sx = NULL) { ## for univariate samples, equivalent to rowSums(as.matrix(dist(x))) ## but much faster ## Sx is a sortrank object usually pre-computed here ## x is the data vector, Sx$x is sort(x) if (is.null(Sx)) Sx <- sortrank(x) n <- length(x) r <- Sx$r #ranks z <- Sx$x #ordered sample x psums1 <- (cumsum(as.numeric(z)) - z)[r] (2*(r-1)-n)*x + sum(x) - 2*psums1 } energy/R/dcov.R0000644000176200001440000001004514251416000013013 0ustar liggesusersdcov.test <- function(x, y, index=1.0, R=NULL) { ## check for valid number of replicates R method <- "Specify the number of replicates R (R > 0) for an independence test" if (! 
is.null(R)) { R <- floor(R) if (R < 1) R <- 0 if (R > 0) method <- "dCov independence test (permutation test)" } else { R <- 0 } Dx <- .arg2dist.matrix(x) Dy <- .arg2dist.matrix(y) if (!isTRUE(all.equal(index, 1.0))) { Dx <- Dx^index Dy <- Dy^index } n <- nrow(Dx) m <- nrow(Dy) if (n != m) stop("Sample sizes must agree") stat <- dcorr <- reps <- 0 dcov <- rep(0, 4) if (R > 0) reps <- rep(0, R) pval <- 1 dims <- c(n, ncol(Dx), ncol(Dy), R) # dcov = [dCov,dCor,dVar(x),dVar(y)] a <- .C("dCOVtest", x = as.double(t(Dx)), y = as.double(t(Dy)), nrow = as.integer(nrow(Dx)), nreps = as.integer(R), reps = as.double(reps), DCOV = as.double(dcov), pval = as.double(pval), PACKAGE = "energy") # test statistic is n times the square of dCov statistic stat <- n * a$DCOV[1]^2 dcorr <- a$DCOV V <- dcorr[[1]] names(stat) <- "nV^2" names(V) <- "dCov" dataname <- paste("index ", index, ", replicates ", R, sep="") pval <- ifelse (R < 1, NA, a$pval) e <- list( statistic = stat, method = method, estimate = V, estimates = dcorr, p.value = pval, replicates = n* a$reps^2, n = n, data.name = dataname) class(e) <- "htest" return(e) } dcor.test <- function(x, y, index=1.0, R) { # distance correlation test for multivariate independence # like dcov.test but using dcor as the test statistic if (missing(R)) R <- 0 R <- ifelse(R > 0, floor(R), 0) RESULT <- dcov.test(x, y, index=index, R) # this test statistic is n times the square of dCov statistic DCOVteststat <- RESULT$statistic DCOVreplicates <- RESULT$replicates # RESULT$estimates = [dCov,dCor,dVar(x),dVar(y)] # dVar are invariant under permutation of sample indices estimates = RESULT$estimates names(estimates) <- c("dCov", "dCor", "dVar(X)", "dVar(Y)") DCORteststat <- RESULT$estimates[2] dvarX <- RESULT$estimates[3] dvarY <- RESULT$estimates[4] n <- RESULT$n if (R > 0) { DCORreps <- sqrt(DCOVreplicates / n) / sqrt(dvarX * dvarY) p.value <- (1 + sum(DCORreps >= DCORteststat)) / (1 + R) } else { p.value <- NA DCORreps <- NA } names(DCORteststat) 
<- "dCor" dataname <- paste("index ", index, ", replicates ", R, sep="") method <- ifelse(R > 0, "dCor independence test (permutation test)", "Specify the number of replicates R>0 for an independence test") e <- list( method = method, statistic = DCORteststat, estimates = estimates, p.value = p.value, replicates = DCORreps, n = n, data.name = dataname) class(e) <- "htest" return(e) } .dcov <- function(x, y, index=1.0) { # distance covariance statistic for independence # dcov = [dCov,dCor,dVar(x),dVar(y)] (vector) # this function provides the fast method for computing dCov # it is called by the dcov and dcor functions Dx <- .arg2dist.matrix(x) Dy <- .arg2dist.matrix(y) if (!isTRUE(all.equal(index, 1.0))) { Dx <- Dx^index Dy <- Dy^index } n <- nrow(Dx) m <- nrow(Dy) if (n != m) stop("Sample sizes must agree") dims <- c(n, ncol(Dx), ncol(Dy)) idx <- 1:dims[1] DCOV <- numeric(4) a <- .C("dCOV", x = as.double(t(Dx)), y = as.double(t(Dy)), nrow = as.integer(n), DCOV = as.double(DCOV), PACKAGE = "energy") return(a$DCOV) } dcov <- function(x, y, index=1.0) { # distance correlation statistic for independence return(.dcov(x, y, index)[1]) } dcor <- function(x, y, index=1.0) { # distance correlation statistic for independence return(.dcov(x, y, index)[2]) } energy/R/mutual-indep.R0000644000176200001440000000161514300172713014474 0ustar liggesusersmutualIndep.test <- function(x, R) { if (NCOL(x) < 2) { stop("Expecting two or more samples") } bootfn <- function(x, i) { d <- ncol(x) dc <- numeric(d-1) for (k in 1:(d-1)) { dc[k] <- energy::bcdcor(x[i,k], x[,(k+1):d]) } return (dc) } b <- boot::boot(x, bootfn, sim="permutation", R=R) t0 <- sum(b$t0) tp <- rowSums(b$t) pval <- (1 + sum(tp > t0)) / (R + 1) estimate <- round(b$t0, 3) names(t0) <- "Sum(R*)" names(estimate) <- paste0("R*", 1:length(b$t0)) method <- paste("Energy Test of Mutual Independence") call <- match.call() NOTE <- "statistic=sum(bcdcor); permutation test" rval <- list(statistic = t0, p.value = pval, call = call, 
data.name=paste(deparse(substitute(x))," dim ", paste(dim(x), collapse=",")), estimate=estimate, method=method, note=NOTE) class(rval) <- "power.htest" return(rval) } energy/R/util.R0000644000176200001440000000454614300425313013050 0ustar liggesusers## util.R ## ## utilities for the energy package ## Author: Maria Rizzo ## github.com/mariarizzo/energy ## .arg2dist.matrix <- function(x) { ## argument check and conversion for energy functions ## that take optionally data or distance object arguments ## check type of argument, return a distance matrix ## supported argument types: matrix, vector, data.frame, tibble, factor, dist if (anyNA(x)) warning("missing values not supported") if (inherits(x, "dist")) { Dx <- as.matrix(x) return(Dx) } if (is.factor(x)) { z <- as.matrix(as.integer(x)) Dx <- calc_dist(z) if (!is.ordered(x) && nlevels(x) > 2) { # need a 0-1 matrix Dx <- matrix(as.integer(Dx > 0), nrow=nrow(Dx)) } return(Dx) } if (is.vector(x) || is.data.frame(x)) { ## also for tibble Dx <- calc_dist(as.matrix(x)) } if (is.matrix(x)) { if (is.dmatrix(x)) { Dx <- x } else { ## should be data matrix Dx <- calc_dist(x) } } return(Dx) ## if here, arg type is not supported stop(paste("cannot compute distances for", class(x))) return(NA) } is.dmatrix <- function(x, tol = 100 * .Machine$double.eps) { ## check if zero diagonal, symmetric, non-negative square matrix ## i.e., distance matrix or dissimilarity matrix value <- FALSE if (is.matrix(x)) { if (nrow(x) == ncol(x)) { if (max(abs(diag(x)) < tol) && (max(abs(x - t(x)) < tol))) { if (! 
any(x < 0.0)) value <- TRUE } } } return (value) } perm.matrix <- function(n, R) { ## Generate the same matrix as boot.array with ## sim="permutation" and default other arguments ## with same seed we get boot.array(boot.out, indices=T) pfn <- function(x, n) x[sample.int(n)] perms <- matrix(1:n, n, R) perms <- t(apply(perms, 2, pfn, n=n)) } permutation <- function(n) { ## call the internal permute() function using permute_check() J <- 1:n a <- .C("permute_check", J = as.integer(J), n = as.integer(n), PACKAGE = "energy") return (a$J) } sortrank <- function(x) { ## sort and rank data with one call to order() ## faster than calling sort and rank separately ## returns an object identical to: ## list(x=sort(x), ix=order(x), r=rank(x, ties.method = "first")) o <- order(x) n <- length(o) N <- 1:n N[o] <- N return(list(x=x[o], ix=o, r=N)) } energy/R/Epoisson.R0000644000176200001440000000651214253334600013672 0ustar liggesuserspoisson.tests <- function(x, R, test="all") { # parametric bootstrap tests of Poisson distribution # poisson.e is the energy GOF statistic # poisson.m is the mean distance statistic # (not related to the test stats::poisson.test) if (!is.integer(x) || any(x < 0)) { warning("sample must be non-negative integers") return(NULL) } test <- tolower(test) poisson.stats <- function(x) { c(poisson.m(x), poisson.e(x)) } stat <- switch(test, "m" = poisson.m, "e" = poisson.e, poisson.stats) method <- switch(test, m=c("M-CvM","M-AD"), e="Energy", c("M-CvM","M-AD","Energy")) method <- paste(method, " test", sep="") n <- length(x) lambda <- mean(x) if (missing(R) || is.null(R)) { R <- 0 message("Specify R > 0 replicates for MC test") } bootobj <- boot::boot(x, statistic = stat, R = R, sim = "parametric", ran.gen = function(x, y) {rpois(n, lambda)}) N <- length(bootobj$t0) p <- rep(NA, times=N) if (R > 0) { for (i in 1:N) { p[i] <- 1 - mean(bootobj$t[,i] < bootobj$t0[i]) } } # a data frame, not an htest object # comparable to broom::tidy on an htest object RVAL <- 
data.frame(estimate=lambda, statistic=bootobj$t0, p.value=p, method=method) return(RVAL) } poisson.mtest <- function(x, R=NULL) { if (is.null(R)) R <- 0 rval <- poisson.tests(x, R, test="M") DNAME <- paste(deparse1(substitute(x)), "replicates: ", R) stat <- rval$statistic[1] names(stat) <- "M-CvM" e <- list( method = paste("Poisson M-test", sep = ""), statistic = stat, p.value = rval$p.value[1], data.name = DNAME, estimate = rval$estimate[1]) class(e) <- "htest" e } poisson.etest <- function(x, R=NULL) { if (is.null(R)) R <- 0 rval <- poisson.tests(x, R, test="E") DNAME <- paste(deparse1(substitute(x)), "replicates: ", R) stat <- rval$statistic names(stat) <- "E" e <- list( method = paste("Poisson E-test", sep = ""), statistic = stat, p.value = rval$p.value, data.name = paste("replicates: ", R, sep=""), estimate = rval$estimate) class(e) <- "htest" e } poisson.m <- function(x) { # mean distance statistic for Poissonity if (any(!is.integer(x)) || any(x < 0)) { warning("sample must be non-negative integers") return(NULL) } stats <- .poisMstat(x) names(stats) <- c("M-CvM", "M-AD") return(stats) } poisson.e <- function(x) { # energy GOF statistic for Poissonity if (any(!is.integer(x)) || any(x < 0)) { warning("sample must be non-negative integers") return(NULL) } lambda <- mean(x) n <- length(x) ## E|y-X| for X Poisson(lambda) (vectorized) Px <- ppois(x, lambda) Px1 <- ppois(x-1, lambda) meanvec <- 2*x*Px - 2*lambda*Px1 + lambda - x ## second mean E|X-X'| a <- 2 * lambda EXX <- a * exp(-a) * (besselI(a, 0) + besselI(a, 1)) ## third mean = sum_{i,j} |x_i - x_j| / n^2 K <- seq(1 - n, n - 1, 2) y <- sort(x) meanxx <- 2 * sum(K * y) / n^2 stat <- n * (2 * mean(meanvec) - EXX - meanxx) names(stat) <- "E" return(stat) } energy/R/Ecluster.R0000644000176200001440000000100414005374454013656 0ustar liggesusers energy.hclust <- function(dst, alpha = 1) { if (!inherits(dst, "dist")) stop("The first argument must be a dist object.") d <- dst n <- attr(d, "Size") if 
(!isTRUE(all.equal(alpha, 1))) { if (alpha > 2) warning("Exponent alpha should be in (0,2]") if (alpha < 0) stop("Cannot use negative exponent on distance.") d <- d^alpha } ## heights of hclust are half of energy; otherwise equivalent return(hclust(d, method = "ward.D")) } energy/R/pdcov-test.R0000644000176200001440000000437614251251214014166 0ustar liggesuserspdcov.test <- function(x, y, z, R) { if (missing(R)) R <- 0 Dx <- .arg2dist.matrix(x) Dy <- .arg2dist.matrix(y) Dz <- .arg2dist.matrix(z) n <- nrow(Dx) Pxz <- projection(Dx, Dz) #U-center and compute projections Pyz <- projection(Dy, Dz) #PxzU <- U_center(Pxz) #not necessary, because of invariance #PyzU <- U_center(Pyz) teststat <- n * U_product(Pxz, Pyz) ## calc. pdcor den <- sqrt(U_product(Pxz, Pxz) * U_product(Pyz, Pyz)) if (den > 0.0) { estimate <- teststat / (n * den) } else estimate <- 0.0 bootfn <- function(Pxz, i, Pyz) { # generate the permutation replicates of dcovU(Pxz, Pyz) # PxzU and PyzU are the U-centered matrices U_product(Pxz[i, i], Pyz) #RcppExports } if (R > 0 && den > 0.0) { reps <- replicate(R, expr= { i <- sample(1:n) bootfn(Pxz, i, Pyz=Pyz) }) replicates <- n * reps pval <- (1 + sum(replicates > teststat)) / (1 + R) #df <- n * (n-3) / 2 - 2 } else { pval <- NA replicates <- NA } dataname <- paste("replicates ", R, sep="") if (! R>0) dataname <- "Specify R>0 replicates for a test" condition <- (den > 0.0) names(estimate) <- "pdcor" names(teststat) <- "n V^*" e <- list( call = match.call(), method = paste("pdcov test", sep = ""), statistic = teststat, estimate = estimate, p.value = pval, n = n, replicates = replicates, condition = condition, data.name = dataname) class(e) <- "htest" return(e) } pdcor.test <- function(x, y, z, R) { ## x, y, z must be dist. objects or data matrices (no dist matrix) ## all required calc. 
done in pdcov.test if (missing(R)) R <- 0 result <- pdcov.test(x, y, z, R=R) if (result$condition) { ## if (A*A)(B*B) > 0 nRootV <- result$statistic / result$estimate pdcor_reps <- result$replicates / nRootV } else pdcor_reps <- NA e <- list( call = match.call(), method = paste("pdcor test", sep = ""), statistic = result$estimate, estimate = result$estimate, p.value = result$p.value, n = result$n, replicates = pdcor_reps, condition = result$condition, data.name = result$data.name) class(e) <- "htest" return(e) } energy/R/energy-defunct.R0000644000176200001440000000067114005374454015020 0ustar liggesusers## defunct functions from the energy package indep.e<- function(x, y) { # energy statistic for multivariate independence (deprecated) .Defunct(new = "mvI", package = "energy") } indep.etest<- function(x, y, R) { # energy test for multivariate independence (deprecated) .Defunct(new = "indep.test", package = "energy", msg = "indep.etest removed; use indep.test with method mvI.") } energy/R/RcppExports.R0000644000176200001440000000200714350635222014361 0ustar liggesusers# Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 Btree_sum <- function(y, z) { .Call(`_energy_Btree_sum`, y, z) } calc_dist <- function(x) { .Call(`_energy_calc_dist`, x) } U_product <- function(U, V) { .Call(`_energy_U_product`, U, V) } D_center <- function(Dx) { .Call(`_energy_D_center`, Dx) } U_center <- function(Dx) { .Call(`_energy_U_center`, Dx) } dcovU_stats <- function(Dx, Dy) { .Call(`_energy_dcovU_stats`, Dx, Dy) } kgroups_start <- function(x, k, clus, iter_max, distance) { .Call(`_energy_kgroups_start`, x, k, clus, iter_max, distance) } partial_dcor <- function(Dx, Dy, Dz) { .Call(`_energy_partial_dcor`, Dx, Dy, Dz) } partial_dcov <- function(Dx, Dy, Dz) { .Call(`_energy_partial_dcov`, Dx, Dy, Dz) } .poisMstat <- function(x) { .Call(`_energy_poisMstat`, x) } projection <- function(Dx, Dz) { 
.Call(`_energy_projection`, Dx, Dz) } energy/R/edist.R0000644000176200001440000000443214005374454013210 0ustar liggesusersedist <- function(x, sizes, distance = FALSE, ix = 1:sum(sizes), alpha = 1, method = c("cluster","discoB")) { # computes the e-dissimilarity matrix between k samples or clusters # x: pooled sample or Euclidean distances # sizes: vector of sample (cluster) sizes # distance: TRUE if x is a distance matrix, otherwise FALSE # ix: a permutation of row indices of x # alpha: distance exponent # method: cluster distances or disco statistics # k <- length(sizes) if (k == 1) return (as.dist(0.0)) if (k < 1) return (NA) e <- matrix(nrow=k, ncol=k) n <- cumsum(sizes) m <- 1 + c(0, n[1:(k-1)]) if (is.vector(x)) x <- matrix(x, ncol=1) if (inherits(x, "dist")) distance <- TRUE if (distance) dst <- as.matrix(x) else dst <- as.matrix(dist(x)) N <- NROW(dst) if (NCOL(dst) != N) stop("distance==TRUE but first argument is not distance") if (alpha != 1) { if (alpha <= 0 || alpha > 2) warning("exponent alpha should be in (0,2]") dst <- dst^alpha } type <- match.arg(method) if (type == "cluster") { for (i in 1:(k - 1)) { e[i, i] <- 0.0 for (j in (i + 1):k) { n1 <- sizes[i] n2 <- sizes[j] ii <- ix[m[i]:n[i]] jj <- ix[m[j]:n[j]] w <- n1 * n2 / (n1 + n2) m11 <- sum(dst[ii, ii]) / (n1 * n1) m22 <- sum(dst[jj, jj]) / (n2 * n2) m12 <- sum(dst[ii, jj]) / (n1 * n2) e[i, j] <- e[j, i] <- w * ((m12 + m12) - (m11 + m22)) } } } if (type == "discoB") { #disco statistics for testing F=G for (i in 1:(k - 1)) { e[i, i] <- 0.0 for (j in (i + 1):k) { n1 <- sizes[i] n2 <- sizes[j] ii <- ix[m[i]:n[i]] jj <- ix[m[j]:n[j]] J <- c(ii,jj) d <- dst[J, J] e[i, j] <- eqdist.e(d, sizes=c(n1, n2), distance=TRUE) e[j, i] <- e[i, j] <- e[i, j] * (n1 + n2) } } e <- 0.5 * e / sum(sizes) #discoB formula } e <- as.dist(e) attr(e,"method") <- paste(method,": index= ", alpha) e } energy/R/disco.R0000644000176200001440000001445114350630353013177 0ustar liggesusers### disco tests - implementation of 
DIStance COmponents methods in: ### ### Rizzo, M.L. and Szekely, G.J. (2010) "DISCO Analysis: A Nonparametric ### Extension of Analysis of Variance, Annals of Applied Statistics ### Vol. 4, No. 2, 1034-1055. ### ### disco: computes the decomposition and test using F ratio ### disco.between: statistic and test using between component ### .disco1: internal computations for one factor ### .disco1stat, .disco1Bstat: internal for boot function ### ### disco <- function(x, factors, distance = FALSE, index = 1, R, method = c("disco", "discoB", "discoF")) { ## x is response or Euclidean distance matrix or dist() object factors ## is a matrix or data frame of group labels distance=TRUE if x is ## distance, otherwise FALSE index is the exponent on distance, in (0,2] ## R is number of replicates for test method: use F ratio (default) or ## between component (discoB) disco method is currently alias for discoF method <- match.arg(method) factors <- data.frame(factors) if (inherits(x, "dist")) distance <- TRUE if (method == "discoB") return(disco.between(x, factors = factors, distance = distance, index = index, R = R)) nfactors <- NCOL(factors) if (distance || inherits(x, "dist")) dst <- as.matrix(x) else dst <- as.matrix(dist(x)) N <- NROW(dst) if (NCOL(dst) != N) stop("distance==TRUE but first argument is not distance") if (!isTRUE(all.equal(index, 1))) dst <- dst^index stats <- matrix(0, nfactors, 6) colnames(stats) <- c("Trt", "Within", "df1", "df2", "Stat", "p-value") for (j in 1:nfactors) { trt <- factors[, j] stats[j, 1:4] <- .disco1(trt = trt, dst = dst) if (R > 0) { b <- boot::boot(data = dst, statistic = .disco1stat, sim = "permutation", R = R, trt = trt) stats[j, 5] <- b$t0 stats[j, 6] <- (sum(b$t > b$t0) + 1)/(R + 1) } else { stats[j, 5] <- .disco1stat(dst, i = 1:nrow(dst), trt = trt) stats[j, 6] <- NA } } methodname <- "DISCO (F ratio)" dataname <- deparse(substitute(x)) total <- sum(stats[1, 1:2]) within <- total - sum(stats[, 1]) Df.trt <- stats[, 3] factor.names 
<- names(factors) factor.levels <- sapply(factors, nlevels) sizes <- sapply(factors, tabulate) e <- list(call = match.call(), method = methodname, statistic = stats[, 5], p.value = stats[, 6], k = nfactors, N = N, between = stats[, 1], withins = stats[, 2], within = within, total = total, Df.trt = Df.trt, Df.e = nrow(dst) - sum(Df.trt) - 1, index = index, factor.names = factor.names, factor.levels = factor.levels, sample.sizes = sizes, stats = stats) class(e) <- "disco" e } disco.between <- function(x, factors, distance = FALSE, index = 1, R) { ## disco test based on the between-sample component similar to disco ## except that 'disco' test is based on the F ratio disco.between test ## for one factor (balanced) is asymptotically equivalent to k-sample E ## test (test statistics are proportional in that case but not in ## general). x is response or Euclidean distance matrix or dist() ## object factors is a matrix or data frame of group labels ## distance=TRUE if x is distance, otherwise FALSE index is the exponent ## on distance, in (0,2] factors <- data.frame(factors) nfactors <- NCOL(factors) if (nfactors > 1) stop("More than one factor is not implemented in disco.between") if (distance || inherits(x, "dist")) dst <- as.matrix(x) else dst <- as.matrix(dist(x)) N <- NROW(dst) if (NCOL(dst) != N) stop("distance==TRUE but first argument is not distance") if (!isTRUE(all.equal(index, 1))) dst <- dst^index trt <- factors[, 1] if (R > 0) { b <- boot::boot(data = dst, statistic = .disco1Bstat, sim = "permutation", R = R, trt = trt) between <- b$t0 reps <- b$t pval <- mean(reps >= between) } else { between <- .disco1Bstat(dst, i = 1:nrow(dst), trt = trt) pval <- NA } if (R == 0) return(between) methodname <- "DISCO (Between-sample)" dataname <- deparse(substitute(x)) names(between) <- "DISCO between statistic" e <- list(call = match.call(), method = methodname, statistic = between, p.value = pval, data.name = dataname) class(e) <- "htest" e } .disco1 <- function(trt, dst) 
{ ## dst is Euclidean distance matrix or power of it trt is the treatment, ## a factor trt <- factor(trt) k <- nlevels(trt) n <- tabulate(trt) N <- sum(n) total <- sum(dst)/(2 * N) y <- as.vector(dst[, 1]) M <- model.matrix(y ~ 0 + trt) G <- t(M) %*% dst %*% M withins <- diag(G)/(2 * n) W <- sum(withins) B <- total - W c(B, W, k - 1, N - k) } .disco1stat <- function(dst, i, trt) { ## i is permuation vector supplied by bootstrap dst is Euclidean ## distance matrix or power of it trt is the treatment, a factor returns ## the disco 'F' ratio idx <- 1:nrow(dst) d <- .disco1(trt = trt[idx[i]], dst = dst) statistic <- (d[1]/d[3])/(d[2]/d[4]) } .disco1Bstat <- function(dst, i, trt) { ## i is permuation vector supplied by bootstrap dst is Euclidean ## distance matrix or power of it trt is the treatment, a factor returns ## the between-sample component (for one factor) idx <- 1:nrow(dst) .disco1(trt = trt[idx[i]], dst = dst)[1] } print.disco <- function(x, ...) { k <- x$k md1 <- x$between/x$Df.trt md2 <- x$within/x$Df.e f0 <- x$statistic print(x$call) cat(sprintf("\nDistance Components: index %5.2f\n", x$index)) cat(sprintf("%-15s %4s %10s %10s %9s %9s\n", "Source", "Df", "Sum Dist", "Mean Dist", "F-ratio", "p-value")) fabb <- abbreviate(x$factor.names, minlength=12) for (i in 1:k) { fname <- fabb[i] cat(sprintf("%-15s %4d %10.5f %10.5f %9.3f %9s\n", fname, x$Df.trt[i], x$between[i], md1[i], f0[i], format.pval(x$p.value[i]))) } cat(sprintf("%-15s %4d %10.5f %10.5f\n", "Within", x$Df.e, x$within, md2)) cat(sprintf("%-15s %4d %10.5f\n", "Total", x$N - 1, x$total)) } energy/R/pdcor.R0000644000176200001440000000047714251254420013205 0ustar liggesusers## pdcor.R ## ## pdcor <- function(x, y, z) { x <- .arg2dist.matrix(x) y <- .arg2dist.matrix(y) z <- .arg2dist.matrix(z) partial_dcor(x, y, z)["pdcor"] } pdcov <- function(x, y, z) { x <- .arg2dist.matrix(x) y <- .arg2dist.matrix(y) z <- .arg2dist.matrix(z) partial_dcov(x, y, z) } 
energy/R/Eindep.R0000644000176200001440000000426214005374454013305 0ustar liggesusersindep.test<- function(x, y, method = c("dcov","mvI"), index = 1, R) { # two energy tests for multivariate independence type <- match.arg(method) if (type == "dcov") return(dcov.test(x, y, index, R)) else if (type == "mvI") return(mvI.test(x, y, R)) } mvI <- function(x, y) { # energy statistic for multivariate independence # returns dependence coefficient I_n x <- as.matrix(x) y <- as.matrix(y) n <- nrow(x) m <- nrow(y) if (n != m || n < 2) stop("Sample sizes must agree") if (! (all(is.finite(c(x, y))))) stop("Data contains missing or infinite values") stat <- 0 dims <- c(n, ncol(x), ncol(y)) e <- .C("indepE", x = as.double(t(x)), y = as.double(t(y)), byrow = as.integer(TRUE), dims = as.integer(dims), stat = as.double(stat), PACKAGE = "energy") sqrt(e$stat) } mvI.test<- function(x, y, R) { # energy test for multivariate independence x <- as.matrix(x) y <- as.matrix(y) n <- nrow(x) m <- nrow(y) if (n != m || n < 2) stop("Sample sizes must agree") if (! 
(all(is.finite(c(x, y))))) stop("Data contains missing or infinite values") stat <- reps <- 0 if (R > 0) reps <- rep(0, R) pval <- 1 dims <- c(n, ncol(x), ncol(y), R) a <- .C("indepEtest", x = as.double(t(x)), y = as.double(t(y)), byrow = as.integer(TRUE), dims = as.integer(dims), stat = as.double(stat), reps = as.double(reps), pval = as.double(pval), PACKAGE = "energy") stat <- n*a$stat est <- sqrt(a$stat) names(est) <- "I" names(stat) <- "nI^2" dataname <- paste("x (",n," by ",ncol(x), "), y(",n," by ", ncol(y), "), replicates ", R, sep="") if (R > 0) p.value = a$pval else p.value = NA e <- list( method = "mvI energy test of independence", statistic = stat, estimate = est, replicates = n*reps, p.value = p.value, data.name = dataname) class(e) <- "htest" e } energy/R/Eeqdist.R0000644000176200001440000000712614005374454013501 0ustar liggesuserseqdist.e <- function(x, sizes, distance = FALSE, method = c("original","discoB","discoF")) { ## multivariate E-statistic for testing equal distributions ## x: matrix of pooled sample or distance matrix ## sizes: vector of sample sizes ## distance: logical, TRUE if x is a distance matrix, otherwise false ## method: original (default) or disco between components, or disco F ratio method <-match.arg(method) if (method=="discoB") { g <- as.factor(rep(1:length(sizes), sizes)) RVAL <- disco(x, factors=g, distance=distance, R=0, method=method) } else { RVAL <- eqdist.etest(x, sizes, distance = distance, R=0, method=method)$statistic } RVAL } eqdist.etest <- function(x, sizes, distance = FALSE, method = c("original","discoB","discoF"), R) { ## multivariate E-test of the multisample hypothesis of equal distributions ## x: matrix of pooled sample or distance matrix ## sizes: vector of sample sizes ## distance: logical, TRUE if x is a distance matrix, otherwise false ## method: original (default) or disco components ## R: number of replicates ## method <-match.arg(method) if (method=="discoB" || method=="discoF") { g <- 
as.factor(rep(1:length(sizes), sizes)) # for other index use disco() function directly return(disco(x, factors=g, distance=distance, index=1.0, R=R, method=method)) } nsamples <- length(sizes) if (nsamples < 2) return (NA) if (min(sizes) < 1) return (NA) if (!is.null(attr(x, "Size"))) distance <- TRUE x <- as.matrix(x) if (NROW(x) != sum(sizes)) stop("nrow(x) should equal sum(sizes)") if (distance == FALSE && nrow(x) == ncol(x)) warning("square data matrix with distance==FALSE") d <- NCOL(x) if (distance == TRUE) d <- 0 str <- "Multivariate " if (d == 1) str <- "Univariate " if (d == 0) str <- "" e0 <- 0.0 repl <- rep(0, R) pval <- 1.0 b <- .C("ksampleEtest", x = as.double(t(x)), byrow = as.integer(1), nsamples = as.integer(nsamples), sizes = as.integer(sizes), dim = as.integer(d), R = as.integer(R), e0 = as.double(e0), e = as.double(repl), pval = as.double(pval), PACKAGE = "energy") names(b$e0) <- "E-statistic" sz <- paste(sizes, collapse = " ", sep = "") methodname <- paste(str, length(sizes), "-sample E-test of equal distributions", sep = "") dataname <- paste("sample sizes ", sz, ", replicates ", R, sep="") e <- list( call = match.call(), method = methodname, statistic = b$e0, p.value = b$pval, data.name = dataname) class(e) <- "htest" e } ksample.e <- function(x, sizes, distance = FALSE, method = c("original","discoB","discoF"), ix = 1:sum(sizes)) { ## computes k-sample E-statistics for equal distributions ## retained for backward compatibility or use with boot ## (this function simply passes arguments to eqdist.e) ## ## x: pooled sample or distance matrix ## sizes: vector of sample sizes ## distance: TRUE if x is a distance matrix, otherwise FALSE ## method: default (original) or disco between components or disco F ratio ## ix: a permutation of row indices of x ## x <- as.matrix(x) method <- match.arg(method) eqdist.e(x[ix,], sizes=sizes, distance=distance, method=method) } energy/R/Emvnorm.R0000644000176200001440000001022314172025577013522 0ustar 
liggesusersmvnorm.test <- function(x, R) { # parametric bootstrap E-test for multivariate normality if (missing(R)) { method = "Energy test of multivariate normality: (Specify R > 0 for MC test)" R <- 0 } else { method = "Energy test of multivariate normality: estimated parameters" } if (is.vector(x) || NCOL(x)==1) { n <- NROW(x) d <- 1 bootobj <- boot::boot(x, statistic = normal.e, R = R, sim = "parametric", ran.gen = function(x, y) { return(rnorm(n)) }) } else { n <- nrow(x) d <- ncol(x) bootobj <- boot::boot(x, statistic = mvnorm.e, R = R, sim = "parametric", ran.gen = function(x, y) { return(matrix(rnorm(n * d), nrow = n, ncol = d)) }) } if (R > 0) p <- 1 - mean(bootobj$t < bootobj$t0) else p <- NA names(bootobj$t0) <- "E-statistic" e <- list(statistic = bootobj$t0, p.value = p, method = method, data.name = paste("x, sample size ", n, ", dimension ", d, ", replicates ", R, sep = "")) class(e) <- "htest" e } mvnorm.etest <- function(x, R) { return(mvnorm.test(x, R)) } mvnorm.e <- function(x) { # E-statistic for multivariate normality if (is.vector(x) || NCOL(x)==1) return(normal.e(x)) n <- nrow(x) d <- ncol(x) if (n < 2) { warning("sample size must be at least 2") return(NA) } # subtract column means and compute S^(-1/2) z <- scale(x, scale = FALSE) ev <- eigen(var(x), symmetric = TRUE) P <- ev$vectors lambda <- ev$values D <- diag(d) diag(D) <- 1 / sqrt(lambda) y <- z %*% (P %*% D %*% t(P)) if (any(!is.finite(y))) { warning("missing or non-finite y") return(NA) } if (requireNamespace("gsl", quietly=TRUE)) { const <- exp(lgamma((d+1)/2) - lgamma(d/2)) mean2 <- 2*const ysq <- rowSums(y^2) mean1 <- sqrt(2) * const * mean(gsl::hyperg_1F1(-1/2, d/2, -ysq/2)) mean3 <- 2*sum(dist(y)) / n^2 return(n * (2*mean1 - mean2 - mean3)) } else { warning("package gsl required but not found") return (NA) } } normal.e <- function(x) { ## Case 4: unknown parameters x <- as.vector(x) n <- length(x) s <- sd(x) if (!is.finite(s) || !(s > 0)) { warning("sd(x)>0 required") return(NA) } 
y <- (x - mean(x)) / sd(x) y <- sort(y) K <- seq(1 - n, n - 1, 2) return(2 * (sum(2 * y * pnorm(y) + 2 * dnorm(y)) - n/sqrt(pi) - mean(K * y))) } normal.test <- function(x, method=c("mc", "limit"), R) { ## implements the test for for d=1 ## Case 4: composite hypothesis method <- match.arg(method) estimate <- c(mean(x), sd(x)) names(estimate) <- c("mean", "sd") if (method == "mc") { ## Monte Carlo approach if (missing(R)) R <- 0 e <- energy::mvnorm.etest(x, R=R) e$method <- "Energy test of normality" e$method <- ifelse(R > 0, paste0(e$method,": estimated parameters"), paste0(e$method, " (Specify R > 0 for MC test)")) e$estimate <- estimate return(e) } ## implement test using asymptotic distribution for p-value if (!is.numeric(x) || (!is.vector(x) && NCOL(x) > 1)) { warning("x must be a numeric vector") return (NA) } else { x <- as.vector(x, mode="numeric") } n <- length(x) t0 <- normal.e(x) names(t0) <- "statistic" ## load pre-computed eigenvalues ev <- energy::EVnormal[, "Case4"] if (requireNamespace("CompQuadForm", quietly=TRUE)) { p <- CompQuadForm::imhof(t0, ev)$Qq } else { warning("limit distribution method requires CompQuadForm package for p-value") p <- NA } estimate <- c(mean(x), sd(x)) names(estimate) <- c("mean", "sd") e <- list(statistic = t0, p.value = p, method = paste("Energy test of normality: limit distribution"), estimate = estimate, data.name = "Case 4: composite hypothesis, estimated parameters") class(e) <- "htest" e } energy/R/energy-deprecated.R0000644000176200001440000000412414173570742015471 0ustar liggesusers## deprecated functions in energy package dcor.ttest <- function(x, y, distance=FALSE) { # x and y are observed samples or distance # distance arg is checked in bcdcor .Deprecated(new = "dcorT.test", package = "energy", msg = "dcort.ttest is deprecated, replaced by dcorT.test") if (distance == TRUE) { x <- as.dist(x) y <- as.dist(y) } return(dcorT.test(x, y)) } dcor.t <- function(x, y, distance=FALSE) { # computes the t statistic for 
corrected high-dim dCor # should be approximately student T # distance arg is checked in bcdcor .Deprecated(new = "dcorT", package = "energy", msg = "dcort.t is deprecated, replaced by dcorT") if (distance == TRUE) { x <- as.dist(x) y <- as.dist(y) } return(dcorT(x, y)) } DCOR <- function(x, y, index=1.0) { # distance covariance and correlation statistics # alternate method, implemented in R without .C call # this method is usually slower than the C version .Deprecated(new = "dcor", package = "energy", msg = "DCOR is deprecated, replaced by dcor or dcov") if (!inherits(x, "dist")) x <- dist(x) if (!inherits(y, "dist")) y <- dist(y) x <- as.matrix(x) y <- as.matrix(y) n <- nrow(x) m <- nrow(y) if (n != m) stop("Sample sizes must agree") if (! (all(is.finite(c(x, y))))) stop("Data contains missing or infinite values") if (index < 0 || index > 2) { warning("index must be in [0,2), using default index=1") index=1.0} stat <- 0 dims <- c(n, ncol(x), ncol(y)) Akl <- function(x) { d <- as.matrix(x)^index m <- rowMeans(d) M <- mean(d) a <- sweep(d, 1, m) b <- sweep(a, 2, m) return(b + M) } A <- Akl(x) B <- Akl(y) dCov <- sqrt(mean(A * B)) dVarX <- sqrt(mean(A * A)) dVarY <- sqrt(mean(B * B)) V <- sqrt(dVarX * dVarY) if (V > 0) dCor <- dCov / V else dCor <- 0 return(list(dCov=dCov, dCor=dCor, dVarX=dVarX, dVarY=dVarY)) } energy/R/kgroups.R0000644000176200001440000000373614005374454013600 0ustar liggesusers kgroups <- function(x, k, iter.max = 10, nstart = 1, cluster = NULL) { distance <- inherits(x, "dist") x <- as.matrix(x) if (!is.numeric(x)) stop("x must be numeric") n <- nrow(x) if (is.null(cluster)) { cluster <- sample(0:(k-1), size = n, replace = TRUE) } else { ## recode cluster as 0,1,...,k-1 cluster <- factor(cluster) if(length(levels(cluster)) != k) stop("cluster vector does not have k clusters") cluster <- as.integer(cluster) - 1 if(length(cluster) != n) stop("data and length of cluster vector must match") } value <- kgroups_start(x, k, cluster, iter.max, distance 
= distance) if (nstart > 1) { objective <- rep(0, nstart) objective[1] <- value$W values <- vector("list", nstart) values[[1]] <- value for (j in 2:nstart) { ## random initialization of cluster labels cluster <- sample(0:(k-1), size = n, replace = TRUE) values[[j]] <- kgroups_start(x, k, cluster, iter.max, distance = distance) objective[j] <- values[[j]]$W } best <- which.min(objective) value <- values[[best]] } obj <- structure(list( call = match.call(), cluster = value$cluster + 1, sizes = value$sizes, within = value$within, W = sum(value$within), count = value$count, iterations = value$it, k = k), class = "kgroups") return (obj) } print.kgroups <- function(x, ...) { cat("\n"); print(x$call) cat("\nK-groups cluster analysis\n") cat(x$k, " groups of size ", x$sizes, "\n") cat("Within cluster distances:\n", x$within) cat("\nIterations: ", x$iterations, " Count: ", x$count, "\n") } fitted.kgroups <- function(object, method = c("labels", "groups"), ...) { method = match.arg(method) if (method == "groups") { k <- object$k CList <- vector("list", k) for (i in 1:k) CList[[i]] <- which(object$cluster == i) return (CList) } return (object$cluster) } energy/NEWS.md0000644000176200001440000001570314300433720012643 0ustar liggesusers# energy 1.7-11 * User level changes - more argument types supported in bcdcor, dcov, dcor, pdcov, pdcor functions and tests - mutualIndep.test test of mutual independence (new) - is.dmatrix() utility function is now exported - calc_dist() (Rcpp export) is exported * Internal changes - C functions dCOVtest() and dCOV() simplified, now expect distance matrices, exponent handled in R - unused C function dCovtest() removed - .arg2dist.matrix() utility to convert valid types of args to distance matrices # energy 1.7-10 * update email address in docs # energy 1.7-9 * bug fix in normal.test * pdcor.test now gives p.value=NA in degenerate case * DCOR is deprecated; use dcor or dcov # energy 1.7-8 * User level changes: - Poisson goodness-of-fit tests - 
EVnormal (data) issue fixed - gsl package required * Internal changes - mvnorm.e: use gsl::hyperg_1F1 - poisMstat in energy.c moved to Rcpp poisMstat.cpp # energy 1.7-7 * User level changes: - dcorT.test replaces dcor.ttest, now deprecated. - dcorT replaces dcor.t, now deprecated. - edist method "discoF" removed * Internal changes - BCDCOR function (used in the high dim. dcorT test) has been revised. - edist method "discoB" correction - changes for compatibility with R 4.0.0 # energy 1.7-6 * User level changes: - normal.test (new) implements the energy test of univariate normality based on the null limit distribution for the composite hypothesis (estimated parameters). - dataset EVnormal (new) of eigenvalues for energy test of normality. - mvnorm.test replaces mvnorm.etest, and mvnorm.etest now is a wrapper for mvnorm.test. # energy 1.7-5 * User level changes: - kgroups: (new) implements energy clustering for a specified number k classes by energy distance criterion, analogous to the k classes of the k-means algorithm. - dcov2d and dcor2d: (new) O(n log n) methods to compute the U or V statistics for real x and y - sortrank() function added (a utility) * Internal changes: - B-tree.cpp: Btree_sum and other internal functions implement binary tree search for faster O(n log n) calculation of paired distances in dcov2d - kgroups.cpp: Rcpp implementation of k-groups algorithm - energy.hclust implementation: replaced C++ code with call to stats::hclust; since R > 3.0.3 it is now equivalent for alpha = 1 with method = "ward.D". Input and return value unchanged except heights from hclust are half. # energy 1.7-4 * User level changes - disco: handle the case when the user argument x is dist with conflicting argument distance=FALSE - dcor.t and dcor.ttest: handle the cases when class of argument x or y conflicts with the distance argument - Split manual page of dcovU into two files. 
- indep.etest and indep.e removed now Defunct (were Deprecated since Version 1.1-0, 2008-04-07; replaced by indep.test). * Internal changes - BCDCOR: handle the cases when class of argument x or y conflicts with the distance argument # energy 1.7-2 * User level changes - Provided new dcor.test function, similar to dcov.test but using the distance correlation as the test statistic. - Number of replicates R for Monte Carlo and permutation tests now matches the argument of the boot::boot function (no default value, user must specify). - If user runs a test with 0 replicates, p-value printed is NA * Internal changes - energy_init.c added for registering routines # energy 1.7-0 * Partial Distance Correlation statistics and tests added - pdcov, pdcor, pdcov.test, pdcor.test - dcovU: unbiased estimator of distance covariance - bcdcor: bias corrected distance correlation - Ucenter, Dcenter, U_center, D_center: double-centering and U-centering utilities - U_product: inner product in U-centered Hilbert space * updated NAMESPACE and DESCRIPTION imports, etc. * revised package Title and Description in DESCRIPTION * package now links to Rcpp * mvnorm c code ported to c++ (mvnorm.cpp); corresponding changes in Emvnorm.R * syntax for bcdcor: "distance" argument removed, now argument can optionally be a dist object * syntax for energy.hclust: first argument must now be a dist object * default number of replicates R in tests: for all tests, R now defaults to 0 or R has no default value. # energy 1.6.2 * inserted GetRNGstate() .. PutRNGState around repl. loop in dcov.c. # energy 1.6.1 * replace Depends with Imports in DESCRIPTION file # energy 1.6.0 * implementation of high-dim distance correlation t-test introduced in JMVA Volume 117, pp. 193-213 (2013). 
* new functions dcor.t, dcor.ttest in dcorT.R * minor changes to tidy other code in dcov.R * removed unused internal function .dcov.test # energy 1.5.0 * NAMESPACE: insert UseDynLib; remove zzz.R, .First.Lib() # energy 1.4-0 * NAMESPACE added. * (dcov.c, Eindep.c) Unused N was removed. * (dcov.c) In case dcov=0, bypass the unnecessary loop that generates replicates (in dCOVtest and dCovTest). In this case dcor=0 and test is not significant. (dcov=0 if one of the samples is constant.) * (Eqdist.R) in eqdist.e and eqdist.etest, method="disco" is replaced by two options: "discoB" (between sample components) and "discoF" (disco F ratio). * (disco.R) Added disco.between and internal functions that compute the disco between-sample component and corresponding test. * (utilities.c) In permute function replaced rand_unif with runif. * (energy.c) In ksampleEtest the pval computation changed from ek/B to (ek+1)/(B+1) as it should be for a permutation test, and unneeded int* n removed. # energy 1.3-0 * In distance correlation, distance covariance functions (dcov, dcor, DCOR) and dcov.test, arguments x and y can now optionally be distance objects (result of dist function or as.dist). Matrices x and y will always be treated as data. * Functions in dcov.c and utilities.c were modified to support arguments that are distances rather than data. In utilities.c the index_distance function changed. In dcov.c there are many changes. Most importantly for the exported objects, there is now an extra required parameter in the dims argument passed from R. In dCOVtest dims must be a vector c(n, p, q, dst, R) where n is sample size, p and q are dimensions of x and y, dst is logical (TRUE if distances) and R is number of replicates. For dCOV dims must be c(n, p, q, dst). # energy 1.2-0 * disco (distance components) added for one-way layout. * A method argument was added to ksample.e, eqdist.e, and eqdist.etest, method = c("original", "disco"). 
* A method argument was added to edist, which summarizes cluster distances in a table: method = c("cluster","discoB","discoF")) energy/MD50000644000176200001440000000633714351017352012065 0ustar liggesuserse85005c2111c429c5711efced5691b49 *DESCRIPTION 3eeaedcbec9483e39a2c30f40b577419 *NAMESPACE 93eb4bbd36c9907b215a4e89671c0353 *NEWS.md 8e83d2d85a8d5b50f5de2b1328154d6d *R/Ecluster.R 8b428dedf82ffb7e15422ff9f49addf4 *R/Eeqdist.R b7e4b591f66480d8a5a950559e857092 *R/Eindep.R 819cfd13dc83998ba8f8a4676fd450fb *R/Emvnorm.R 3e54679a0568186c3c3fe779b559aff9 *R/Epoisson.R 590e508b1c6f61837a438d6f67170d8c *R/RcppExports.R 25f5594b3278f42521643dc117844d5e *R/centering.R 1343cc6dfc935ea74a4ebe3168798451 *R/dcorT.R f44a90e249cd31861f62d39d6603c5a9 *R/dcov.R 6c4131da9ab7527b89e3cfa0113ea548 *R/dcov2d.R 29451dd38fa9756bc4aaba8e6ffe05ef *R/dcovu.R cc690c1e9534097cffdfa4bf1e1ce2de *R/disco.R f522f49e669e143bc9c801fc451464b9 *R/edist.R 239b63f2293937538b6c684e0229be21 *R/energy-defunct.R 9f0d5f4032755f17e3d3c691db5b7944 *R/energy-deprecated.R 61dd5005a0b03370beb62c85ef859b22 *R/kgroups.R eeedb8d6f3cbf2ff6d604dcf84fd828b *R/mutual-indep.R d09b6df9dac87fafeee043b0d138d123 *R/pdcor.R 085981e8e418faabc1d18dad3917632c *R/pdcov-test.R 9fa3e75883252d3d1a136d9628ba292e *R/util.R 8ab90b7760f971c9b444b931fa17bb2e *README.md cb4924f3573383fca290a0a8e2ca5294 *build/partial.rdb 05994c5a68aed671284220d94539c7e7 *data/EVnormal.rda 07c07091e554cb8ebc96346574fe44ef *man/U_product.Rd 2286a6f06d2d9f66698230fa6ffd4fa8 *man/centering.Rd 0b4dbb284407dee1a1e902f15d99e81c *man/dcorT.Rd 42df2f7e0131850e1ee5be962d3d471a *man/dcov.Rd a47b3029f82d8dc0e7b94bffc61aeb92 *man/dcov.test.Rd f83145ab57b98a140a26fc4cf755c4c5 *man/dcov2d.Rd 663b4420e78ce8aa61deb2190cd1c768 *man/dcovU_stats.Rd db3c34b9e1aa44c04534aa55f57388af *man/dcovu.Rd e55676cc724ef7a859f65c574a7c3c37 *man/disco.Rd 91fa6eaffa30a4f3e4105a8cb400d86c *man/dmatrix.Rd 4d1ce263595159db8bccda9ff9cd5279 *man/edist.Rd c9c2f726d35a8d67eb2d0270b1ca5ca3 
*man/eigen.Rd bb0eb6109269e2454e68c5226e98cf56 *man/energy-defunct.Rd d1e78ed68572e231459f89107a40c058 *man/energy-deprecated.Rd 81fb28c5d416346f4fea6ab0d8e02b15 *man/energy-package.Rd 00fc5e4dde57b294d3b3601d987f45c8 *man/energy.hclust.Rd 8324a19800d882b6af612aaf6a00a091 *man/eqdist.etest.Rd 675fa2cac0881b2f5372211e83e823de *man/indep.test.Rd 805c2039f6e6b8614a07a02517b72f64 *man/kgroups.Rd 14c09b5c8a859cff8677054f2be3375f *man/mutualIndep.Rd 2d726d1bc1448372edc755883e91d4bd *man/mvI.test.Rd 988447c253b885192c5a09c45e4dc5b7 *man/mvnorm-test.Rd c389351a67a0b66642c8308cdd3beb9d *man/normalGOF.Rd c4789bf7b3ff1615f09b38c95bb338eb *man/pdcor.Rd 80cd59d16c408c5dca17771271f9e298 *man/poisson.Rd 9d7e5a0ae85d2c9960b688f8498b8760 *man/sortrank.Rd e24afcf73f91f493b4cb1a4104107ff4 *src/B-tree.cpp aae8b925991fa87d1b4743280b21db06 *src/Eindep.c 3ffa6d6a3719cef4313383c8c80f0fe5 *src/Rcpp-utilities.cpp e7f9e32793125c2f0bb4d5bda7590c0b *src/RcppExports.cpp 31a302ddf4e13584143ec0c4bc904f6c *src/U-product.cpp c8e92634d49c26689070db219a523899 *src/centering.cpp c40e492efe8d39dc99a7c0dfb818c31c *src/dcov.c e8a63db215ed2cadc2e058a3c3723d80 *src/dcovU.cpp ac6be091b90f05a300b18cfcec301618 *src/energy.c 52d4eace2bb113e701648afed778c4a6 *src/energy_init.c ce8d73b3251fa3e3e1608172d1b15ce8 *src/kgroups.cpp 28437e258074effcc1d13fb20dac4c72 *src/partial-dcor.cpp 6a69a89eb82066ffb0fdace86b07c7a5 *src/poissonM.cpp 299594f1c1f5e614f97e6b4ce8d6a00c *src/projection.cpp 09aac5c6b279db1f47cc09d38bc61f08 *src/utilities.c