energy/0000755000176200001440000000000014662222332011545 5ustar liggesusersenergy/MD50000644000176200001440000000643014662222332012060 0ustar liggesusers76bd728066692ce3ad9818eb55a9156e *DESCRIPTION 1fc2e76f735ecf5fdbd4af3e8b99f3e0 *NAMESPACE 4a5d14ee5fa36ac47f15db3edcea6c98 *NEWS.md 8e83d2d85a8d5b50f5de2b1328154d6d *R/Ecluster.R 8b428dedf82ffb7e15422ff9f49addf4 *R/Eeqdist.R 819cfd13dc83998ba8f8a4676fd450fb *R/Emvnorm.R 3e54679a0568186c3c3fe779b559aff9 *R/Epoisson.R cdf1108eb847c078529f67200302507b *R/RcppExports.R 25f5594b3278f42521643dc117844d5e *R/centering.R 1343cc6dfc935ea74a4ebe3168798451 *R/dcorT.R f44a90e249cd31861f62d39d6603c5a9 *R/dcov.R 6c4131da9ab7527b89e3cfa0113ea548 *R/dcov2d.R 29451dd38fa9756bc4aaba8e6ffe05ef *R/dcovu.R f05e3c3260bc5dbf8bd2f5b5d1020805 *R/disco.R f522f49e669e143bc9c801fc451464b9 *R/edist.R edf0fcc48e1f0acddc56ad31e6a4de79 *R/energy-defunct.R de7bcbdc385c0dbb70c981e3a0591e43 *R/energy-deprecated.R ea3fbf05a2ba5d118e07c6bcdda52638 *R/indep-deprecated.R 61dd5005a0b03370beb62c85ef859b22 *R/kgroups.R eeedb8d6f3cbf2ff6d604dcf84fd828b *R/mutual-indep.R 8a7e3d482ec3a09ef840b3604fdb602c *R/mvI.R d09b6df9dac87fafeee043b0d138d123 *R/pdcor.R 085981e8e418faabc1d18dad3917632c *R/pdcov-test.R 9fa3e75883252d3d1a136d9628ba292e *R/util.R 8ab90b7760f971c9b444b931fa17bb2e *README.md aa5e5a2cbfaea3d9b6cdbe378ac48cee *build/partial.rdb 05994c5a68aed671284220d94539c7e7 *data/EVnormal.rda ff869e82ae29da355ce3ad654b96546c *man/U_product.Rd dab72e9d2cbc4a76880e7b9681b9316f *man/centering.Rd 0b4dbb284407dee1a1e902f15d99e81c *man/dcorT.Rd 42df2f7e0131850e1ee5be962d3d471a *man/dcov.Rd a47b3029f82d8dc0e7b94bffc61aeb92 *man/dcov.test.Rd f83145ab57b98a140a26fc4cf755c4c5 *man/dcov2d.Rd 663b4420e78ce8aa61deb2190cd1c768 *man/dcovU_stats.Rd db3c34b9e1aa44c04534aa55f57388af *man/dcovu.Rd e55676cc724ef7a859f65c574a7c3c37 *man/disco.Rd 91fa6eaffa30a4f3e4105a8cb400d86c *man/dmatrix.Rd 4d1ce263595159db8bccda9ff9cd5279 *man/edist.Rd c9c2f726d35a8d67eb2d0270b1ca5ca3 *man/eigen.Rd 7cbcbe1caa9e3ccba6fe1a166c4a1917 *man/energy-defunct.Rd 092eb8656e69ac2990f27022ae300dcf *man/energy-deprecated.Rd 810dd8fb98a35968f7831d2e3ccee82c *man/energy-package.Rd 00fc5e4dde57b294d3b3601d987f45c8 *man/energy.hclust.Rd 8324a19800d882b6af612aaf6a00a091 *man/eqdist.etest.Rd 07c992c262ba56b2b1fa12ae7b508a1e *man/indep-deprecated.Rd 805c2039f6e6b8614a07a02517b72f64 *man/kgroups.Rd 14c09b5c8a859cff8677054f2be3375f *man/mutualIndep.Rd dcbf81fd036a3a8aadf92a7594c630ea *man/mvI.test.Rd 988447c253b885192c5a09c45e4dc5b7 *man/mvnorm-test.Rd c389351a67a0b66642c8308cdd3beb9d *man/normalGOF.Rd c4789bf7b3ff1615f09b38c95bb338eb *man/pdcor.Rd 80cd59d16c408c5dca17771271f9e298 *man/poisson.Rd 9d7e5a0ae85d2c9960b688f8498b8760 *man/sortrank.Rd e24afcf73f91f493b4cb1a4104107ff4 *src/B-tree.cpp 3ffa6d6a3719cef4313383c8c80f0fe5 *src/Rcpp-utilities.cpp 3f9fb4ce8043bc83756d8ea0a59a498a *src/RcppExports.cpp 31a302ddf4e13584143ec0c4bc904f6c *src/U-product.cpp c8e92634d49c26689070db219a523899 *src/centering.cpp c41a59b832ea916d700ba2df6ebcc5c4 *src/dcov.c e8a63db215ed2cadc2e058a3c3723d80 *src/dcovU.cpp 1aad88d09159988f80c25c83f0fb9f11 *src/energy.c b5aa16649433a8aab9207c6973a57c23 *src/energy_init.c ce8d73b3251fa3e3e1608172d1b15ce8 *src/kgroups.cpp 1aa28da95b1d6cf29da4266da4fff619 *src/mvI.cpp 28437e258074effcc1d13fb20dac4c72 *src/partial-dcor.cpp 6a69a89eb82066ffb0fdace86b07c7a5 *src/poissonM.cpp 299594f1c1f5e614f97e6b4ce8d6a00c *src/projection.cpp 447fd14e535ab32c23283f4a327d1570 *src/utilities.c 
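## The block above is the package's auto-generated MD5 manifest (one checksum per file).
## A minimal sketch of how such a manifest can be checked from an unpacked source tree
## with base R's tools::md5sum(); the "energy/" directory path is an assumption made for
## illustration and is not part of the sources themselves.
manifest <- read.table("energy/MD5", col.names = c("md5", "file"),
                       stringsAsFactors = FALSE)
files  <- sub("^\\*", "", manifest$file)             # drop the leading '*' markers
hashes <- unname(tools::md5sum(file.path("energy", files)))
all(hashes == manifest$md5)                          # TRUE if every file is intact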
energy/R/0000755000176200001440000000000014662213476011757 5ustar liggesusersenergy/R/energy-defunct.R0000644000176200001440000000053514661117203015012 0ustar liggesusers## defunct functions from the energy package dcor.ttest <- function(x, y, distance=FALSE) { .Defunct(new = "dcorT.test", package = "energy", msg = "dcort.ttest replaced by dcorT.test") } dcor.t <- function(x, y, distance=FALSE) { .Deprecated(new = "dcorT", package = "energy", msg = "dcor.t replaced by dcorT") } energy/R/RcppExports.R0000644000176200001440000000214414662213476014374 0ustar liggesusers# Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 Btree_sum <- function(y, z) { .Call(`_energy_Btree_sum`, y, z) } calc_dist <- function(x) { .Call(`_energy_calc_dist`, x) } U_product <- function(U, V) { .Call(`_energy_U_product`, U, V) } D_center <- function(Dx) { .Call(`_energy_D_center`, Dx) } U_center <- function(Dx) { .Call(`_energy_U_center`, Dx) } dcovU_stats <- function(Dx, Dy) { .Call(`_energy_dcovU_stats`, Dx, Dy) } kgroups_start <- function(x, k, clus, iter_max, distance) { .Call(`_energy_kgroups_start`, x, k, clus, iter_max, distance) } Istat <- function(Dx, Dy) { .Call(`_energy_Istat`, Dx, Dy) } Istats <- function(Dx, Dy, R) { .Call(`_energy_Istats`, Dx, Dy, R) } partial_dcor <- function(Dx, Dy, Dz) { .Call(`_energy_partial_dcor`, Dx, Dy, Dz) } partial_dcov <- function(Dx, Dy, Dz) { .Call(`_energy_partial_dcov`, Dx, Dy, Dz) } .poisMstat <- function(x) { .Call(`_energy_poisMstat`, x) } projection <- function(Dx, Dz) { .Call(`_energy_projection`, Dx, Dz) } energy/R/Ecluster.R0000644000176200001440000000100414005374454013656 0ustar liggesusers energy.hclust <- function(dst, alpha = 1) { if (!inherits(dst, "dist")) stop("The first argument must be a dist object.") d <- dst n <- attr(d, "Size") if (!isTRUE(all.equal(alpha, 1))) { if (alpha > 2) warning("Exponent alpha should be in (0,2]") if (alpha < 0) stop("Cannot use negative exponent on distance.") d <- d^alpha } ## heights of hclust are half of energy; otherwise equivalent return(hclust(d, method = "ward.D")) } energy/R/dcov.R0000644000176200001440000001004514251416000013013 0ustar liggesusersdcov.test <- function(x, y, index=1.0, R=NULL) { ## check for valid number of replicates R method <- "Specify the number of replicates R (R > 0) for an independence test" if (! 
is.null(R)) { R <- floor(R) if (R < 1) R <- 0 if (R > 0) method <- "dCov independence test (permutation test)" } else { R <- 0 } Dx <- .arg2dist.matrix(x) Dy <- .arg2dist.matrix(y) if (!isTRUE(all.equal(index, 1.0))) { Dx <- Dx^index Dy <- Dy^index } n <- nrow(Dx) m <- nrow(Dy) if (n != m) stop("Sample sizes must agree") stat <- dcorr <- reps <- 0 dcov <- rep(0, 4) if (R > 0) reps <- rep(0, R) pval <- 1 dims <- c(n, ncol(Dx), ncol(Dy), R) # dcov = [dCov,dCor,dVar(x),dVar(y)] a <- .C("dCOVtest", x = as.double(t(Dx)), y = as.double(t(Dy)), nrow = as.integer(nrow(Dx)), nreps = as.integer(R), reps = as.double(reps), DCOV = as.double(dcov), pval = as.double(pval), PACKAGE = "energy") # test statistic is n times the square of dCov statistic stat <- n * a$DCOV[1]^2 dcorr <- a$DCOV V <- dcorr[[1]] names(stat) <- "nV^2" names(V) <- "dCov" dataname <- paste("index ", index, ", replicates ", R, sep="") pval <- ifelse (R < 1, NA, a$pval) e <- list( statistic = stat, method = method, estimate = V, estimates = dcorr, p.value = pval, replicates = n* a$reps^2, n = n, data.name = dataname) class(e) <- "htest" return(e) } dcor.test <- function(x, y, index=1.0, R) { # distance correlation test for multivariate independence # like dcov.test but using dcor as the test statistic if (missing(R)) R <- 0 R <- ifelse(R > 0, floor(R), 0) RESULT <- dcov.test(x, y, index=index, R) # this test statistic is n times the square of dCov statistic DCOVteststat <- RESULT$statistic DCOVreplicates <- RESULT$replicates # RESULT$estimates = [dCov,dCor,dVar(x),dVar(y)] # dVar are invariant under permutation of sample indices estimates = RESULT$estimates names(estimates) <- c("dCov", "dCor", "dVar(X)", "dVar(Y)") DCORteststat <- RESULT$estimates[2] dvarX <- RESULT$estimates[3] dvarY <- RESULT$estimates[4] n <- RESULT$n if (R > 0) { DCORreps <- sqrt(DCOVreplicates / n) / sqrt(dvarX * dvarY) p.value <- (1 + sum(DCORreps >= DCORteststat)) / (1 + R) } else { p.value <- NA DCORreps <- NA } names(DCORteststat) <- "dCor" dataname <- paste("index ", index, ", replicates ", R, sep="") method <- ifelse(R > 0, "dCor independence test (permutation test)", "Specify the number of replicates R>0 for an independence test") e <- list( method = method, statistic = DCORteststat, estimates = estimates, p.value = p.value, replicates = DCORreps, n = n, data.name = dataname) class(e) <- "htest" return(e) } .dcov <- function(x, y, index=1.0) { # distance covariance statistic for independence # dcov = [dCov,dCor,dVar(x),dVar(y)] (vector) # this function provides the fast method for computing dCov # it is called by the dcov and dcor functions Dx <- .arg2dist.matrix(x) Dy <- .arg2dist.matrix(y) if (!isTRUE(all.equal(index, 1.0))) { Dx <- Dx^index Dy <- Dy^index } n <- nrow(Dx) m <- nrow(Dy) if (n != m) stop("Sample sizes must agree") dims <- c(n, ncol(Dx), ncol(Dy)) idx <- 1:dims[1] DCOV <- numeric(4) a <- .C("dCOV", x = as.double(t(Dx)), y = as.double(t(Dy)), nrow = as.integer(n), DCOV = as.double(DCOV), PACKAGE = "energy") return(a$DCOV) } dcov <- function(x, y, index=1.0) { # distance correlation statistic for independence return(.dcov(x, y, index)[1]) } dcor <- function(x, y, index=1.0) { # distance correlation statistic for independence return(.dcov(x, y, index)[2]) } energy/R/pdcov-test.R0000644000176200001440000000437614251251214014166 0ustar liggesuserspdcov.test <- function(x, y, z, R) { if (missing(R)) R <- 0 Dx <- .arg2dist.matrix(x) Dy <- .arg2dist.matrix(y) Dz <- .arg2dist.matrix(z) n <- nrow(Dx) Pxz <- projection(Dx, Dz) #U-center and compute 
projections Pyz <- projection(Dy, Dz) #PxzU <- U_center(Pxz) #not necessary, because of invariance #PyzU <- U_center(Pyz) teststat <- n * U_product(Pxz, Pyz) ## calc. pdcor den <- sqrt(U_product(Pxz, Pxz) * U_product(Pyz, Pyz)) if (den > 0.0) { estimate <- teststat / (n * den) } else estimate <- 0.0 bootfn <- function(Pxz, i, Pyz) { # generate the permutation replicates of dcovU(Pxz, Pyz) # PxzU and PyzU are the U-centered matrices U_product(Pxz[i, i], Pyz) #RcppExports } if (R > 0 && den > 0.0) { reps <- replicate(R, expr= { i <- sample(1:n) bootfn(Pxz, i, Pyz=Pyz) }) replicates <- n * reps pval <- (1 + sum(replicates > teststat)) / (1 + R) #df <- n * (n-3) / 2 - 2 } else { pval <- NA replicates <- NA } dataname <- paste("replicates ", R, sep="") if (! R>0) dataname <- "Specify R>0 replicates for a test" condition <- (den > 0.0) names(estimate) <- "pdcor" names(teststat) <- "n V^*" e <- list( call = match.call(), method = paste("pdcov test", sep = ""), statistic = teststat, estimate = estimate, p.value = pval, n = n, replicates = replicates, condition = condition, data.name = dataname) class(e) <- "htest" return(e) } pdcor.test <- function(x, y, z, R) { ## x, y, z must be dist. objects or data matrices (no dist matrix) ## all required calc. done in pdcov.test if (missing(R)) R <- 0 result <- pdcov.test(x, y, z, R=R) if (result$condition) { ## if (A*A)(B*B) > 0 nRootV <- result$statistic / result$estimate pdcor_reps <- result$replicates / nRootV } else pdcor_reps <- NA e <- list( call = match.call(), method = paste("pdcor test", sep = ""), statistic = result$estimate, estimate = result$estimate, p.value = result$p.value, n = result$n, replicates = pdcor_reps, condition = result$condition, data.name = result$data.name) class(e) <- "htest" return(e) } energy/R/dcov2d.R0000644000176200001440000001200014300165200013230 0ustar liggesusersdcor2d<- function(x, y, type = c("V", "U")) { ## computes dcor^2 or bias-corrected dcor^2 by O(n log n) algorithm ## bivariate data only: (x,y) in R^2 ## should be faster than direct calc. 
for big n type <- match.arg(type) ## argument checking in dcov2d stat <- dcov2d(x, y, type, all.stats=TRUE) dvarX <- stat[2] dvarY <- stat[3] R2 <- 0.0 if (abs(dvarX*dvarY > 10*.Machine$double.eps)) R2 <- stat[1] / sqrt(dvarX*dvarY) return (R2) } dcov2d<- function(x, y, type=c("V", "U"), all.stats=FALSE) { ## O(n log n) computation of dcovU or dcov^2 (V^2) for (x, y) in R^2 only type <- match.arg(type) if (!is.vector(x) || !is.vector(y)) { if (NCOL(x) > 1 || NCOL(y) > 1) stop("this method is only for univariate x and y") } x <- as.vector(x) y <- as.vector(y) n <- length(x) if (n != length(y)) stop("sample sizes must agree") Sums <- .dcovSums2d(x, y, all.sums=all.stats) if (type =="V") { d1 <- n^2 d2 <- n^3 d3 <- n^4 } else { d1 <- n * (n - 3) d2 <- d1 * (n - 2) d3 <- d2 * (n - 1) } dCov2d <- Sums$S1/d1 - 2*Sums$S2/d2 + Sums$S3/d3 if (all.stats) { dvarX <- Sums$S1a/d1 - 2*Sums$S2a/d2 + Sums$S3a/d3 dvarY <- Sums$S1b/d1 - 2*Sums$S2b/d2 + Sums$S3b/d3 } rval <- ifelse(type=="V", c(V=dCov2d), c(U=dCov2d)) if (all.stats) rval <- c(rval, dvarX=dvarX, dvarY=dvarY) return (rval) } .dcovSums2d <- function(x, y, all.sums = FALSE) { ## compute the sums S1, S2, S3 of distances for dcov^2 ## dCov^2 <- S1/d1 - 2 * S2/d2 + S3/d3 ## denominators differ for U-statistic, V-statisic ## if all.sums==TRUE, also return sums for dVar and kernel if (is.matrix(x) || is.matrix(y)) { if (ncol(x) > 1 || ncol(y) > 1) stop("Found multivariate (x,y) in .dcovSums2d, expecting bivariate") } n <- length(x) SRx <- sortrank(x) SRy <- sortrank(y) ## compute the rowSums of the distance matrices a. <- .rowSumsDist1(x, SRx) b. <- .rowSumsDist1(y, SRy) S2 <- sum(a. * b.) a.. <- sum(a.) b.. <- sum(b.) S3 <- sum(a.) * sum(b.) ## also need order and rank for y[order(x)] in gamma1() x1 <- SRx$x y1 <- y[SRx$ix] SRy1 <- sortrank(y1) ones <- rep(1, n) g_1 <- .gamma1(x1=x1, y1=y1, z1=ones, SRx=SRx, SRy1=SRy1) g_x <- .gamma1(x1=x1, y1=y1, z1=x1, SRx=SRx, SRy1=SRy1) g_y <- .gamma1(x1=x1, y1=y1, z1=y1, SRx=SRx, SRy1=SRy1) g_xy <- .gamma1(x1=x1, y1=y1, z1=x1*y1, SRx=SRx, SRy1=SRy1) S1 <- sum(x * y * g_1 + g_xy - x * g_y - y * g_x) L <- list(S1=S1, S2=S2, S3=S3, S1a=NA, S1b=NA, S2a=NA, S2b=NA, S3a=NA, S3b=NA, rowsumsA=NA, rowsumsB=NA, sumA=NA, sumB=NA) if (all.sums) { L$S1a <- 2 * n * (n-1) * var(x) L$S1b <- 2 * n * (n-1) * var(y) L$S2a <- sum(a.^2) L$S2b <- sum(b.^2) L$S3a <- a..^2 L$S3b <- b..^2 L$rowsumsA <- a. L$rowsumsB <- b. L$sumA <- a.. L$sumB <- b.. } return (L); } .dvarU2 <- function(x, SRx = NULL) { ## O(n log n) computation of dvarU for univariate x only ## this is an internal function that will do a stand-alone dVar calc. ## but it is not faster than dcovU2(x, x) unless we supply ## the precomputed sort + rank results in SRx n <- length(x) ## compute the rowSums of the distance matrices if (is.null(SRx)) SRx <- sortrank(x) a. <- .rowSumsDist1(x, SRx) S2 <- sum(a. * a.) 
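## Not run (illustrative check, assuming the energy package is loaded): for univariate
## samples the O(n log n) statistics above should agree with the general O(n^2) versions
## up to numerical error, since dcov2d() returns the squared V-statistic:
##   set.seed(1); x <- rnorm(200); y <- x + rnorm(200)
##   all.equal(as.numeric(dcov2d(x, y, type = "V")), dcov(x, y)^2)
##   all.equal(as.numeric(dcor2d(x, y)), dcor(x, y)^2)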
S3 <- sum(a.)^2 ## also need order and rank for y[order(x)] in gamma1() x1 <- SRx$x x2 <- x1 SRx1 <- sortrank(x1) ones <- rep(1, n) g_1 <- .gamma1(x1=x1, y1=x2, z1=ones, SRx, SRx1) g_x <- .gamma1(x1=x1, y1=x2, z1=x1, SRx, SRx1) g_xx <- .gamma1(x1=x1, y1=x2, z1=x1*x2, SRx, SRx1) S1 <- sum(x^2 * g_1 + g_xx - 2 * x * g_x) d1 <- n * (n - 3) d2 <- d1 * (n - 2) d3 <- d2 * (n - 1) dVar <- S1/d1 - 2 * S2/d2 + S3/d3 return(dVar) } .gamma1 <- function(x1, y1, z1, SRx, SRy1) { # computes the terms of the sum (ab) in dcovU # original sample (x_i, y_i, z_i) # triples (x1_i, y1_i, z1_i) are sorted by ix=order(x) # SRx is the result of sortrank(x), original order # SRy1 is the result of sortrank(y1), y1=y[order(x)] # pre-compute SRx, SRy1 to avoid repeated sort and rank # n <- length(x1) ix <- SRx$ix #order(x) rankx <- SRx$r #ranks of original sample x ## ranks and order vector for this permutation of sample y1 iy1 <- SRy1$ix #order(y1) ranky1 <- SRy1$r #rank(y1) ## the partial sums in the formula g_1 psumsy1 <- (cumsum(as.numeric(z1[iy1])) - z1[iy1])[ranky1] psumsx1 <- cumsum(as.numeric(z1)) - z1 gamma1 <- Btree_sum(y=ranky1, z=z1) #y1 replaced by rank(y1) g <- sum(z1) - z1 - 2 * psumsx1 - 2 * psumsy1 + 4 * gamma1 g <- g[rankx] } .rowSumsDist1 <- function(x, Sx = NULL) { ## for univariate samples, equivalent to rowSums(as.matrix(dist(x))) ## but much faster ## Sx is a sortrank object usually pre-computed here ## x is the data vector, Sx$x is sort(x) if (is.null(Sx)) Sx <- sortrank(x) n <- length(x) r <- Sx$r #ranks z <- Sx$x #ordered sample x psums1 <- (cumsum(as.numeric(z)) - z)[r] (2*(r-1)-n)*x + sum(x) - 2*psums1 } energy/R/centering.R0000644000176200001440000000127614005374454014061 0ustar liggesusers## use the Rcpp exported function U_center or D_center ## the utilities in this file are provided for reference and historical reasons Dcenter <- function(x) { ## x is a dist object or data matrix if (!inherits(x, "dist")) x <- dist(x) d <- as.matrix(x) n <- nrow(d) m <- rowSums(d) M <- sum(m) / n^2 m <- m / n a <- sweep(d, 1, m) b <- sweep(a, 2, m) B <- b + M } Ucenter <- function(x) { ## x is a dist object or data matrix if (!inherits(x, "dist")) x <- dist(x) d <- as.matrix(x) n <- nrow(d) m <- rowSums(d) M <- sum(m) / ((n-1)*(n-2)) m <- m / (n-2) a <- sweep(d, 1, m) b <- sweep(a, 2, m) B <- b + M diag(B) <- 0 B } energy/R/kgroups.R0000644000176200001440000000373614005374454013600 0ustar liggesusers kgroups <- function(x, k, iter.max = 10, nstart = 1, cluster = NULL) { distance <- inherits(x, "dist") x <- as.matrix(x) if (!is.numeric(x)) stop("x must be numeric") n <- nrow(x) if (is.null(cluster)) { cluster <- sample(0:(k-1), size = n, replace = TRUE) } else { ## recode cluster as 0,1,...,k-1 cluster <- factor(cluster) if(length(levels(cluster)) != k) stop("cluster vector does not have k clusters") cluster <- as.integer(cluster) - 1 if(length(cluster) != n) stop("data and length of cluster vector must match") } value <- kgroups_start(x, k, cluster, iter.max, distance = distance) if (nstart > 1) { objective <- rep(0, nstart) objective[1] <- value$W values <- vector("list", nstart) values[[1]] <- value for (j in 2:nstart) { ## random initialization of cluster labels cluster <- sample(0:(k-1), size = n, replace = TRUE) values[[j]] <- kgroups_start(x, k, cluster, iter.max, distance = distance) objective[j] <- values[[j]]$W } best <- which.min(objective) value <- values[[best]] } obj <- structure(list( call = match.call(), cluster = value$cluster + 1, sizes = value$sizes, within = value$within, W = 
sum(value$within), count = value$count, iterations = value$it, k = k), class = "kgroups") return (obj) } print.kgroups <- function(x, ...) { cat("\n"); print(x$call) cat("\nK-groups cluster analysis\n") cat(x$k, " groups of size ", x$sizes, "\n") cat("Within cluster distances:\n", x$within) cat("\nIterations: ", x$iterations, " Count: ", x$count, "\n") } fitted.kgroups <- function(object, method = c("labels", "groups"), ...) { method = match.arg(method) if (method == "groups") { k <- object$k CList <- vector("list", k) for (i in 1:k) CList[[i]] <- which(object$cluster == i) return (CList) } return (object$cluster) } energy/R/dcorT.R0000644000176200001440000000432014005374454013147 0ustar liggesusers### dcorT.R ### implementation of the distance correlation t-test ### for high dimension Astar <- function(d) { ## d is a distance matrix or distance object ## modified or corrected doubly centered distance matrices ## denoted A* (or B*) in JMVA t-test paper (2013) if (inherits(d, "dist")) d <- as.matrix(d) n <- nrow(d) if (n != ncol(d)) stop("Argument d should be distance") m <- rowMeans(d) M <- mean(d) a <- sweep(d, 1, m) b <- sweep(a, 2, m) A <- b + M #same as plain A #correction to get A^* A <- A - d/n diag(A) <- m - M (n / (n-1)) * A } BCDCOR <- function(x, y) { ## compute bias corrected distance correlation ## internal function not in NAMESPACE (external: use bcdcor) ## revised version from v. 1.7-7 if (!inherits(x, "dist")) { x <- as.matrix(dist(x)) } else { x <- as.matrix(x) } if (!inherits(y, "dist")) { y <- as.matrix(dist(y)) } else { y <- as.matrix(y) } n <- NROW(x) AA <- Astar(x) BB <- Astar(y) XY <- sum(AA*BB) - (n/(n-2)) * sum(diag(AA*BB)) XX <- sum(AA*AA) - (n/(n-2)) * sum(diag(AA*AA)) YY <- sum(BB*BB) - (n/(n-2)) * sum(diag(BB*BB)) list(bcR=XY / sqrt(XX*YY), XY=XY/n^2, XX=XX/n^2, YY=YY/n^2, n=n) } dcorT <- function(x, y) { # computes the t statistic for corrected high-dim dCor # should be approximately student T # x and y are observed samples or distance objects r <- BCDCOR(x, y) Cn <- r$bcR n <- r$n M <- n*(n-3)/2 sqrt(M-1) * Cn / sqrt(1-Cn^2) } dcorT.test <- function(x, y) { # x and y are observed samples or distance objects dname <- paste(deparse(substitute(x)),"and", deparse(substitute(y))) stats <- BCDCOR(x, y) bcR <- stats$bcR n <- stats$n M <- n * (n-3) / 2 df <- M - 1 names(df) <- "df" tstat <- sqrt(M-1) * bcR / sqrt(1-bcR^2) names(tstat) <- "T" estimate <- bcR names(estimate) <- "Bias corrected dcor" pval <- 1 - pt(tstat, df=df) method <- "dcor t-test of independence for high dimension" rval <- list(statistic = tstat, parameter = df, p.value = pval, estimate=estimate, method=method, data.name=dname) class(rval) <- "htest" return(rval) } energy/R/dcovu.R0000644000176200001440000000164314251434432013215 0ustar liggesusers## dcovu.R ## unbiased dcov^2 and bias-corrected dcor^2 ## bcdcor <- function(x, y) { ## compute bias corrected distance correlation dcorU(x, y) } dcovU <- function(x, y) { ## unbiased dcov^2 if (!inherits(x, "dist")) x <- dist(x) if (!inherits(y, "dist")) y <- dist(y) x <- as.matrix(x) y <- as.matrix(y) n <- nrow(x) m <- nrow(y) if (n != m) stop("sample sizes must agree") if (! (all(is.finite(c(x, y))))) stop("data contains missing or infinite values") estimates <- dcovU_stats(x, y) #RcppExports return (estimates[1]) } dcorU <- function(x, y) { ## unbiased dcov^2 x <- .arg2dist.matrix(x) y <- .arg2dist.matrix(y) n <- nrow(x) m <- nrow(y) if (n != m) stop("sample sizes must agree") if (! 
(all(is.finite(c(x, y))))) stop("data contains missing or infinite values") estimates <- dcovU_stats(x, y) #RcppExports return (estimates[2]) } energy/R/mvI.R0000644000176200001440000000252114661474727012644 0ustar liggesusersindep.test<- function(x, y, method = c("dcov","mvI"), index = 1, R) { # two energy tests for multivariate independence type <- match.arg(method) if (type == "dcov") return(dcov.test(x, y, index, R)) else if (type == "mvI") return(mvI.test(x, y, R)) } mvI <- function(x, y) { # energy statistic for multivariate independence # returns dependence coefficient I_n n <- NROW(x) m <- NROW(y) Dx <- .arg2dist.matrix(x) Dy <- .arg2dist.matrix(y) return(Istat(Dx, Dy)) #Rcpp } mvI.test<- function(x, y, R) { # an energy test for multivariate independence # not based on dCov or dCor n <- NROW(x) m <- NROW(y) if (n != m || n < 2) stop("Sample sizes must agree") Dx <- .arg2dist.matrix(x) Dy <- .arg2dist.matrix(y) stats <- Istats(Dx, Dy, R) stat <- n * stats[1]^2 est <- stats[1] names(est) <- "I" names(stat) <- "n I^2" dataname <- paste("x (",n," by ",ncol(x), "), y(",n," by ", ncol(y), "), replicates ", R, sep="") if (R > 0) { p.value = (1 + sum(stats[-1] > est)) / (R+1) } else { p.value = NA } e <- list( method = "mvI energy test of independence", statistic = stat, estimate = est, replicates = stats[-1], p.value = p.value, data.name = dataname) class(e) <- "htest" e } energy/R/Epoisson.R0000644000176200001440000000651214253334600013672 0ustar liggesuserspoisson.tests <- function(x, R, test="all") { # parametric bootstrap tests of Poisson distribution # poisson.e is the energy GOF statistic # poisson.m is the mean distance statistic # (not related to the test stats::poisson.test) if (!is.integer(x) || any(x < 0)) { warning("sample must be non-negative integers") return(NULL) } test <- tolower(test) poisson.stats <- function(x) { c(poisson.m(x), poisson.e(x)) } stat <- switch(test, "m" = poisson.m, "e" = poisson.e, poisson.stats) method <- switch(test, m=c("M-CvM","M-AD"), e="Energy", c("M-CvM","M-AD","Energy")) method <- paste(method, " test", sep="") n <- length(x) lambda <- mean(x) if (missing(R) || is.null(R)) { R <- 0 message("Specify R > 0 replicates for MC test") } bootobj <- boot::boot(x, statistic = stat, R = R, sim = "parametric", ran.gen = function(x, y) {rpois(n, lambda)}) N <- length(bootobj$t0) p <- rep(NA, times=N) if (R > 0) { for (i in 1:N) { p[i] <- 1 - mean(bootobj$t[,i] < bootobj$t0[i]) } } # a data frame, not an htest object # comparable to broom::tidy on an htest object RVAL <- data.frame(estimate=lambda, statistic=bootobj$t0, p.value=p, method=method) return(RVAL) } poisson.mtest <- function(x, R=NULL) { if (is.null(R)) R <- 0 rval <- poisson.tests(x, R, test="M") DNAME <- paste(deparse1(substitute(x)), "replicates: ", R) stat <- rval$statistic[1] names(stat) <- "M-CvM" e <- list( method = paste("Poisson M-test", sep = ""), statistic = stat, p.value = rval$p.value[1], data.name = DNAME, estimate = rval$estimate[1]) class(e) <- "htest" e } poisson.etest <- function(x, R=NULL) { if (is.null(R)) R <- 0 rval <- poisson.tests(x, R, test="E") DNAME <- paste(deparse1(substitute(x)), "replicates: ", R) stat <- rval$statistic names(stat) <- "E" e <- list( method = paste("Poisson E-test", sep = ""), statistic = stat, p.value = rval$p.value, data.name = paste("replicates: ", R, sep=""), estimate = rval$estimate) class(e) <- "htest" e } poisson.m <- function(x) { # mean distance statistic for Poissonity if (any(!is.integer(x)) || any(x < 0)) { warning("sample must be 
non-negative integers") return(NULL) } stats <- .poisMstat(x) names(stats) <- c("M-CvM", "M-AD") return(stats) } poisson.e <- function(x) { # energy GOF statistic for Poissonity if (any(!is.integer(x)) || any(x < 0)) { warning("sample must be non-negative integers") return(NULL) } lambda <- mean(x) n <- length(x) ## E|y-X| for X Poisson(lambda) (vectorized) Px <- ppois(x, lambda) Px1 <- ppois(x-1, lambda) meanvec <- 2*x*Px - 2*lambda*Px1 + lambda - x ## second mean E|X-X'| a <- 2 * lambda EXX <- a * exp(-a) * (besselI(a, 0) + besselI(a, 1)) ## third mean = sum_{i,j} |x_i - x_j| / n^2 K <- seq(1 - n, n - 1, 2) y <- sort(x) meanxx <- 2 * sum(K * y) / n^2 stat <- n * (2 * mean(meanvec) - EXX - meanxx) names(stat) <- "E" return(stat) } energy/R/Eeqdist.R0000644000176200001440000000712614005374454013501 0ustar liggesuserseqdist.e <- function(x, sizes, distance = FALSE, method = c("original","discoB","discoF")) { ## multivariate E-statistic for testing equal distributions ## x: matrix of pooled sample or distance matrix ## sizes: vector of sample sizes ## distance: logical, TRUE if x is a distance matrix, otherwise false ## method: original (default) or disco between components, or disco F ratio method <-match.arg(method) if (method=="discoB") { g <- as.factor(rep(1:length(sizes), sizes)) RVAL <- disco(x, factors=g, distance=distance, R=0, method=method) } else { RVAL <- eqdist.etest(x, sizes, distance = distance, R=0, method=method)$statistic } RVAL } eqdist.etest <- function(x, sizes, distance = FALSE, method = c("original","discoB","discoF"), R) { ## multivariate E-test of the multisample hypothesis of equal distributions ## x: matrix of pooled sample or distance matrix ## sizes: vector of sample sizes ## distance: logical, TRUE if x is a distance matrix, otherwise false ## method: original (default) or disco components ## R: number of replicates ## method <-match.arg(method) if (method=="discoB" || method=="discoF") { g <- as.factor(rep(1:length(sizes), sizes)) # for other index use disco() function directly return(disco(x, factors=g, distance=distance, index=1.0, R=R, method=method)) } nsamples <- length(sizes) if (nsamples < 2) return (NA) if (min(sizes) < 1) return (NA) if (!is.null(attr(x, "Size"))) distance <- TRUE x <- as.matrix(x) if (NROW(x) != sum(sizes)) stop("nrow(x) should equal sum(sizes)") if (distance == FALSE && nrow(x) == ncol(x)) warning("square data matrix with distance==FALSE") d <- NCOL(x) if (distance == TRUE) d <- 0 str <- "Multivariate " if (d == 1) str <- "Univariate " if (d == 0) str <- "" e0 <- 0.0 repl <- rep(0, R) pval <- 1.0 b <- .C("ksampleEtest", x = as.double(t(x)), byrow = as.integer(1), nsamples = as.integer(nsamples), sizes = as.integer(sizes), dim = as.integer(d), R = as.integer(R), e0 = as.double(e0), e = as.double(repl), pval = as.double(pval), PACKAGE = "energy") names(b$e0) <- "E-statistic" sz <- paste(sizes, collapse = " ", sep = "") methodname <- paste(str, length(sizes), "-sample E-test of equal distributions", sep = "") dataname <- paste("sample sizes ", sz, ", replicates ", R, sep="") e <- list( call = match.call(), method = methodname, statistic = b$e0, p.value = b$pval, data.name = dataname) class(e) <- "htest" e } ksample.e <- function(x, sizes, distance = FALSE, method = c("original","discoB","discoF"), ix = 1:sum(sizes)) { ## computes k-sample E-statistics for equal distributions ## retained for backward compatibility or use with boot ## (this function simply passes arguments to eqdist.e) ## ## x: pooled sample or distance matrix ## sizes: 
vector of sample sizes ## distance: TRUE if x is a distance matrix, otherwise FALSE ## method: default (original) or disco between components or disco F ratio ## ix: a permutation of row indices of x ## x <- as.matrix(x) method <- match.arg(method) eqdist.e(x[ix,], sizes=sizes, distance=distance, method=method) } energy/R/indep-deprecated.R0000644000176200001440000000072114661662303015273 0ustar liggesusers# deprecated independence test indep.test<- function(x, y, method = c("dcov","mvI"), index = 1, R) { # two energy tests for multivariate independence .Deprecated(new = "dcov.test", package = "energy", msg = "indep.test is deprecated, replaced by dcov.test or mvI.test") type <- match.arg(method) if (type == "dcov") return(dcov.test(x, y, index, R)) else if (type == "mvI") return(mvI.test(x, y, R)) } energy/R/mutual-indep.R0000644000176200001440000000161514300172713014474 0ustar liggesusersmutualIndep.test <- function(x, R) { if (NCOL(x) < 2) { stop("Expecting two or more samples") } bootfn <- function(x, i) { d <- ncol(x) dc <- numeric(d-1) for (k in 1:(d-1)) { dc[k] <- energy::bcdcor(x[i,k], x[,(k+1):d]) } return (dc) } b <- boot::boot(x, bootfn, sim="permutation", R=R) t0 <- sum(b$t0) tp <- rowSums(b$t) pval <- (1 + sum(tp > t0)) / (R + 1) estimate <- round(b$t0, 3) names(t0) <- "Sum(R*)" names(estimate) <- paste0("R*", 1:length(b$t0)) method <- paste("Energy Test of Mutual Independence") call <- match.call() NOTE <- "statistic=sum(bcdcor); permutation test" rval <- list(statistic = t0, p.value = pval, call = call, data.name=paste(deparse(substitute(x))," dim ", paste(dim(x), collapse=",")), estimate=estimate, method=method, note=NOTE) class(rval) <- "power.htest" return(rval) } energy/R/edist.R0000644000176200001440000000443214005374454013210 0ustar liggesusersedist <- function(x, sizes, distance = FALSE, ix = 1:sum(sizes), alpha = 1, method = c("cluster","discoB")) { # computes the e-dissimilarity matrix between k samples or clusters # x: pooled sample or Euclidean distances # sizes: vector of sample (cluster) sizes # distance: TRUE if x is a distance matrix, otherwise FALSE # ix: a permutation of row indices of x # alpha: distance exponent # method: cluster distances or disco statistics # k <- length(sizes) if (k == 1) return (as.dist(0.0)) if (k < 1) return (NA) e <- matrix(nrow=k, ncol=k) n <- cumsum(sizes) m <- 1 + c(0, n[1:(k-1)]) if (is.vector(x)) x <- matrix(x, ncol=1) if (inherits(x, "dist")) distance <- TRUE if (distance) dst <- as.matrix(x) else dst <- as.matrix(dist(x)) N <- NROW(dst) if (NCOL(dst) != N) stop("distance==TRUE but first argument is not distance") if (alpha != 1) { if (alpha <= 0 || alpha > 2) warning("exponent alpha should be in (0,2]") dst <- dst^alpha } type <- match.arg(method) if (type == "cluster") { for (i in 1:(k - 1)) { e[i, i] <- 0.0 for (j in (i + 1):k) { n1 <- sizes[i] n2 <- sizes[j] ii <- ix[m[i]:n[i]] jj <- ix[m[j]:n[j]] w <- n1 * n2 / (n1 + n2) m11 <- sum(dst[ii, ii]) / (n1 * n1) m22 <- sum(dst[jj, jj]) / (n2 * n2) m12 <- sum(dst[ii, jj]) / (n1 * n2) e[i, j] <- e[j, i] <- w * ((m12 + m12) - (m11 + m22)) } } } if (type == "discoB") { #disco statistics for testing F=G for (i in 1:(k - 1)) { e[i, i] <- 0.0 for (j in (i + 1):k) { n1 <- sizes[i] n2 <- sizes[j] ii <- ix[m[i]:n[i]] jj <- ix[m[j]:n[j]] J <- c(ii,jj) d <- dst[J, J] e[i, j] <- eqdist.e(d, sizes=c(n1, n2), distance=TRUE) e[j, i] <- e[i, j] <- e[i, j] * (n1 + n2) } } e <- 0.5 * e / sum(sizes) #discoB formula } e <- as.dist(e) attr(e,"method") <- paste(method,": index= ", alpha) e } 
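## A minimal usage sketch for the two-sample case of the functions defined above
## (eqdist.etest and edist), assuming the energy package is installed; the sample
## sizes and R = 199 permutation replicates are illustrative choices.
library(energy)
set.seed(123)
x1 <- matrix(rnorm(60), ncol = 2)              # group 1: standard bivariate normal
x2 <- matrix(rnorm(80, mean = 0.5), ncol = 2)  # group 2: mean shifted by 0.5
pooled <- rbind(x1, x2)
sizes <- c(nrow(x1), nrow(x2))
eqdist.etest(pooled, sizes = sizes, R = 199)   # permutation test of equal distributions
edist(pooled, sizes = sizes)                   # e-dissimilarity between the two groups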
energy/R/disco.R0000644000176200001440000001417414661740435013211 0ustar liggesusers### disco tests - implementation of DIStance COmponents methods in: ### ### Rizzo, M.L. and Szekely, G.J. (2010) "DISCO Analysis: A Nonparametric ### Extension of Analysis of Variance, Annals of Applied Statistics ### Vol. 4, No. 2, 1034-1055. ### ### disco: computes the decomposition and test using F ratio ### disco.between: statistic and test using between component ### .disco1: internal computations for one factor ### .disco1stat, .disco1Bstat: internal for boot function ### ### disco <- function(x, factors, distance = FALSE, index = 1, R, method = c("disco", "discoB", "discoF")) { ## x is response or Euclidean distance matrix or dist() object factors ## is a matrix or data frame of group labels distance=TRUE if x is ## distance, otherwise FALSE index is the exponent on distance, in (0,2] ## R is number of replicates for test method: use F ratio (default) or ## between component (discoB) disco method is currently alias for discoF method <- match.arg(method) factors <- data.frame(factors) if (inherits(x, "dist")) distance <- TRUE if (method == "discoB") return(disco.between(x, factors = factors, distance = distance, index = index, R = R)) nfactors <- NCOL(factors) if (distance || inherits(x, "dist")) dst <- as.matrix(x) else dst <- as.matrix(dist(x)) N <- NROW(dst) if (NCOL(dst) != N) stop("distance==TRUE but first argument is not distance") if (!isTRUE(all.equal(index, 1))) dst <- dst^index stats <- matrix(0, nfactors, 6) colnames(stats) <- c("Trt", "Within", "df1", "df2", "Stat", "p-value") for (j in 1:nfactors) { trt <- factors[, j] stats[j, 1:4] <- .disco1(trt = trt, dst = dst) if (R > 0) { b <- boot::boot(data = dst, statistic = .disco1stat, sim = "permutation", R = R, trt = trt) stats[j, 5] <- b$t0 stats[j, 6] <- (sum(b$t > b$t0) + 1)/(R + 1) } else { stats[j, 5] <- .disco1stat(dst, i = 1:nrow(dst), trt = trt) stats[j, 6] <- NA } } methodname <- "DISCO (F ratio)" dataname <- deparse(substitute(x)) total <- sum(stats[1, 1:2]) within <- total - sum(stats[, 1]) Df.trt <- stats[, 3] factor.names <- names(factors) factor.levels <- sapply(factors, nlevels) sizes <- sapply(factors, tabulate) e <- list(call = match.call(), method = methodname, statistic = stats[, 5], p.value = stats[, 6], k = nfactors, N = N, between = stats[, 1], withins = stats[, 2], within = within, total = total, Df.trt = Df.trt, Df.e = nrow(dst) - sum(Df.trt) - 1, index = index, factor.names = factor.names, factor.levels = factor.levels, sample.sizes = sizes, stats = stats) class(e) <- "disco" e } disco.between <- function(x, factors, distance = FALSE, index = 1, R) { ## disco test based on the between-sample component similar to disco ## except that 'disco' test is based on the F ratio disco.between test ## for one factor (balanced) is asymptotically equivalent to k-sample E ## test (test statistics are proportional in that case but not in ## general). 
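## Not run (illustrative one-factor analysis; group sizes and R = 199 are arbitrary):
##   g <- factor(rep(1:2, times = c(30, 40)))
##   y <- c(rnorm(30), rnorm(40, mean = 0.5))
##   disco(y, factors = g, R = 199)          # F-ratio decomposition and permutation test
##   disco.between(y, factors = g, R = 199)  # test based on the between component only
## As noted above, for a single (balanced) factor the between-sample statistic is
## asymptotically equivalent to the k-sample E statistic used by eqdist.etest.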
x is response or Euclidean distance matrix or dist() ## object factors is a matrix or data frame of group labels ## distance=TRUE if x is distance, otherwise FALSE index is the exponent ## on distance, in (0,2] factors <- data.frame(factors) nfactors <- NCOL(factors) if (nfactors > 1) stop("More than one factor is not implemented in disco.between") if (distance || inherits(x, "dist")) dst <- as.matrix(x) else dst <- as.matrix(dist(x)) N <- NROW(dst) if (NCOL(dst) != N) stop("distance==TRUE but first argument is not distance") if (!isTRUE(all.equal(index, 1))) dst <- dst^index trt <- factors[, 1] if (R > 0) { b <- boot::boot(data = dst, statistic = .disco1Bstat, sim = "permutation", R = R, trt = trt) between <- b$t0 reps <- b$t pval <- (1+sum(reps > between)) / (R+1) } else { between <- .disco1Bstat(dst, i = 1:nrow(dst), trt = trt) pval <- NA } if (R == 0) return(between) methodname <- "DISCO (Between-sample)" dataname <- deparse(substitute(x)) names(between) <- "DISCO between statistic" e <- list(call = match.call(), method = methodname, statistic = between, p.value = pval, data.name = dataname) class(e) <- "htest" e } .disco1 <- function(trt, dst) { ## dst is Euclidean distance matrix or power of it trt is the treatment, ## a factor trt <- factor(trt) k <- nlevels(trt) n <- tabulate(trt) N <- sum(n) total <- sum(dst)/(2 * N) y <- as.vector(dst[, 1]) M <- model.matrix(y ~ 0 + trt) G <- t(M) %*% dst %*% M withins <- diag(G)/(2 * n) W <- sum(withins) B <- total - W c(B, W, k - 1, N - k) } .disco1stat <- function(dst, i, trt) { ## i is permuation vector supplied by bootstrap dst is Euclidean ## distance matrix or power of it trt is the treatment, a factor returns ## the disco 'F' ratio idx <- 1:nrow(dst) d <- .disco1(trt = trt[idx[i]], dst = dst) statistic <- (d[1]/d[3])/(d[2]/d[4]) } .disco1Bstat <- function(dst, i, trt) { ## i is permuation vector supplied by bootstrap dst is Euclidean ## distance matrix or power of it trt is the treatment, a factor returns ## the between-sample component (for one factor) idx <- 1:nrow(dst) .disco1(trt = trt[idx[i]], dst = dst)[1] } print.disco <- function(x, ...) 
{ k <- x$k md1 <- x$between/x$Df.trt md2 <- x$within/x$Df.e f0 <- x$statistic print(x$call) cat(sprintf("\nDistance Components: index %5.2f\n", x$index)) cat(sprintf("%-15s %4s %10s %10s %9s %9s\n", "Source", "Df", "Sum Dist", "Mean Dist", "F-ratio", "p-value")) fabb <- abbreviate(x$factor.names, minlength=12) for (i in 1:k) { fname <- fabb[i] cat(sprintf("%-15s %4d %10.5f %10.5f %9.3f %9s\n", fname, x$Df.trt[i], x$between[i], md1[i], f0[i], format.pval(x$p.value[i]))) } cat(sprintf("%-15s %4d %10.5f %10.5f\n", "Within", x$Df.e, x$within, md2)) cat(sprintf("%-15s %4d %10.5f\n", "Total", x$N - 1, x$total)) } energy/R/Emvnorm.R0000644000176200001440000001022314172025577013522 0ustar liggesusersmvnorm.test <- function(x, R) { # parametric bootstrap E-test for multivariate normality if (missing(R)) { method = "Energy test of multivariate normality: (Specify R > 0 for MC test)" R <- 0 } else { method = "Energy test of multivariate normality: estimated parameters" } if (is.vector(x) || NCOL(x)==1) { n <- NROW(x) d <- 1 bootobj <- boot::boot(x, statistic = normal.e, R = R, sim = "parametric", ran.gen = function(x, y) { return(rnorm(n)) }) } else { n <- nrow(x) d <- ncol(x) bootobj <- boot::boot(x, statistic = mvnorm.e, R = R, sim = "parametric", ran.gen = function(x, y) { return(matrix(rnorm(n * d), nrow = n, ncol = d)) }) } if (R > 0) p <- 1 - mean(bootobj$t < bootobj$t0) else p <- NA names(bootobj$t0) <- "E-statistic" e <- list(statistic = bootobj$t0, p.value = p, method = method, data.name = paste("x, sample size ", n, ", dimension ", d, ", replicates ", R, sep = "")) class(e) <- "htest" e } mvnorm.etest <- function(x, R) { return(mvnorm.test(x, R)) } mvnorm.e <- function(x) { # E-statistic for multivariate normality if (is.vector(x) || NCOL(x)==1) return(normal.e(x)) n <- nrow(x) d <- ncol(x) if (n < 2) { warning("sample size must be at least 2") return(NA) } # subtract column means and compute S^(-1/2) z <- scale(x, scale = FALSE) ev <- eigen(var(x), symmetric = TRUE) P <- ev$vectors lambda <- ev$values D <- diag(d) diag(D) <- 1 / sqrt(lambda) y <- z %*% (P %*% D %*% t(P)) if (any(!is.finite(y))) { warning("missing or non-finite y") return(NA) } if (requireNamespace("gsl", quietly=TRUE)) { const <- exp(lgamma((d+1)/2) - lgamma(d/2)) mean2 <- 2*const ysq <- rowSums(y^2) mean1 <- sqrt(2) * const * mean(gsl::hyperg_1F1(-1/2, d/2, -ysq/2)) mean3 <- 2*sum(dist(y)) / n^2 return(n * (2*mean1 - mean2 - mean3)) } else { warning("package gsl required but not found") return (NA) } } normal.e <- function(x) { ## Case 4: unknown parameters x <- as.vector(x) n <- length(x) s <- sd(x) if (!is.finite(s) || !(s > 0)) { warning("sd(x)>0 required") return(NA) } y <- (x - mean(x)) / sd(x) y <- sort(y) K <- seq(1 - n, n - 1, 2) return(2 * (sum(2 * y * pnorm(y) + 2 * dnorm(y)) - n/sqrt(pi) - mean(K * y))) } normal.test <- function(x, method=c("mc", "limit"), R) { ## implements the test for for d=1 ## Case 4: composite hypothesis method <- match.arg(method) estimate <- c(mean(x), sd(x)) names(estimate) <- c("mean", "sd") if (method == "mc") { ## Monte Carlo approach if (missing(R)) R <- 0 e <- energy::mvnorm.etest(x, R=R) e$method <- "Energy test of normality" e$method <- ifelse(R > 0, paste0(e$method,": estimated parameters"), paste0(e$method, " (Specify R > 0 for MC test)")) e$estimate <- estimate return(e) } ## implement test using asymptotic distribution for p-value if (!is.numeric(x) || (!is.vector(x) && NCOL(x) > 1)) { warning("x must be a numeric vector") return (NA) } else { x <- as.vector(x, 
mode="numeric") } n <- length(x) t0 <- normal.e(x) names(t0) <- "statistic" ## load pre-computed eigenvalues ev <- energy::EVnormal[, "Case4"] if (requireNamespace("CompQuadForm", quietly=TRUE)) { p <- CompQuadForm::imhof(t0, ev)$Qq } else { warning("limit distribution method requires CompQuadForm package for p-value") p <- NA } estimate <- c(mean(x), sd(x)) names(estimate) <- c("mean", "sd") e <- list(statistic = t0, p.value = p, method = paste("Energy test of normality: limit distribution"), estimate = estimate, data.name = "Case 4: composite hypothesis, estimated parameters") class(e) <- "htest" e } energy/R/energy-deprecated.R0000644000176200001440000000244314661117013015461 0ustar liggesusers## deprecated functions in energy package DCOR <- function(x, y, index=1.0) { # distance covariance and correlation statistics # alternate method, implemented in R without .C call # this method is usually slower than the C version .Deprecated(new = "dcor", package = "energy", msg = "DCOR is deprecated, replaced by dcor or dcov") if (!inherits(x, "dist")) x <- dist(x) if (!inherits(y, "dist")) y <- dist(y) x <- as.matrix(x) y <- as.matrix(y) n <- nrow(x) m <- nrow(y) if (n != m) stop("Sample sizes must agree") if (! (all(is.finite(c(x, y))))) stop("Data contains missing or infinite values") if (index < 0 || index > 2) { warning("index must be in [0,2), using default index=1") index=1.0} stat <- 0 dims <- c(n, ncol(x), ncol(y)) Akl <- function(x) { d <- as.matrix(x)^index m <- rowMeans(d) M <- mean(d) a <- sweep(d, 1, m) b <- sweep(a, 2, m) return(b + M) } A <- Akl(x) B <- Akl(y) dCov <- sqrt(mean(A * B)) dVarX <- sqrt(mean(A * A)) dVarY <- sqrt(mean(B * B)) V <- sqrt(dVarX * dVarY) if (V > 0) dCor <- dCov / V else dCor <- 0 return(list(dCov=dCov, dCor=dCor, dVarX=dVarX, dVarY=dVarY)) } energy/R/util.R0000644000176200001440000000454614300425313013050 0ustar liggesusers## util.R ## ## utilities for the energy package ## Author: Maria Rizzo ## github.com/mariarizzo/energy ## .arg2dist.matrix <- function(x) { ## argument check and conversion for energy functions ## that take optionally data or distance object arguments ## check type of argument, return a distance matrix ## supported argument types: matrix, vector, data.frame, tibble, factor, dist if (anyNA(x)) warning("missing values not supported") if (inherits(x, "dist")) { Dx <- as.matrix(x) return(Dx) } if (is.factor(x)) { z <- as.matrix(as.integer(x)) Dx <- calc_dist(z) if (!is.ordered(x) && nlevels(x) > 2) { # need a 0-1 matrix Dx <- matrix(as.integer(Dx > 0), nrow=nrow(Dx)) } return(Dx) } if (is.vector(x) || is.data.frame(x)) { ## also for tibble Dx <- calc_dist(as.matrix(x)) } if (is.matrix(x)) { if (is.dmatrix(x)) { Dx <- x } else { ## should be data matrix Dx <- calc_dist(x) } } return(Dx) ## if here, arg type is not supported stop(paste("cannot compute distances for", class(x))) return(NA) } is.dmatrix <- function(x, tol = 100 * .Machine$double.eps) { ## check if zero diagonal, symmetric, non-negative square matrix ## i.e., distance matrix or dissimilarity matrix value <- FALSE if (is.matrix(x)) { if (nrow(x) == ncol(x)) { if (max(abs(diag(x)) < tol) && (max(abs(x - t(x)) < tol))) { if (! 
any(x < 0.0)) value <- TRUE } } } return (value) } perm.matrix <- function(n, R) { ## Generate the same matrix as boot.array with ## sim="permutation" and default other arguments ## with same seed we get boot.array(boot.out, indices=T) pfn <- function(x, n) x[sample.int(n)] perms <- matrix(1:n, n, R) perms <- t(apply(perms, 2, pfn, n=n)) } permutation <- function(n) { ## call the internal permute() function using permute_check() J <- 1:n a <- .C("permute_check", J = as.integer(J), n = as.integer(n), PACKAGE = "energy") return (a$J) } sortrank <- function(x) { ## sort and rank data with one call to order() ## faster than calling sort and rank separately ## returns an object identical to: ## list(x=sort(x), ix=order(x), r=rank(x, ties.method = "first")) o <- order(x) n <- length(o) N <- 1:n N[o] <- N return(list(x=x[o], ix=o, r=N)) } energy/R/pdcor.R0000644000176200001440000000047714251254420013205 0ustar liggesusers## pdcor.R ## ## pdcor <- function(x, y, z) { x <- .arg2dist.matrix(x) y <- .arg2dist.matrix(y) z <- .arg2dist.matrix(z) partial_dcor(x, y, z)["pdcor"] } pdcov <- function(x, y, z) { x <- .arg2dist.matrix(x) y <- .arg2dist.matrix(y) z <- .arg2dist.matrix(z) partial_dcov(x, y, z) } energy/data/0000755000176200001440000000000014570120462012455 5ustar liggesusersenergy/data/EVnormal.rda0000644000176200001440000000777714005374454014717 0ustar liggesusersBZh91AY&SYrBÜ)ÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿà žèø½öj«Ït: Q@ÖªjdÓÔÍFš2 ©êm!êzšmOSiha©‘ê#Œš=OS#@Í@42 M4M©êi´F†šb44Ñ Úž“ÓQ¡´žˆÓM§¤z&Ôi“FšhETü©‘‚iê`€Ñ£GêL†Œ„ÈôžQ aOP3hôòhšiå2=@Hñ'µ=& 4 =#jz'¨È=OQ 24ƈbŒi1 €LÂ@ €ÀM LLiE50@4ÐÈ Èˆdd¡¦&šhšA£M Ð4Ðhh UTüF€CL‚bz&a1bdÓBz4LAL0›Fƒ x§¦jcSešžOOMi¦€i2fš&›CC@ÔÔ®Ç<%UUUUUUUUUUUUUUUUUUUUï !%')+-/1356789:;<=>?þ ¡!¢"£#¤$¥%¦&§'¨(©)ª*Ú¡!a­pöÈ‹lMºßpx}F¡ ´ (ˆˆËüsó"æ#ƒÛd¿¨DD´[òÍÕ«VCDAgÄc¨`&O‚¡_x!ÁAˆÃ=0$”.î”Ô¿C;4Šî¢ð†¯!½……§d²Qò »  óùLŠAYbêhÅôß5ÄuµëÒø=B¸m$ÍàÄܹۗ[š¦'bcüú4=«óÅ(Ì»Ql­OF«rÚpMÕé+Å Õ˜ÀÔg pCõÖ~Öå†ÁÞ ô2FíUSÝÌ™KnÈó3iw‘ßõY”(é¼CÆS—í ­ž7Ìe ’—À\&_Lø±7úA„ÍŒlØuèm½CBî&/^TñFÕ+q’û1y£H(8vå7Ä^ÌCøÂç7 iF™#öj WDõª'`ÃÃ)|tEP#˜äÌÐC·0Ø:I ã\)>F5å܇Q´–7÷Þ3p“Y…þ‹rCÁåuxDî,Àè5+õ,M.é iØ·q„Œžz¢˜´#¿äónÂá‡Ô³ŸøƒñÍy©û”<ê ÜÛmû–±ã^*-YU_ßÞP_MËoBVåóÍÃ3œ CbC˃“7.d|ä.ì› Në»U4æ‰8Žp…CÙs>Õ¬¢Ñqç"¡8þïä5ÞˆSј¸VãôõX“-,N¡a.lèK)ZZ†p…üR|RÌ‹G¸Æ­Âyò`fsŒ¹gñ–”í¬Ò²,Õ닎äN&tÔŒÇSÐÅ3BXê¸Xº*ûÉþùYÍûØC™'[.)“ÖêQ`G2Òr…A›\¹]{²<ÀGJ!qͯEá¬Ì Ee"ë¡Hm‹¬YoÌÖ ƒYT"%?»®cè;g¨W2/6߆ŒH]¤U3Nª‘äpŸDU«”jã‡L¹*VmÉu3¥hÜo{ì[½*TñBž˜ÝV[OXΣÔ&»ED¹5äy‹é¿w…/â&ÉtšîTË%kÜ¥mSXÇ*z¯'ݱѭÍð‘Æ—“mI‚bÝ™+âð)5»ü=ï6¤Á2ËÕgî-Îé×{T ÆjM«•9Jœ‰§è§Ÿ4Du%ÁpïálhE‘ë(˜án«[}ŸŽ£Ã@!˜Ú*ÐçH0ôŸ¡A›¹%Ì—EPü¯åQNOžµÞ9Qô(NPBOÉk„Ô›…I…%&œ£i «m"¥qÈ(S§-2—æÍÁÔ.)Ö–Zvñ)³Þ–¿} ë¨Ü÷·»Z7÷Þ¦a§UO¶« j½žRC¾YÂ]£^Û7W²’Ë$¨e6éÁP«{Ñ[—1d^Æå:á݉4ÂÉ™${‡§ :/ñÕ‘”2W€ƒLÔë’+zËm›ýUƒUc"þœÊߌ„»¼ÒxÉInj"k^¯g¡,ýÖ}˜±%³p1ó¥]ê²%Õ‘Xmçès¥äš2iB!“˜T"ùÿÑ^! |ÿ¾Nño|·Ñ8›’D&ëO53ûJ:„È ç.7fß©:,J,;V3¹ÙZbjºM]úfDØÝ«`“Ê>rv%Ïàye¯è‘7Y±ë=Lß!ÚùýJ(IÄ_Ü '¨)ìlü GNKô"þgªJaŽ]Û:[ä'HÓÁÌDY°AVfb9„a2°tÊÔdÛ‘¤²vr¦á¼Ñiáºñ22ŽŠ9¡Ø*—3H!Ø’I$’I$’sJ¤’I$¡(J„¡(J„¡(J„¡(Jƒéÿm#¢•2…òÁ|R¤;ËÄÔ+8üÌùú“BlPÁY^ü“› M¨W<"#ô) ¨ÇÊi(O©ÉýÇÔÃ%ô~ZØ ݪȸLS0e·¾â'àŸÞjˆïÚ=&ð5 ’„ìR²™\$¸”a‡ M /ˆ}ËŠY±9ÂÓj)PvHóà‡løafŽÖ­Ohœ¨Ðà‰^ËÚ(u‰/K˜¢îkO€”ÞV 'BÁ)órGÜ`'g†O‚Íkf#X£'luýMO0A˜«à˜Ô£Ó3iÛÃÔ®<:ä …L èÜ¡f¨Ù‰1#æ¤J/b"³)F’ùÑ.Rƒêr-ýS·j+wßJH¡¼CTkR”F…sÔ-46c°Õ¸8s E»Èo ² {WF µÐ™hD:K§XiÅm[aæ i$C8±Oq fÄÆé» dƒP¯¥×‘&ðwãƒ0ÃM{ˆb@K«ÞH4©3¸Ó V7áÈ/yÐ+ù*óBn¡—ñ@ZœàGÈXËà? 
ŽbÓÆÄSFîD®dú)¸!x²Ùh£Ü| šÄg±îmÑ Í–n]ÃˆŠ ;À“ã1_UhVeÞ„£ç!4 Á sd*T¨Y å•yu–°è,:5°þ,BeÞ7HaŸD½%ç’Ùï&Çý0îd#ÊÆC‚¨z‚ÝPÑœòw¨5¯2…ä…ÛÃ…aßµø‹R,"FÍð¦ŽÄh(VÕc€ÏÇk/ÏžqVÑqÓJöå+Æ»ÃY•dæõÚÉÁ°ë’»â Aœ*ɾ{ä„z.éOVÇê#„;-jï&ìoºÒŸÔJ\¡e‡"KÞyðŠNKñ‹Œùý÷gvÚñi±#‡ãÁdPBY£2Ö”8õ>¿Ý3£¥)º(`xÄë£t)‰Ÿ¿ËÿnÜÎZUCa aOx€PRì­ö¢¬ù ,bDwµ «hÓjJòkC‡/ÆqîûÍ™}ØÔ!·>&½æëÕÉ~øÐØk4fŒ‰ªŽÇƒÈX╪¼ ½ûHÄú3?8dDÎt´áºœ ¨-½øž-í„qçÚÅ`iì?çÊ2“z½.àÑÙ©ÐìÛGæ £²‚L­ µ`R ëî€À0>­N Ðb~6sÌnGÑ„$X°}uÁÇ&Äìýv“æ'{4ñùý/sd«.¬0Òïeg4j:å€îe—‘Ì­œú™R)¦Ó&!SæàëÊž%ÒæO‘gÅ¸Ï Ǭq×Htžïß>#ÐÀo<§°áÍå*_AÍiÏ.™î‹0½yEe)úf‹~iI?qþÂäæ!fJm™qr\æíµ³| úR;ñùÈ=÷ÊGå`À.d“u>+Äýnƒ³}\'o^D[F— .‰Ëñk\2"¿Þ.ùÄp¥¬kÓ®(*/Iû(„‘¶7uô”4W/,ÛsI¥™•îæÎqx¹ø¶CÑ“´Ù˜!"DÚÒW®?9`InäÇY½8ÕÁ:ßÄ9SúÁôPbÍ—,¼÷¢ý‚Út‡dhòÕF×p#‰9î Ñ¡%ˆ;Z#ê%ˆÚ"¯ÎÉ\Vðþ|a †e<甕È:ο ¹##xþ>%°°v<%ò¼ÅOg ´q†·“æåéO ¥Ø³ž®£nP-.nðÎCˆ”vÃ& YéÍ£tB•ôQ -E Àx«™²ãQl*Aã@~ǰ}-f†ê™‘´1F;ô#Šeçú}¯Påè@ µA •Di„$qˆÌ8UrtŽ1cÅ´õ‚lÂôéP;‚÷(c ²“«R@„Œ °ˆ5‹Ä.vV…¨}´"\T$©X6è‹óvä"APŽx"Iøq†®tãEÄÖiÆäv e"d.†ðƒYBÉÈ7EÏLø\QÅêšaõ—Bb’uÇ~G\³ÑÐw2†rX¿˜†¾?jqåÅyïwÝ!asô«ÝcŽÈ‚Øm "ÎÔãZíö²ÿ÷Ú›?Ò¹a¾ÕQ¨l¡Û&࡞Ȟƒ$ YÊ do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #include using namespace Rcpp; #ifdef RCPP_USE_GLOBAL_ROSTREAM Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // Btree_sum NumericVector Btree_sum(IntegerVector y, NumericVector z); RcppExport SEXP _energy_Btree_sum(SEXP ySEXP, SEXP zSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< IntegerVector >::type y(ySEXP); Rcpp::traits::input_parameter< NumericVector >::type z(zSEXP); rcpp_result_gen = Rcpp::wrap(Btree_sum(y, z)); return rcpp_result_gen; END_RCPP } // calc_dist NumericMatrix calc_dist(NumericMatrix x); RcppExport SEXP _energy_calc_dist(SEXP xSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type x(xSEXP); rcpp_result_gen = Rcpp::wrap(calc_dist(x)); return rcpp_result_gen; END_RCPP } // U_product double U_product(NumericMatrix U, NumericMatrix V); RcppExport SEXP _energy_U_product(SEXP USEXP, SEXP VSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type U(USEXP); Rcpp::traits::input_parameter< NumericMatrix >::type V(VSEXP); rcpp_result_gen = Rcpp::wrap(U_product(U, V)); return rcpp_result_gen; END_RCPP } // D_center NumericMatrix D_center(NumericMatrix Dx); RcppExport SEXP _energy_D_center(SEXP DxSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); rcpp_result_gen = Rcpp::wrap(D_center(Dx)); return rcpp_result_gen; END_RCPP } // U_center NumericMatrix U_center(NumericMatrix Dx); RcppExport SEXP _energy_U_center(SEXP DxSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); rcpp_result_gen = Rcpp::wrap(U_center(Dx)); return rcpp_result_gen; END_RCPP } // dcovU_stats NumericVector dcovU_stats(NumericMatrix Dx, NumericMatrix Dy); RcppExport SEXP _energy_dcovU_stats(SEXP DxSEXP, SEXP DySEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dy(DySEXP); rcpp_result_gen = Rcpp::wrap(dcovU_stats(Dx, Dy)); return rcpp_result_gen; END_RCPP } // kgroups_start List kgroups_start(NumericMatrix x, int k, IntegerVector clus, 
int iter_max, bool distance); RcppExport SEXP _energy_kgroups_start(SEXP xSEXP, SEXP kSEXP, SEXP clusSEXP, SEXP iter_maxSEXP, SEXP distanceSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type x(xSEXP); Rcpp::traits::input_parameter< int >::type k(kSEXP); Rcpp::traits::input_parameter< IntegerVector >::type clus(clusSEXP); Rcpp::traits::input_parameter< int >::type iter_max(iter_maxSEXP); Rcpp::traits::input_parameter< bool >::type distance(distanceSEXP); rcpp_result_gen = Rcpp::wrap(kgroups_start(x, k, clus, iter_max, distance)); return rcpp_result_gen; END_RCPP } // Istat double Istat(NumericMatrix Dx, NumericMatrix Dy); RcppExport SEXP _energy_Istat(SEXP DxSEXP, SEXP DySEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dy(DySEXP); rcpp_result_gen = Rcpp::wrap(Istat(Dx, Dy)); return rcpp_result_gen; END_RCPP } // Istats NumericVector Istats(NumericMatrix Dx, NumericMatrix Dy, int R); RcppExport SEXP _energy_Istats(SEXP DxSEXP, SEXP DySEXP, SEXP RSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dy(DySEXP); Rcpp::traits::input_parameter< int >::type R(RSEXP); rcpp_result_gen = Rcpp::wrap(Istats(Dx, Dy, R)); return rcpp_result_gen; END_RCPP } // partial_dcor NumericVector partial_dcor(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz); RcppExport SEXP _energy_partial_dcor(SEXP DxSEXP, SEXP DySEXP, SEXP DzSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dy(DySEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dz(DzSEXP); rcpp_result_gen = Rcpp::wrap(partial_dcor(Dx, Dy, Dz)); return rcpp_result_gen; END_RCPP } // partial_dcov double partial_dcov(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz); RcppExport SEXP _energy_partial_dcov(SEXP DxSEXP, SEXP DySEXP, SEXP DzSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dy(DySEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dz(DzSEXP); rcpp_result_gen = Rcpp::wrap(partial_dcov(Dx, Dy, Dz)); return rcpp_result_gen; END_RCPP } // poisMstat NumericVector poisMstat(IntegerVector x); RcppExport SEXP _energy_poisMstat(SEXP xSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< IntegerVector >::type x(xSEXP); rcpp_result_gen = Rcpp::wrap(poisMstat(x)); return rcpp_result_gen; END_RCPP } // projection NumericMatrix projection(NumericMatrix Dx, NumericMatrix Dz); RcppExport SEXP _energy_projection(SEXP DxSEXP, SEXP DzSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dz(DzSEXP); rcpp_result_gen = Rcpp::wrap(projection(Dx, Dz)); return rcpp_result_gen; END_RCPP } energy/src/dcov.c0000644000176200001440000001243214660667241013447 0ustar liggesusers/* dcov.c: distance correlation and covariance statistics and dCov test for multivariate independence 
Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007) "Measuring and testing dependence by correlation of distances" Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794. Author: Maria L. Rizzo energy package github.com/mariarizzo/energy */ #include #include void dCOVtest(double *x, double *y, int *nrow, int *nreps, double *reps, double *DCOV, double *pval); void dCOV(double *x, double *y, int *nrow, double *DCOV); double Akl(double **akl, double **A, int n); /* functions in utilities.c */ extern double **alloc_matrix(int r, int c); extern int **alloc_int_matrix(int r, int c); extern void free_matrix(double **matrix, int r, int c); extern void free_int_matrix(int **matrix, int r, int c); extern void permute(int *J, int n); extern void roworder(double *x, int *byrow, int r, int c); extern void Euclidean_distance(double *x, double **Dx, int n, int d); extern void index_distance(double **Dx, int n, double index); extern void vector2matrix(double *x, double **y, int N, int d, int isroworder); void dCOVtest(double *x, double *y, int *nrow, int *nreps, double *reps, double *DCOV, double *pval) { /* input vectors must expand to distance matrices any exponent must be pre-computed in R computes dCov(x,y), dCor(x,y), dVar(x), dVar(y) V-statistic is n*dCov^2 where n*dCov^2 --> Q DCOV : vector [dCov, dCor, dVar(x), dVar(y), mean(A), mean(B)] */ int i, j, k, r, J, K, M; int n = nrow[0], R = nreps[0]; int* perm; double **Dx, **Dy, **A, **B; double dcov, V; double n2 = (double) n * n; Dx = alloc_matrix(n, n); Dy = alloc_matrix(n, n); vector2matrix(x, Dx, n, n, 1); vector2matrix(y, Dy, n, n, 1); A = alloc_matrix(n, n); B = alloc_matrix(n, n); Akl(Dx, A, n); Akl(Dy, B, n); free_matrix(Dx, n, n); free_matrix(Dy, n, n); /* compute dCov(x,y), dVar(x), dVar(y) */ for (k=0; k<4; k++) DCOV[k] = 0.0; for (k=0; k 0) DCOV[k] = sqrt(DCOV[k]); else DCOV[k] = 0.0; } /* compute dCor(x, y) */ V = DCOV[2]*DCOV[3]; if (V > DBL_EPSILON) DCOV[1] = DCOV[0] / sqrt(V); else DCOV[1] = 0.0; if (R > 0) { /* compute the replicates */ if (DCOV[1] > 0.0) { perm = R_Calloc(n, int); M = 0; for (i=0; i= DCOV[0]) M++; } *pval = (double) (M+1) / (double) (R+1); PutRNGstate(); R_Free(perm); } else { *pval = 1.0; } } free_matrix(A, n, n); free_matrix(B, n, n); return; } void dCOV(double *x, double *y, int *nrow, double *DCOV) { /* input vectors must expand to distance matrices any exponent must be pre-computed in R computes dCov(x,y), dCor(x,y), dVar(x), dVar(y) V-statistic is n*dCov^2 where n*dCov^2 --> Q DCOV : vector [dCov, dCor, dVar(x), dVar(y)] */ int j, k, n = nrow[0]; double **Dx, **Dy, **A, **B; double V, n2 = (double) n * n; Dx = alloc_matrix(n, n); Dy = alloc_matrix(n, n); vector2matrix(x, Dx, n, n, 1); vector2matrix(y, Dy, n, n, 1); A = alloc_matrix(n, n); B = alloc_matrix(n, n); Akl(Dx, A, n); Akl(Dy, B, n); free_matrix(Dx, n, n); free_matrix(Dy, n, n); n2 = ((double) n) * n; /* compute dCov(x,y), dVar(x), dVar(y) */ for (k=0; k<4; k++) DCOV[k] = 0.0; for (k=0; k 0) DCOV[k] = sqrt(DCOV[k]); else DCOV[k] = 0.0; } /* compute dCor(x, y) */ V = DCOV[2]*DCOV[3]; if (V > DBL_EPSILON) DCOV[1] = DCOV[0] / sqrt(V); else DCOV[1] = 0.0; free_matrix(A, n, n); free_matrix(B, n, n); return; } double Akl(double **akl, double **A, int n) { /* -computes the A_{kl} or B_{kl} distances from the distance matrix (a_{kl}) or (b_{kl}) for dCov, dCor, dVar dCov = mean(Akl*Bkl), dVar(X) = mean(Akl^2), etc. */ int j, k; double *akbar; double abar; akbar = R_Calloc(n, double); abar = 0.0; for (k=0; k using namespace Rcpp; /* Author: Maria L. 
Rizzo energy package github.com/mariarizzo/energy */ // [[Rcpp::export]] NumericMatrix calc_dist(NumericMatrix x) { int n = x.nrow(), d = x.ncol(), i, j, k; double dsum, dk; NumericMatrix Dx(n, n); for (i = 0; i < n; i++) { for (j = i; j < n; j++) { if (i == j) { Dx(i, i) = 0.0; } else { dsum = 0.0; for (k = 0; k < d; k++) { dk = x(i,k) - x(j,k); dsum += dk * dk; } Dx(i, j) = sqrt(dsum); Dx(j, i) = Dx(i, j); } } } return Dx; } energy/src/partial-dcor.cpp0000644000176200001440000000534414246144104015425 0ustar liggesusers#include using namespace Rcpp; /* Author: Maria L. Rizzo energy package github.com/mariarizzo/energy */ NumericVector partial_dcor(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz); double partial_dcov(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz); NumericMatrix U_center(NumericMatrix); double U_product(NumericMatrix U, NumericMatrix V); NumericMatrix projection(NumericMatrix Dx, NumericMatrix Dz); // [[Rcpp::export]] NumericVector partial_dcor(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz) { /* partial distance correlation, second formulation Dx, Dy, Dz are symmetric distance or dissimilarity matrices with zero diagonals partial_dcor : vector length 4, partial_dcor[0] is pdcor partial_dcor returns vector [Rxyz, Rxy, Rxz, Ryz] starred versions */ int n = Dx.nrow(); NumericMatrix A(n, n), B(n, n), C(n, n); double Rxy=0.0, Rxz=0.0, Ryz=0.0, Rxyz=0.0, den; double AB, AC, BC, AA, BB, CC, pDCOV; double eps = std::numeric_limits::epsilon(); //machine epsilon A = U_center(Dx); /* U-centering to get A^U etc. */ B = U_center(Dy); C = U_center(Dz); AB = U_product(A, B); AC = U_product(A, C); BC = U_product(B, C); AA = U_product(A, A); BB = U_product(B, B); CC = U_product(C, C); pDCOV = U_product(projection(Dx, Dz), projection(Dy, Dz)); den = sqrt(AA*BB); if (den > eps) Rxy = AB / den; den = sqrt(AA*CC); if (den > eps) Rxz = AC / den; den = sqrt(BB*CC); if (den > eps) Ryz = BC / den; den = sqrt(1 - Rxz*Rxz) * sqrt(1 - Ryz * Ryz); if (den > eps) Rxyz = (Rxy - Rxz * Ryz) / den; else { Rxyz = 0.0; } return NumericVector::create( _["pdcor"] = Rxyz, _["pdcov"] = pDCOV, _["Rxy"] = Rxy, _["Rxz"] = Rxz, _["Ryz"] = Ryz ); } //[[Rcpp::export]] double partial_dcov(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz) { /* pdcov following the definition via projections Dx, Dy, Dz are symmetric distance or dissimilarity matrices with zero diagonals returns pdcov sample coefficient */ int n = Dx.nrow(); int i, j; NumericMatrix A(n, n), B(n, n), C(n, n), Pxz(n, n), Pyz(n, n); double AC, BC, CC, c1, c2; double eps = std::numeric_limits::epsilon(); //machine epsilon A = U_center(Dx); /* U-centering to get A^U etc. */ B = U_center(Dy); C = U_center(Dz); AC = U_product(A, C); BC = U_product(B, C); CC = U_product(C, C); c1 = c2 = 0.0; // if (C,C)==0 then C=0 and both (A,C)=0 and (B,C)=0 if (fabs(CC) > eps) { c1 = AC / CC; c2 = BC / CC; } for (i=0; i using namespace Rcpp; NumericMatrix Dxi(NumericMatrix Dx, IntegerVector ix); // [[Rcpp::export]] double Istat(NumericMatrix Dx, NumericMatrix Dy) { // compute independence coefficient I_n (the square root) // Dx and Dy are the Euclidean distance matrices int n = Dx.nrow(); int i, j, k, m; double n2 = n*n, n3 = n*n2, n4 = n2*n2; double Cx, Cy, zd, zbar, z; IntegerVector ix(n), iy(n); NumericMatrix Dx2(n, n), Dy2(n, n); Cx = 0.0; Cy = 0.0; z = 0.0; for (i=0; i using namespace Rcpp; /* Author: Maria L. 
Rizzo energy package github.com/mariarizzo/energy */ // [[Rcpp::export(.poisMstat)]] NumericVector poisMstat(IntegerVector x) { /* computes the Poisson mean distance statistic */ int i, j, k, n=x.size(); double eps=1.0e-10; double ad, cvm, d, lambda, m, q; double Mcdf1, Mcdf0, Mpdf1, cdf1, cdf0; NumericVector stats(2); lambda = mean(x); q = R::qpois(1.0-eps, lambda, TRUE, FALSE) + 1; m = 0.0; for (j=0; j 1) Mcdf1 = 1.0; cdf1 = R::ppois(i, lambda, TRUE, FALSE); /* MLE of F(i) */ d = Mcdf1 - cdf1; cvm += d * d * (cdf1 - cdf0); ad += d * d * (cdf1 - cdf0) / (cdf1 * (1-cdf1)); cdf0 = cdf1; Mcdf0 = Mcdf1; } cvm *= n; ad *= n; stats(0) = cvm; stats(1) = ad; return stats; } energy/src/dcovU.cpp0000644000176200001440000000200514246144104014113 0ustar liggesusers#include using namespace Rcpp; // Author: Maria L. Rizzo // energy package // github.com/mariarizzo/energy NumericMatrix U_center(NumericMatrix); //[[Rcpp::export]] NumericVector dcovU_stats(NumericMatrix Dx, NumericMatrix Dy) { // x and y must be square distance matrices NumericMatrix A = U_center(Dx); NumericMatrix B = U_center(Dy); double ab = 0.0, aa = 0.0, bb = 0.0; double V, dcorU = 0.0; double eps = std::numeric_limits::epsilon(); //machine epsilon int n = Dx.nrow(); int n2 = n * (n - 3); for (int i=0; i eps) dcorU = ab / sqrt(V); return NumericVector::create( _["dCovU"] = ab, _["bcdcor"] = dcorU, _["dVarXU"] = aa, _["dVarYU"] = bb ); } energy/src/energy_init.c0000644000176200001440000000462514661474702015034 0ustar liggesusers#include #include #include // for NULL #include /* Author: Maria L. Rizzo energy package github.com/mariarizzo/energy */ /* declarations to register native routines in this package */ /* .C calls */ extern void dCOV(void *, void *, void *, void *); extern void dCOVtest(void *, void *, void *, void *, void *, void *, void *); extern void ksampleEtest(void *, void *, void *, void *, void *, void *, void *, void *, void *); extern void permute_check(void *, void *); /* .Call calls */ extern SEXP _energy_D_center(SEXP); extern SEXP _energy_dcovU_stats(SEXP, SEXP); extern SEXP _energy_partial_dcor(SEXP, SEXP, SEXP); extern SEXP _energy_partial_dcov(SEXP, SEXP, SEXP); extern SEXP _energy_poisMstat(SEXP); extern SEXP _energy_projection(SEXP, SEXP); extern SEXP _energy_U_center(SEXP); extern SEXP _energy_U_product(SEXP, SEXP); extern SEXP _energy_Btree_sum(SEXP, SEXP); extern SEXP _energy_kgroups_start(SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP _energy_calc_dist(SEXP); extern SEXP _energy_Istat(SEXP, SEXP); extern SEXP _energy_Istats(SEXP, SEXP, SEXP); static const R_CMethodDef CEntries[] = { {"dCOV", (DL_FUNC) &dCOV, 4}, {"dCOVtest", (DL_FUNC) &dCOVtest, 7}, {"ksampleEtest", (DL_FUNC) &ksampleEtest, 9}, {"permute_check",(DL_FUNC) &permute_check,2}, {NULL, NULL, 0} }; static const R_CallMethodDef CallEntries[] = { {"_energy_D_center", (DL_FUNC) &_energy_D_center, 1}, {"_energy_dcovU_stats", (DL_FUNC) &_energy_dcovU_stats, 2}, {"_energy_Istat", (DL_FUNC) &_energy_Istat, 2}, {"_energy_Istats", (DL_FUNC) &_energy_Istats, 3}, {"_energy_partial_dcor", (DL_FUNC) &_energy_partial_dcor, 3}, {"_energy_partial_dcov", (DL_FUNC) &_energy_partial_dcov, 3}, {"_energy_poisMstat", (DL_FUNC) &_energy_poisMstat, 1}, {"_energy_projection", (DL_FUNC) &_energy_projection, 2}, {"_energy_U_center", (DL_FUNC) &_energy_U_center, 1}, {"_energy_U_product", (DL_FUNC) &_energy_U_product, 2}, {"_energy_Btree_sum", (DL_FUNC) &_energy_Btree_sum, 2}, {"_energy_kgroups_start", (DL_FUNC) &_energy_kgroups_start, 5}, {"_energy_calc_dist", (DL_FUNC) 
&_energy_calc_dist, 1}, {NULL, NULL, 0} }; void R_init_energy(DllInfo *dll) { R_registerRoutines(dll, CEntries, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); } energy/src/B-tree.cpp0000644000176200001440000000633314246144110014156 0ustar liggesusers#include using namespace Rcpp; // Author: Maria L. Rizzo // energy package // github.com/mariarizzo/energy // compute partial sum using binary search algorithm like AVL // pre-compute powers of two to save repeated calculations IntegerVector containerNodes (int y, IntegerVector pwr2, IntegerVector psum); NumericVector gamma1_direct(IntegerVector y, NumericVector z); IntegerVector p2sum(IntegerVector pwr2); IntegerVector powers2 (int L); NumericVector rowsumsDist(NumericVector x, NumericVector sorted, IntegerVector ranks); IntegerVector subNodes (int y, IntegerVector pwr2, IntegerVector psum); // [[Rcpp::export]] NumericVector Btree_sum (IntegerVector y, NumericVector z) { // // y is a permutation of the integers 1:n // z is a numeric vector of length n // compute gamma1(i) = sum(j 0) gamma1(i) += sums(node); } } return gamma1; } IntegerVector containerNodes (int y, IntegerVector pwr2, IntegerVector psum) { /* * get the indices of all nodes of binary tree whose closed * intervals contain integer y */ int i, L = pwr2.length(); IntegerVector nodes(L); nodes(0) = y; for (i = 0; i < L-1; i++) { nodes(i+1) = ceil((double) y / pwr2(i)) + psum(i); } return nodes; } IntegerVector subNodes (int y, IntegerVector pwr2, IntegerVector psum) { /* * get indices of nodes whose intervals disjoint union is 1:y */ int L = psum.length(); int idx, k, level, p2; IntegerVector nodes(L); std::fill(nodes.begin(), nodes.end(), -1L); k = y; for (level = L - 1; level > 0; level --) { p2 = pwr2(level - 1); if (k >= p2) { // at index of left node plus an offset idx = psum(level - 1) + (y / p2); nodes(L - level - 1) = idx; k -= p2; } } if (k > 0) nodes(L - 1) = y; return nodes; } IntegerVector powers2 (int L) { // (2, 4, 8, ..., 2^L, 2^(L+1)) int k; IntegerVector pwr2(L); pwr2(0) = 2; for (k = 1; k < L; k++) pwr2(k) = pwr2(k-1) * 2; return pwr2; } IntegerVector p2sum(IntegerVector pwr2) { // computes the cumsum of 2^L, 2^(L-1), ..., 2^2, 2 int i, L = pwr2.length(); IntegerVector psum(L); std::fill(psum.begin(), psum.end(), pwr2(L-1)); for (i = 1; i < L; i++) psum(i) = psum(i-1) + pwr2(L-i-1); return psum; } NumericVector gamma1_direct(IntegerVector y, NumericVector z) { // utility: direct computation of the sum gamm1 // for the purpose of testing and benchmarks int n = y.length(); int i, j; NumericVector gamma1(n); for (i = 1; i < n; i++) { for (j = 0; j < i; j++) { if (y(j) < y(i)) { gamma1(i) += z(j); } } } return gamma1; } energy/src/utilities.c0000644000176200001440000001376314660667235014542 0ustar liggesusers/* utilities.c: some utilities for the energy package Author: Maria L. Rizzo github.com/mariarizzo/energy alloc_matrix, alloc_int_matrix, free_matrix, free_int_matrix: use R_Calloc, R_Free instead of Calloc, Free for memory management permute permutes the first n elements of an integer vector row_order converts arg from column order to row order vector2matrix copies double* arg into double** arg distance computes Euclidean distance matrix from double** Euclidean_distance computes Euclidean distance matrix from double* index_distance computes Euclidean distance matrix D then D^index sumdist sums the distance matrix without creating the matrix Notes: 1. index_distance (declaration and body of the function) revised in energy 1.3-0, 2/2011. 
*/ #include #include double **alloc_matrix(int r, int c); int **alloc_int_matrix(int r, int c); void free_matrix(double **matrix, int r, int c); void free_int_matrix(int **matrix, int r, int c); void permute(int *J, int n); void permute_check(int *J, int *N); void roworder(double *x, int *byrow, int r, int c); void vector2matrix(double *x, double **y, int N, int d, int isroworder); void distance(double **bxy, double **D, int N, int d); void Euclidean_distance(double *x, double **Dx, int n, int d); void index_distance(double **Dx, int n, double index); void sumdist(double *x, int *byrow, int *nrow, int *ncol, double *lowersum); double **alloc_matrix(int r, int c) { /* allocate a matrix with r rows and c columns */ int i; double **matrix; matrix = R_Calloc(r, double *); for (i = 0; i < r; i++) matrix[i] = R_Calloc(c, double); return matrix; } int **alloc_int_matrix(int r, int c) { /* allocate an integer matrix with r rows and c columns */ int i; int **matrix; matrix = R_Calloc(r, int *); for (i = 0; i < r; i++) matrix[i] = R_Calloc(c, int); return matrix; } void free_matrix(double **matrix, int r, int c) { /* free a matrix with r rows and c columns */ int i; for (i = 0; i < r; i++) R_Free(matrix[i]); R_Free(matrix); } void free_int_matrix(int **matrix, int r, int c) { /* free an integer matrix with r rows and c columns */ int i; for (i = 0; i < r; i++) R_Free(matrix[i]); R_Free(matrix); } void permute(int *J, int n) { /* permute the first n integers of J if n is length(J), returns a permutation vector equal to rev(Rcpp::sample(n, n, false)) */ int i, j, j0, m=n; for (i=0; i DBL_EPSILON) { for (i=0; i using namespace Rcpp; NumericMatrix D_center(NumericMatrix Dx); NumericMatrix U_center(NumericMatrix Dx); // [[Rcpp::export]] NumericMatrix D_center(NumericMatrix Dx) { /* computes the double centered distance matrix for distance matrix Dx for dCov, dCor, etc. a_{ij} - a_{i.}/n - a_{.j}/n + a_{..}/n^2, all i, j */ int j, k; int n = Dx.nrow(); NumericVector akbar(n); NumericMatrix A(n, n); double abar = 0.0; for (k=0; k using namespace Rcpp; /* Author: Maria L. 
Rizzo energy package github.com/mariarizzo/energy */ int kgroups_update(NumericMatrix x, int k, IntegerVector clus, IntegerVector sizes, NumericVector within, bool distance); List kgroups_start(NumericMatrix x, int k, IntegerVector clus, int iter_max, bool distance); int kgroups_update(NumericMatrix x, int k, IntegerVector clus, IntegerVector sizes, NumericVector w, bool distance) { /* * k-groups one pass through sample moving one point at a time * x: data matrix or distance * k: number of clusters * clus: clustering vector clus(i)==j ==> x_i is in cluster j * sizes: cluster sizes * within: vector of within cluster dispersions * distance: true if x is distance matrix * update clus, sizes, and withins * return count = number of points moved */ int n = x.nrow(), d = x.ncol(); int i, j, I, J, ix, nI, nJ; NumericVector rowdst(k), e(k); int best, count = 0; double dsum, dif; for (ix = 0; ix < n; ix++) { I = clus(ix); nI = sizes(I); if (nI > 1) { // calculate the E-distances of this point to each cluster rowdst.fill(0.0); for (i = 0; i < n; i++) { J = clus(i); if (distance == true) { rowdst(J) += x(ix, i); } else { dsum = 0.0; for (j = 0; j < d; j++) { dif = x(ix, j) - x(i, j); dsum += dif * dif; } rowdst(J) += sqrt(dsum); } } for (J = 0; J < k; J++) { nJ = sizes(J); e(J) = (2.0 / (double) nJ) * (rowdst(J) - w(J)); } best = Rcpp::which_min(e); if (best != I) { // move this point and update nI = sizes(I); nJ = sizes(best); w(best) = (((double) nJ) * w(best) + rowdst(best)) / ((double) (nJ + 1)); w(I) = (((double) nI) * w(I) - rowdst(I)) / ((double) (nI - 1)); clus(ix) = best; sizes(I) = nI - 1; sizes(best) = nJ + 1; count ++; // number of moves } } } return count; } // [[Rcpp::export]] List kgroups_start(NumericMatrix x, int k, IntegerVector clus, int iter_max, bool distance) { // k-groups clustering with initial clustering vector clus // up to iter_max iterations of n possible moves each // distance: true if x is distance matrix NumericVector within(k, 0.0); IntegerVector sizes(k, 0); double dif, dsum; int I, J, h, i, j; int n = x.nrow(), d = x.ncol(); for (i = 0; i < n; i++) { I = clus(i); sizes(I)++; for (j = 0; j < i; j++) { J = clus(j); if (I == J) { if (distance == true) { within(I) += x(i, j); } else { dsum = 0.0; for (h = 0; h < d; h++) { dif = x(i, h) - x(j, h); dsum += dif * dif; } within(I) += sqrt(dsum); } } } } for (I = 0; I < k; I++) within(I) /= ((double) sizes(I)); int it = 1, count = 1; count = kgroups_update(x, k, clus, sizes, within, distance); while (it < iter_max && count > 0) { count = kgroups_update(x, k, clus, sizes, within, distance); it++; } double W = Rcpp::sum(within); return List::create( _["within"] = within, _["W"] = W, _["sizes"] = sizes, _["cluster"] = clus, _["iterations"] = it, _["count"] = count); } energy/src/energy.c0000644000176200001440000002017314661742126014003 0ustar liggesusers/* energy.c: energy package Author: Maria L. 
Rizzo github.com/mariarizzo/energy */ #include #include void ksampleEtest(double *x, int *byrow, int *nsamples, int *sizes, int *dim, int *R, double *e0, double *e, double *pval); void E2sample(double *x, int *sizes, int *dim, double *stat); double edist(double **D, int m, int n); double multisampleE(double **D, int nsamples, int *sizes, int *perm); double twosampleE(double **D, int m, int n, int *xrows, int *yrows); double E2(double **x, int *sizes, int *start, int ncol, int *perm); double Eksample(double *x, int *byrow, int r, int d, int K, int *sizes, int *ix); void distance(double **bxy, double **D, int N, int d); /* utilities.c */ extern double **alloc_matrix(int r, int c); extern int **alloc_int_matrix(int r, int c); extern void free_matrix(double **matrix, int r, int c); extern void free_int_matrix(int **matrix, int r, int c); extern void permute(int *J, int n); extern void roworder(double *x, int *byrow, int r, int c); extern void vector2matrix(double *x, double **y, int N, int d, int isroworder); extern void distance(double **bxy, double **D, int N, int d); extern void Euclidean_distance(double *x, double **Dx, int n, int d); extern void index_distance(double *x, double **Dx, int n, int d, double index); // // void E2sample(double *x, int *sizes, int *dim, double *stat) { // /* // compute test statistic *stat for testing H:F=G // does not store distance matrix // x must be in row order: x=as.double(t(x)) where // x is pooled sample in matrix sum(en) by dim // */ // int m=sizes[0], n=sizes[1], d=(*dim); // int i, j, k, p, q; // double dif, dsum, sumxx, sumxy, sumyy, w; // // sumxy = 0.0; // for (i=0; i 0) { data = alloc_matrix(N, d); /* sample matrix */ vector2matrix(x, data, N, d, *byrow); distance(data, D, N, d); free_matrix(data, N, d); } else vector2matrix(x, D, N, N, *byrow); *e0 = multisampleE(D, K, sizes, perm); /* bootstrap */ if (B > 0) { ek = 0; GetRNGstate(); for (b=0; b using namespace Rcpp; /* Author: Maria L. Rizzo energy package github.com/mariarizzo/energy */ // [[Rcpp::export]] double U_product(NumericMatrix U, NumericMatrix V) { // U and V are U-centered dissimilarity matrices of the two samples int n = U.nrow(); int i, j; double sums = 0.0; for (i = 0; i < n; i++) for (j=0; j using namespace Rcpp; /* Author: Maria L. Rizzo energy package github.com/mariarizzo/energy */ NumericMatrix U_center(NumericMatrix); double U_product(NumericMatrix, NumericMatrix); // [[Rcpp::export]] NumericMatrix projection(NumericMatrix Dx, NumericMatrix Dz) { /* returns the projection of A(x) distance matrix Dx onto the orthogonal complement of C(z) distance matrix; both Dx and Dz are n by n distance or dissimilarity matrices the projection is an n by n matrix */ int n = Dx.nrow(); int i, j; NumericMatrix A(n, n), C(n, n), P(n, n); double AC, CC, c1; double eps = std::numeric_limits::epsilon(); //machine epsilon A = U_center(Dx); // U-centering to get A^U etc. C = U_center(Dz); AC = U_product(A, C); // (A,C) = dcov^U CC = U_product(C, C); c1 = 0.0; // if (C,C)==0 then C==0 so c1=(A,C)=0 if (fabs(CC) > eps) c1 = AC / CC; for (i=0; i 3.0.3 it is now equivalent for alpha = 1 with method = "ward.D". Input and return value unchanged except heights from hclust are half. # energy 1.7-4 * User level changes - disco: handle the case when the user argument x is dist with conflicting argument distance=FALSE - dcor.t and dcor.ttest: handle the cases when class of argument x or y conflicts with the distance argument - Split manual page of dcovU into two files. 
- indep.etest and indep.e removed now Defunct (were Deprecated since Version 1.1-0, 2008-04-07; replaced by indep.test). * Internal changes - BCDCOR: handle the cases when class of argument x or y conflicts with the distance argument # energy 1.7-2 * User level changes - Provided new dcor.test function, similar to dcov.test but using the distance correlation as the test statistic. - Number of replicates R for Monte Carlo and permutation tests now matches the argument of the boot::boot function (no default value, user must specify). - If user runs a test with 0 replicates, p-value printed is NA * Internal changes - energy_init.c added for registering routines # energy 1.7-0 * Partial Distance Correlation statistics and tests added - pdcov, pdcor, pdcov.test, pdcor.test - dcovU: unbiased estimator of distance covariance - bcdcor: bias corrected distance correlation - Ucenter, Dcenter, U_center, D_center: double-centering and U-centering utilities - U_product: inner product in U-centered Hilbert space * updated NAMESPACE and DESCRIPTION imports, etc. * revised package Title and Description in DESCRIPTION * package now links to Rcpp * mvnorm c code ported to c++ (mvnorm.cpp); corresponding changes in Emvnorm.R * syntax for bcdcor: "distance" argument removed, now argument can optionally be a dist object * syntax for energy.hclust: first argument must now be a dist object * default number of replicates R in tests: for all tests, R now defaults to 0 or R has no default value. # energy 1.6.2 * inserted GetRNGstate() .. PutRNGState around repl. loop in dcov.c. # energy 1.6.1 * replace Depends with Imports in DESCRIPTION file # energy 1.6.0 * implementation of high-dim distance correlation t-test introduced in JMVA Volume 117, pp. 193-213 (2013). * new functions dcor.t, dcor.ttest in dcorT.R * minor changes to tidy other code in dcov.R * removed unused internal function .dcov.test # energy 1.5.0 * NAMESPACE: insert UseDynLib; remove zzz.R, .First.Lib() # energy 1.4-0 * NAMESPACE added. * (dcov.c, Eindep.c) Unused N was removed. * (dcov.c) In case dcov=0, bypass the unnecessary loop that generates replicates (in dCOVtest and dCovTest). In this case dcor=0 and test is not significant. (dcov=0 if one of the samples is constant.) * (Eqdist.R) in eqdist.e and eqdist.etest, method="disco" is replaced by two options: "discoB" (between sample components) and "discoF" (disco F ratio). * (disco.R) Added disco.between and internal functions that compute the disco between-sample component and corresponding test. * (utilities.c) In permute function replaced rand_unif with runif. * (energy.c) In ksampleEtest the pval computation changed from ek/B to (ek+1)/(B+1) as it should be for a permutation test, and unneeded int* n removed. # energy 1.3-0 * In distance correlation, distance covariance functions (dcov, dcor, DCOR) and dcov.test, arguments x and y can now optionally be distance objects (result of dist function or as.dist). Matrices x and y will always be treated as data. * Functions in dcov.c and utilities.c were modified to support arguments that are distances rather than data. In utilities.c the index_distance function changed. In dcov.c there are many changes. Most importantly for the exported objects, there is now an extra required parameter in the dims argument passed from R. In dCOVtest dims must be a vector c(n, p, q, dst, R) where n is sample size, p and q are dimensions of x and y, dst is logical (TRUE if distances) and R is number of replicates. For dCOV dims must be c(n, p, q, dst). 
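As a minimal illustration of the 1.3-0 interface (a sketch only; the calls follow the dcov and dcov.test manual pages, and the simulated data here are arbitrary), either data matrices or pre-computed distance objects may be supplied:

    x <- matrix(rnorm(30 * 2), nrow = 30)
    y <- matrix(rnorm(30 * 2), nrow = 30)
    dcov(x, y)                            # data: distances computed internally
    dcov(dist(x), dist(y))                # same statistic from dist objects
    dcov.test(dist(x), dist(y), R = 199)  # permutation test on dist objects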
# energy 1.2-0 * disco (distance components) added for one-way layout. * A method argument was added to ksample.e, eqdist.e, and eqdist.etest, method = c("original", "disco"). * A method argument was added to edist, which summarizes cluster distances in a table: method = c("cluster","discoB","discoF")) energy/README.md0000644000176200001440000000116014005374454013026 0ustar liggesusers# energy energy package for R The energy package for R implements several methods in multivariate analysis and multivariate inference based on the energy distance, which characterizes equality of distributions. Distance correlation (multivariate independence), disco (nonparametric extension of ANOVA), and goodness-of-fit tests are examples of some of the methods included. energy is named based on the analogy with potential energy in physics. See the references in the manual for more details. [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/energy)]https://cran.r-project.org/package=energy)energy/build/0000755000176200001440000000000014662213500012641 5ustar liggesusersenergy/build/partial.rdb0000644000176200001440000000007514662213500014770 0ustar liggesusers‹‹àb```b`aad`b1…À€… H02°0piÖ¼ÄÜÔb C"Éðh¿eÍ7energy/man/0000755000176200001440000000000014661663346012335 5ustar liggesusersenergy/man/kgroups.Rd0000644000176200001440000000730014005374454014305 0ustar liggesusers\name{kgroups} \alias{kgroups} \title{ K-Groups Clustering } \description{ Perform k-groups clustering by energy distance. } \usage{ kgroups(x, k, iter.max = 10, nstart = 1, cluster = NULL) } \arguments{ \item{x}{Data frame or data matrix or distance object} \item{k}{number of clusters} \item{iter.max}{maximum number of iterations} \item{nstart}{number of restarts} \item{cluster}{initial clustering vector} } \details{ K-groups is based on the multisample energy distance for comparing distributions. Based on the disco decomposition of total dispersion (a Gini type mean distance) the objective function should either maximize the total between cluster energy distance, or equivalently, minimize the total within cluster energy distance. It is more computationally efficient to minimize within distances, and that makes it possible to use a modified version of the Hartigan-Wong algorithm (1979) to implement K-groups clustering. The within cluster Gini mean distance is \deqn{G(C_j) = \frac{1}{n_j^2} \sum_{i,m=1}^{n_j} |x_{i,j} - x_{m,j}|} and the K-groups within cluster distance is \deqn{W_j = \frac{n_j}{2}G(C_j) = \frac{1}{2 n_j} \sum_{i,m=1}^{n_j} |x_{i,j} - x_{m,j}.} If z is the data matrix for cluster \eqn{C_j}, then \eqn{W_j} could be computed as \code{sum(dist(z)) / nrow(z)}. If cluster is not NULL, the clusters are initialized by this vector (can be a factor or integer vector). Otherwise clusters are initialized with random labels in k approximately equal size clusters. If \code{x} is not a distance object (class(x) == "dist") then \code{x} is converted to a data matrix for analysis. Run up to \code{iter.max} complete passes through the data set until a local min is reached. If \code{nstart > 1}, on second and later starts, clusters are initialized at random, and the best result is returned. 
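As a small illustrative sketch of the identity above (the data and the number of
clusters are arbitrary; \code{cluster} and \code{within} are components of the
returned object described under Value), the within-cluster distances can be
recomputed directly from a fitted clustering:

\preformatted{
x <- as.matrix(iris[, 1:4])
kg <- kgroups(x, k = 3, iter.max = 10)
W <- sapply(1:3, function(j) {
  z <- x[kg$cluster == j, , drop = FALSE]
  sum(dist(z)) / nrow(z)     # W_j computed as described above
})
## W should agree with kg$within for the returned clustering
}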
} \value{ An object of class \code{kgroups} containing the components \item{call}{the function call} \item{cluster}{vector of cluster indices} \item{sizes}{cluster sizes} \item{within}{vector of Gini within cluster distances} \item{W}{sum of within cluster distances} \item{count}{number of moves} \item{iterations}{number of iterations} \item{k}{number of clusters} \code{cluster} is a vector containing the group labels, 1 to k. \code{print.kgroups} prints some of the components of the kgroups object. Expect that count is 0 if the algorithm converged to a local min (that is, 0 moves happened on the last iteration). If iterations equals iter.max and count is positive, then the algorithm did not converge to a local min. } \author{ Maria Rizzo and Songzi Li } \references{ Li, Songzi (2015). "K-groups: A Generalization of K-means by Energy Distance." Ph.D. thesis, Bowling Green State University. Li, S. and Rizzo, M. L. (2017). "K-groups: A Generalization of K-means Clustering". ArXiv e-print 1711.04359. https://arxiv.org/abs/1711.04359 Szekely, G. J., and M. L. Rizzo. "Testing for equal distributions in high dimension." InterStat 5, no. 16.10 (2004). Rizzo, M. L., and G. J. Szekely. "Disco analysis: A nonparametric extension of analysis of variance." The Annals of Applied Statistics (2010): 1034-1055. Hartigan, J. A. and Wong, M. A. (1979). "Algorithm AS 136: A K-means clustering algorithm." Applied Statistics, 28, 100-108. doi: 10.2307/2346830. } \examples{ x <- as.matrix(iris[ ,1:4]) set.seed(123) kg <- kgroups(x, k = 3, iter.max = 5, nstart = 2) kg fitted(kg) \donttest{ d <- dist(x) set.seed(123) kg <- kgroups(d, k = 3, iter.max = 5, nstart = 2) kg kg$cluster fitted(kg) fitted(kg, method = "groups") } } \keyword{ cluster } \keyword{ multivariate } energy/man/dcov.Rd0000644000176200001440000001146714250476236013562 0ustar liggesusers\name{distance correlation} \alias{dcor} \alias{dcov} \title{ Distance Correlation and Covariance Statistics} \description{ Computes distance covariance and distance correlation statistics, which are multivariate measures of dependence. } \usage{ dcov(x, y, index = 1.0) dcor(x, y, index = 1.0) } \arguments{ \item{x}{ data or distances of first sample} \item{y}{ data or distances of second sample} \item{index}{ exponent on Euclidean distance, in (0,2]} } \details{ \code{dcov} and \code{dcor} compute distance covariance and distance correlation statistics. The sample sizes (number of rows) of the two samples must agree, and samples must not contain missing values. The \code{index} is an optional exponent on Euclidean distance. Valid exponents for energy are in (0, 2) excluding 2. Argument types supported are numeric data matrix, data.frame, or tibble, with observations in rows; numeric vector; ordered or unordered factors. In case of unordered factors a 0-1 distance matrix is computed. Optionally pre-computed distances can be input as class "dist" objects or as distance matrices. For data types of arguments, distance matrices are computed internally. Distance correlation is a new measure of dependence between random vectors introduced by Szekely, Rizzo, and Bakirov (2007). For all distributions with finite first moments, distance correlation \eqn{\mathcal R}{R} generalizes the idea of correlation in two fundamental ways: (1) \eqn{\mathcal R(X,Y)}{R(X,Y)} is defined for \eqn{X} and \eqn{Y} in arbitrary dimension. (2) \eqn{\mathcal R(X,Y)=0}{R(X,Y)=0} characterizes independence of \eqn{X} and \eqn{Y}. 
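For example (a brief illustrative sketch; the sample size 50 and the dimensions
are arbitrary), \code{dcor} applies directly to samples of different dimensions:

\preformatted{
x <- matrix(rnorm(50 * 5), nrow = 50)   # X sample in dimension p = 5
y <- matrix(rnorm(50 * 2), nrow = 50)   # Y sample in dimension q = 2
dcor(x, y)   # defined for arbitrary p, q; population dCor is 0 under independence
}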
Distance correlation satisfies \eqn{0 \le \mathcal R \le 1}{0 \le R \le 1}, and \eqn{\mathcal R = 0}{R = 0} only if \eqn{X} and \eqn{Y} are independent. Distance covariance \eqn{\mathcal V}{V} provides a new approach to the problem of testing the joint independence of random vectors. The formal definitions of the population coefficients \eqn{\mathcal V}{V} and \eqn{\mathcal R}{R} are given in (SRB 2007). The definitions of the empirical coefficients are as follows. The empirical distance covariance \eqn{\mathcal{V}_n(\mathbf{X,Y})}{V_n(X,Y)} with index 1 is the nonnegative number defined by \deqn{ \mathcal{V}^2_n (\mathbf{X,Y}) = \frac{1}{n^2} \sum_{k,\,l=1}^n A_{kl}B_{kl} }{ V^2_n (X,Y) = (1/n^2) sum_{k,l=1:n} A_{kl}B_{kl} } where \eqn{A_{kl}} and \eqn{B_{kl}} are \deqn{ A_{kl} = a_{kl}-\bar a_{k.}- \bar a_{.l} + \bar a_{..} } \deqn{ B_{kl} = b_{kl}-\bar b_{k.}- \bar b_{.l} + \bar b_{..}. } Here \deqn{ a_{kl} = \|X_k - X_l\|_p, \quad b_{kl} = \|Y_k - Y_l\|_q, \quad k,l=1,\dots,n, }{ a_{kl} = ||X_k - X_l||_p, b_{kl} = ||Y_k - Y_l||_q, k,l=1,\dots,n, } and the subscript \code{.} denotes that the mean is computed for the index that it replaces. Similarly, \eqn{\mathcal{V}_n(\mathbf{X})}{V_n(X)} is the nonnegative number defined by \deqn{ \mathcal{V}^2_n (\mathbf{X}) = \mathcal{V}^2_n (\mathbf{X,X}) = \frac{1}{n^2} \sum_{k,\,l=1}^n A_{kl}^2. }{ V^2_n (X) = V^2_n (X,X) = (1/n^2) sum_{k,l=1:n} A_{kl}^2. } The empirical distance correlation \eqn{\mathcal{R}_n(\mathbf{X,Y})}{R(\mathbf{X,Y})} is the square root of \deqn{ \mathcal{R}^2_n(\mathbf{X,Y})= \frac {\mathcal{V}^2_n(\mathbf{X,Y})} {\sqrt{ \mathcal{V}^2_n (\mathbf{X}) \mathcal{V}^2_n(\mathbf{Y})}}. }{ R^2_n(X,Y)= V^2_n(X,Y) / sqrt(V^2_n (X) V^2_n(Y)). } See \code{\link{dcov.test}} for a test of multivariate independence based on the distance covariance statistic. } \value{ \code{dcov} returns the sample distance covariance and \code{dcor} returns the sample distance correlation. } \note{ Note that it is inefficient to compute dCor by: square root of \code{dcov(x,y)/sqrt(dcov(x,x)*dcov(y,y))} because the individual calls to \code{dcov} involve unnecessary repetition of calculations. } \seealso{ \code{\link{dcov2d}} \code{\link{dcor2d}} \code{\link{bcdcor}} \code{\link{dcovU}} \code{\link{pdcor}} \code{\link{dcov.test}} \code{\link{dcor.test}} \code{\link{pdcor.test}} } \references{ Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007), Measuring and Testing Dependence by Correlation of Distances, \emph{Annals of Statistics}, Vol. 35 No. 6, pp. 2769-2794. \cr \doi{10.1214/009053607000000505} Szekely, G.J. and Rizzo, M.L. (2009), Brownian Distance Covariance, \emph{Annals of Applied Statistics}, Vol. 3, No. 4, 1236-1265. \cr \doi{10.1214/09-AOAS312} Szekely, G.J. and Rizzo, M.L. (2009), Rejoinder: Brownian Distance Covariance, \emph{Annals of Applied Statistics}, Vol. 3, No. 4, 1303-1308. } \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and Gabor J. Szekely } \examples{ x <- iris[1:50, 1:4] y <- iris[51:100, 1:4] dcov(x, y) dcov(dist(x), dist(y)) #same thing } \keyword{ multivariate } \concept{ independence } \concept{ distance correlation } \concept{ distance covariance } \concept{ energy statistics } energy/man/normalGOF.Rd0000644000176200001440000000544614227267045014453 0ustar liggesusers\name{normal.test} \alias{normal.test} \alias{normal.e} \title{Energy Test of Univariate Normality} \description{ Performs the energy test of univariate normality for the composite hypothesis Case 4, estimated parameters. 
} \usage{ normal.test(x, method=c("mc","limit"), R) normal.e(x) } \arguments{ \item{x}{ univariate data vector} \item{method}{ method for p-value} \item{R}{ number of replications if Monte Carlo method} } \details{ If \code{method="mc"} this test function applies the parametric bootstrap method implemented in \code{\link{mvnorm.test}}. If \code{method="limit"}, the p-value of the test is computed from the asymptotic distribution of the test statistic under the null hypothesis. The asymptotic distribution is a quadratic form of centered Gaussian random variables, which has the form \deqn{\sum_{k=1}^\infty \lambda_k Z_k^2,} where \eqn{\lambda_k} are positive constants (eigenvalues) and \eqn{Z_k} are iid standard normal variables. Eigenvalues are pre-computed and stored internally. A p-value is computed using Imhof's method as implemented in the \pkg{CompQuadForm} package. Note that the "limit" method is intended for moderately large samples because it applies the asymptotic distribution. The energy test of normality was proposed and implemented by Szekely and Rizzo (2005). See \code{\link{mvnorm.test}} for more details. } \value{ \code{normal.e} returns the energy goodness-of-fit statistic for a univariate sample. \code{normal.test} returns a list with class \code{htest} containing \item{statistic}{observed value of the test statistic} \item{p.value}{p-value of the test} \item{estimate}{sample estimates: mean, sd} \item{data.name}{description of data} } \seealso{ \code{\link{mvnorm.test}} and \code{\link{mvnorm.e}} for the energy test of multivariate normality and the test statistic for multivariate samples. } \references{ Szekely, G. J. and Rizzo, M. L. (2005) A New Test for Multivariate Normality, \emph{Journal of Multivariate Analysis}, 93/1, 58-80, \doi{10.1016/j.jmva.2003.12.002}. Mori, T. F., Szekely, G. J. and Rizzo, M. L. "On energy tests of normality." Journal of Statistical Planning and Inference 213 (2021): 1-15. Rizzo, M. L. (2002). A New Rotation Invariant Goodness-of-Fit Test, Ph.D. dissertation, Bowling Green State University. J. P. Imhof (1961). Computing the Distribution of Quadratic Forms in Normal Variables, \emph{Biometrika}, Volume 48, Issue 3/4, 419-426. } \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and Gabor J. Szekely } \examples{ x <- iris[1:50, 1] normal.e(x) normal.test(x, R=199) normal.test(x, method="limit") } \keyword{ htest } \concept{ goodness-of-fit} \concept{ normal distribution} \concept{ energy statistics } energy/man/sortrank.Rd0000644000176200001440000000167214227267124014465 0ustar liggesusers\name{sortrank} \alias{sortrank} \title{ Sort, order and rank a vector } \description{ A utility that returns a list with the components equivalent to sort(x), order(x), rank(x, ties.method = "first"). } \usage{ sortrank(x) } \arguments{ \item{x}{ vector compatible with sort(x)} } \details{ This utility exists to save a little time on large vectors when two or all three of the sort(), order(), rank() results are required. In case of ties, the ranks component matches \code{rank(x, ties.method = "first")}. } \value{ A list with components \item{x}{the sorted input vector x} \item{ix}{the permutation = order(x) which rearranges x into ascending order} \item{r}{the ranks of x} } \note{ This function was benchmarked faster than the combined calls to \code{sort} and \code{rank}. } \examples{ sortrank(rnorm(5)) } \references{ See \code{\link{sort}}. } \author{ Maria L. 
Rizzo \email{mrizzo@bgsu.edu} } energy/man/centering.Rd0000644000176200001440000000441414405132115014562 0ustar liggesusers\name{centering distance matrices} \alias{Ucenter} \alias{Dcenter} \alias{U_center} \alias{D_center} \title{ Double centering and U-centering } \description{ Stand-alone double centering and U-centering functions that are applied in unbiased distance covariance, bias corrected distance correlation, and partial distance correlation. } \usage{ Dcenter(x) Ucenter(x) U_center(Dx) D_center(Dx) } \arguments{ \item{x}{ dist object or data matrix} \item{Dx}{ distance or dissimilarity matrix} } \details{ In \code{Dcenter} and \code{Ucenter}, \code{x} must be a \code{dist} object or a data matrix. Both functions return a doubly centered distance matrix. Note that \code{pdcor}, etc. functions include the centering operations (in C), so that these stand alone versions of centering functions are not needed except in case one wants to compute just a double-centered or U-centered matrix. \code{U_center} is the Rcpp export of the cpp function. \code{D_center} is the Rcpp export of the cpp function. } \value{ All functions return a square symmetric matrix. \code{Dcenter} returns a matrix \deqn{A_{ij}=a_{ij} - \bar a_{i.} - \bar a_{.j} + \bar a_{..}} as in classical multidimensional scaling. \code{Ucenter} returns a matrix \deqn{\tilde A_{ij}=a_{ij} - \frac{a_{i.}}{n-2} - \frac{a_{.j}}{n-2} + \frac{a_{..}}{(n-1)(n-2)},\quad i \neq j,} with zero diagonal, and this is the double centering applied in \code{pdcov} and \code{pdcor} as well as the unbiased dCov and bias corrected dCor statistics. } \note{ The c++ versions \code{D_center} and \code{U_center} should typically be faster. R versions are retained for historical reasons. } \references{ Szekely, G.J. and Rizzo, M.L. (2014), Partial Distance Correlation with Methods for Dissimilarities, \emph{Annals of Statistics}, Vol. 42, No. 6, pp. 2382-2412. } \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and Gabor J. Szekely } \examples{ x <- iris[1:10, 1:4] dx <- dist(x) Dx <- as.matrix(dx) M <- U_center(Dx) all.equal(M, U_center(M)) #idempotence all.equal(M, D_center(M)) #invariance } \keyword{ multivariate } \concept{ multivariate } \concept{ distance correlation } \concept{ distance covariance } \concept{ energy statistics } energy/man/dcovu.Rd0000644000176200001440000000461414251434573013742 0ustar liggesusers\name{Unbiased distance covariance} \alias{bcdcor} \alias{dcovU} \title{Unbiased dcov and bias-corrected dcor statistics} \description{ These functions compute unbiased estimators of squared distance covariance and a bias-corrected estimator of (squared) distance correlation. } \usage{ bcdcor(x, y) dcovU(x, y) } \arguments{ \item{x}{ data or dist object of first sample} \item{y}{ data or dist object of second sample} } \details{ The unbiased (squared) dcov is inner product definition of dCov, in the Hilbert space of U-centered distance matrices. The sample sizes (number of rows) of the two samples must agree, and samples must not contain missing values. Argument types supported are numeric data matrix, data.frame, or tibble, with observations in rows; numeric vector; ordered or unordered factors. In case of unordered factors a 0-1 distance matrix is computed. } \value{ \code{dcovU} returns the unbiased estimator of squared dcov. \code{bcdcor} returns a bias-corrected estimator of squared dcor. } \note{ Unbiased distance covariance (SR2014) corresponds to the biased (original) \eqn{\mathrm{dCov^2}}{dCov^2}. 
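For instance (an illustrative sketch reusing the data of the Examples section):

\preformatted{
x <- iris[1:50, 1:4]
y <- iris[51:100, 1:4]
dcovU(x, y)    # unbiased estimator of the squared population coefficient
dcov(x, y)^2   # original (biased) V-statistic estimate of the same quantity
}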
Since \code{dcovU} is an unbiased statistic, it is signed and we do not take the square root. For the original distance covariance test of independence (SRB2007, SR2009), the distance covariance test statistic is the V-statistic \eqn{\mathrm{n\, dCov^2} = n \mathcal{V}_n^2}{n V_n^2} (not dCov). Similarly, \code{bcdcor} is bias-corrected, so we do not take the square root as with dCor. } \references{ Szekely, G.J. and Rizzo, M.L. (2014), Partial Distance Correlation with Methods for Dissimilarities. \emph{Annals of Statistics}, Vol. 42 No. 6, 2382-2412. Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007), Measuring and Testing Dependence by Correlation of Distances, \emph{Annals of Statistics}, Vol. 35 No. 6, pp. 2769-2794. \cr \doi{10.1214/009053607000000505} Szekely, G.J. and Rizzo, M.L. (2009), Brownian Distance Covariance, \emph{Annals of Applied Statistics}, Vol. 3, No. 4, 1236-1265. \cr \doi{10.1214/09-AOAS312} } \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and Gabor J. Szekely } \examples{ x <- iris[1:50, 1:4] y <- iris[51:100, 1:4] dcovU(x, y) bcdcor(x, y) } \keyword{ multivariate } \keyword{ nonparametric } \concept{ independence } \concept{ multivariate } \concept{ distance correlation } \concept{ distance covariance } \concept{ energy statistics } energy/man/mvI.test.Rd0000644000176200001440000001171614661661014014331 0ustar liggesusers\name{mvI.test} \alias{mvI.test} \alias{mvI} \title{ Independence Coefficient and Test} \description{ Computes a type of multivariate nonparametric E-statistic and test of independence based on independence coefficient \eqn{\mathcal I_n}{I_n}. This coefficient pre-dates and is different from distance covariance or distance correlation.} \usage{ mvI.test(x, y, R) mvI(x, y) } \arguments{ \item{x}{ matrix: first sample, observations in rows} \item{y}{ matrix: second sample, observations in rows} \item{R}{ number of replicates} } \details{ \code{mvI} computes the coefficient \eqn{\mathcal I_n}{I_n} and \code{mvI.test} performs a nonparametric test of independence. The test decision is obtained via permutation bootstrap, with \code{R} replicates. The sample sizes (number of rows) of the two samples must agree, and samples must not contain missing values. Historically this is the first energy test of independence. The distance covariance test \code{\link{dcov.test}}, distance correlation \code{\link{dcor}}, and related methods are more recent (2007, 2009). The distance covariance test \code{\link{dcov.test}} and distance correlation test \code{\link{dcor.test}} are much faster and have different properties than \code{mvI.test}. All are based on a population independence coefficient that characterizes independence and of these tests are statistically consistent. However, dCor is scale invariant while \eqn{I_n}{I_n} is not. In applications \code{\link{dcor.test}} or \code{\link{dcov.test}} are the recommended tests. Computing formula from Bakirov, Rizzo, and Szekely (2006), equation (2): Suppose the two samples are \eqn{X_1,\dots,X_n \in R^p} and \eqn{Y_1,\dots,Y_n \in R^q}. 
Define \eqn{Z_{kl} = (X_k, Y_l) \in R^{p+q}.} The independence coefficient \eqn{\mathcal I_n}{I_n} is defined \deqn{ \mathcal I_n = \sqrt{\frac{2\bar z - z_d - z}{x + y - z}}, } where \deqn{z_d= \frac{1}{n^2} \sum_{k,l=1}^n |Z_{kk}-Z_{ll}|_{p+q},} \deqn{z= \frac{1}{n^4} \sum_{k,l=1}^n \sum_{i,j=1}^n |Z_{kl}-Z_{ij}|_{p+q},} \deqn{\bar z= \frac{1}{n^3} \sum_{k=1}^n \sum_{i,j=1}^n |Z_{kk}-Z_{ij}|_{p+q},} \deqn{x= \frac{1}{n^2} \sum_{k,l=1}^n |X_{k}-X_{l}|_p,} \deqn{y= \frac{1}{n^2} \sum_{k,l=1}^n |Y_{k}-Y_{l}|_q.} Some properties: \itemize{ \item \eqn{0 \leq \mathcal I_n \leq 1} (Theorem 1). \item Large values of \eqn{n \mathcal I_n^2} (or \eqn{\mathcal I_n}) support the alternative hypothesis that the sampled random variables are dependent. \item \eqn{\mathcal I_n} is invariant to shifts and orthogonal transformations of X and Y. \item \eqn{\sqrt{n} \, \mathcal I_n} determines a statistically consistent test of independence against all fixed dependent alternatives (Corollary 1). \item The population independence coefficient \eqn{\mathcal I} is a normalized distance between the joint characteristic function and the product of the marginal characteristic functions. \eqn{\mathcal I_n} converges almost surely to \eqn{\mathcal I} as \eqn{n \to \infty}. X and Y are independent if and only if \eqn{\mathcal I(X, Y) = 0}. See the reference below for more details. }} \value{ \code{mvI} returns the statistic. \code{mvI.test} returns a list with class \code{htest} containing \item{ method}{ description of test} \item{ statistic}{ observed value of the test statistic \eqn{n\mathcal I_n^2}{n I_n^2}} \item{ estimate}{ \eqn{\mathcal I_n}{I_n}} \item{ replicates}{ permutation replicates} \item{ p.value}{ p-value of the test} \item{ data.name}{ description of data} } \references{ Bakirov, N.K., Rizzo, M.L., and Szekely, G.J. (2006), A Multivariate Nonparametric Test of Independence, \emph{Journal of Multivariate Analysis} 93/1, 58-80. Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007), Measuring and Testing Dependence by Correlation of Distances, \emph{Annals of Statistics}, Vol. 35 No. 6, pp. 2769-2794. Szekely, G.J. and Rizzo, M.L. (2009), Brownian Distance Covariance, \emph{Annals of Applied Statistics}, Vol. 3, No. 4, 1236-1265. } \note{ On scale invariance: Distance correlation (\code{\link{dcor}}) has the property that if we change the scale of X from e.g., meters to kilometers, and the scale of Y from e.g. grams to ounces, the statistic and the test are not changed. \eqn{\mathcal I_n}{I_n} does not have this property; it is invariant only under a common rescaling of X and Y by the same constant. Thus, if the units of measurement change for either or both variables, dCor is invariant, but \eqn{\mathcal I_n}{I_n} and possibly the \code{mvI.test} decision changes. } \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and Gabor J. 
Szekely } \examples{ mvI(iris[1:25, 1], iris[1:25, 2]) \donttest{ mvI.test(iris[1:25, 1], iris[1:25, 2], R=99) } } \seealso{ \code{ \link{dcov.test} } \code{ \link{dcov} } \code{ \link{dcor.test} } \code{ \link{dcor} } \code{ \link{dcov2d} } \code{ \link{dcor2d} } \code{ \link{indep.test} } } \keyword{ htest } \keyword{ multivariate } \keyword{ nonparametric } \concept{ independence } \concept{ energy statistics } energy/man/energy-deprecated.Rd0000644000176200001440000000124414661135547016211 0ustar liggesusers\name{energy-deprecated} \alias{DCOR} \title{ Deprecated Functions} \description{ These deprecated functions have been replaced by revised functions and will be removed in future releases of the energy package.} \usage{ DCOR(x, y, index=1.0) } \arguments{ \item{x}{ data or distances of first sample} \item{y}{ data or distances of second sample} \item{index}{ exponent on Euclidean distance in (0, 2)} } \details{ DCOR is an R version replaced by faster compiled code. } \keyword{ multivariate } \keyword{ nonparametric } \concept{ independence } \concept{ multivariate } \concept{ distance correlation } \concept{ distance covariance } \concept{ energy statistics } energy/man/dcov.test.Rd0000644000176200001440000001237214250476236014534 0ustar liggesusers\name{dcov.test} \alias{distance covariance} \alias{dcov.test} \alias{dcor.test} \title{ Distance Covariance Test and Distance Correlation test} \description{ Distance covariance test and distance correlation test of multivariate independence. Distance covariance and distance correlation are multivariate measures of dependence.} \usage{ dcov.test(x, y, index = 1.0, R = NULL) dcor.test(x, y, index = 1.0, R) } \arguments{ \item{x}{ data or distances of first sample} \item{y}{ data or distances of second sample} \item{R}{ number of replicates} \item{index}{ exponent on Euclidean distance, in (0,2]} } \details{ \code{dcov.test} and \code{dcor.test} are nonparametric tests of multivariate independence. The test decision is obtained via permutation bootstrap, with \code{R} replicates. The sample sizes (number of rows) of the two samples must agree, and samples must not contain missing values. The \code{index} is an optional exponent on Euclidean distance. Valid exponents for energy are in (0, 2) excluding 2. Argument types supported are numeric data matrix, data.frame, or tibble, with observations in rows; numeric vector; ordered or unordered factors. In case of unordered factors a 0-1 distance matrix is computed. Optionally pre-computed distances can be input as class "dist" objects or as distance matrices. For data types of arguments, distance matrices are computed internally. The \code{dcov} test statistic is \eqn{n \mathcal V_n^2}{nV_n^2} where \eqn{\mathcal V_n(x,y)}{V_n(x,y)} = dcov(x,y), which is based on interpoint Euclidean distances \eqn{\|x_{i}-x_{j}\|}{||x_{i}-x_{j}||}. The \code{index} is an optional exponent on Euclidean distance. Similarly, the \code{dcor} test statistic is based on the normalized coefficient, the distance correlation. (See the manual page for \code{dcor}.) Distance correlation is a new measure of dependence between random vectors introduced by Szekely, Rizzo, and Bakirov (2007). For all distributions with finite first moments, distance correlation \eqn{\mathcal R}{R} generalizes the idea of correlation in two fundamental ways: (1) \eqn{\mathcal R(X,Y)}{R(X,Y)} is defined for \eqn{X} and \eqn{Y} in arbitrary dimension. (2) \eqn{\mathcal R(X,Y)=0}{R(X,Y)=0} characterizes independence of \eqn{X} and \eqn{Y}. 
Characterization (2) also holds for powers of Euclidean distance \eqn{\|x_i-x_j\|^s}{|x_i-x_j|^s}, where \eqn{0 50 n <- 100 x <- rnorm(100) y <- rnorm(100) all.equal(dcov(x, y)^2, dcov2d(x, y), check.attributes = FALSE) all.equal(bcdcor(x, y), dcor2d(x, y, "U"), check.attributes = FALSE) x <- rlnorm(400) y <- rexp(400) dcov.test(x, y, R=199) #permutation test dcor.test(x, y, R=199) } } \keyword{ htest } \keyword{ nonparametric } \concept{ independence } \concept{ distance correlation } \concept{ distance covariance } \concept{ energy statistics } energy/man/poisson.Rd0000644000176200001440000001032314227267103014302 0ustar liggesusers\name{Poisson Tests} \alias{poisson.tests} \alias{poisson.e} \alias{poisson.etest} \alias{poisson.m} \alias{poisson.mtest} \title{ Goodness-of-Fit Tests for Poisson Distribution} \description{ Performs the mean distance goodness-of-fit test and the energy goodness-of-fit test of Poisson distribution with unknown parameter. } \usage{ poisson.e(x) poisson.m(x) poisson.etest(x, R) poisson.mtest(x, R) poisson.tests(x, R, test="all") } \arguments{ \item{x}{ vector of nonnegative integers, the sample data } \item{R}{ number of bootstrap replicates } \item{test}{ name of test(s) } } \details{ Two distance-based tests of Poissonity are applied in \code{poisson.tests}, "M" and "E". The default is to do all tests and return results in a data frame. Valid choices for \code{test} are "M", "E", or "all" with default "all". If "all" tests, all tests are performed by a single parametric bootstrap computing all test statistics on each sample. The "M" choice is two tests, one based on a Cramer-von Mises distance and the other an Anderson-Darling distance. The "E" choice is the energy goodness-of-fit test. \code{R} must be a positive integer for a test. If \code{R} is missing or 0, a warning is printed but test statistics are computed (without testing). The mean distance test of Poissonity (M-test) is based on the result that the sequence of expected values E|X-j|, j=0,1,2,... characterizes the distribution of the random variable X. As an application of this characterization one can get an estimator \eqn{\hat F(j)} of the CDF. The test statistic (see \code{\link{poisson.m}}) is a Cramer-von Mises type of distance, with M-estimates replacing the usual EDF estimates of the CDF: \deqn{M_n = n\sum_{j=0}^\infty (\hat F(j) - F(j\;; \hat \lambda))^2 f(j\;; \hat \lambda).}{M_n = n sum [j>=0] (\hat F(j) - F(j; \hat \lambda))^2 f(j; \hat \lambda).} In \code{poisson.tests}, an Anderson-Darling type of weight is also applied when \code{test="M"} or \code{test="all"}. The tests are implemented by parametric bootstrap with \code{R} replicates. An energy goodness-of-fit test (E) is based on the test statistic \deqn{Q_n = n (\frac{2}{n} \sum_{i=1}^n E|x_i - X| - E|X-X'| - \frac{1}{n^2} \sum_{i,j=1}^n |x_i - x_j|, }{Q_n = n((2/n) sum[1:n] E|x_i-X| - E|X-X'| - (1/n^2) sum[1:n,1:n] |x_i-x_j|),} where X and X' are iid with the hypothesized null distribution. For a test of H: X ~ Poisson(\eqn{\lambda}), we can express E|X-X'| in terms of Bessel functions, and E|x_i - X| in terms of the CDF of Poisson(\eqn{\lambda}). If test=="all" or not specified, all tests are run with a single parametric bootstrap. \code{poisson.mtest} implements only the Poisson M-test with Cramer-von Mises type distance. \code{poisson.etest} implements only the Poisson energy test. } \value{ The functions \code{poisson.m} and \code{poisson.e} return the test statistics. 
The function \code{poisson.mtest} or \code{poisson.etest} return an \code{htest} object containing \item{method}{Description of test} \item{statistic}{observed value of the test statistic} \item{p.value}{approximate p-value of the test} \item{data.name}{replicates R} \item{estimate}{sample mean} \code{poisson.tests} returns "M-CvM test", "M-AD test" and "Energy test" results in a data frame with columns \item{estimate}{sample mean} \item{statistic}{observed value of the test statistic} \item{p.value}{approximate p-value of the test} \item{method}{Description of test} which can be coerced to a \code{tibble}. } \note{The running time of the M test is much faster than the E-test.} \references{ Szekely, G. J. and Rizzo, M. L. (2004) Mean Distance Test of Poisson Distribution, \emph{Statistics and Probability Letters}, 67/3, 241-247. \doi{10.1016/j.spl.2004.01.005}. Szekely, G. J. and Rizzo, M. L. (2005) A New Test for Multivariate Normality, \emph{Journal of Multivariate Analysis}, 93/1, 58-80, \doi{10.1016/j.jmva.2003.12.002}. } \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and Gabor J. Szekely } \examples{ x <- rpois(50, 2) poisson.m(x) poisson.e(x) \donttest{ poisson.etest(x, R=199) poisson.mtest(x, R=199) poisson.tests(x, R=199) } } \keyword{ htest } \keyword{ energy } energy/man/eqdist.etest.Rd0000644000176200001440000001330714227266732015240 0ustar liggesusers\name{eqdist.etest} \alias{eqdist.etest} \alias{eqdist.e} \alias{ksample.e} \title{Multisample E-statistic (Energy) Test of Equal Distributions} \description{ Performs the nonparametric multisample E-statistic (energy) test for equality of multivariate distributions. } \usage{ eqdist.etest(x, sizes, distance = FALSE, method=c("original","discoB","discoF"), R) eqdist.e(x, sizes, distance = FALSE, method=c("original","discoB","discoF")) ksample.e(x, sizes, distance = FALSE, method=c("original","discoB","discoF"), ix = 1:sum(sizes)) } \arguments{ \item{x}{ data matrix of pooled sample} \item{sizes}{ vector of sample sizes} \item{distance}{logical: if TRUE, first argument is a distance matrix} \item{method}{ use original (default) or distance components (discoB, discoF)} \item{R}{ number of bootstrap replicates } \item{ix}{ a permutation of the row indices of x } } \details{ The k-sample multivariate \eqn{\mathcal{E}}{E}-test of equal distributions is performed. The statistic is computed from the original pooled samples, stacked in matrix \code{x} where each row is a multivariate observation, or the corresponding distance matrix. The first \code{sizes[1]} rows of \code{x} are the first sample, the next \code{sizes[2]} rows of \code{x} are the second sample, etc. The test is implemented by nonparametric bootstrap, an approximate permutation test with \code{R} replicates. The function \code{eqdist.e} returns the test statistic only; it simply passes the arguments through to \code{eqdist.etest} with \code{R = 0}. The k-sample multivariate \eqn{\mathcal{E}}{E}-statistic for testing equal distributions is returned. The statistic is computed from the original pooled samples, stacked in matrix \code{x} where each row is a multivariate observation, or from the distance matrix \code{x} of the original data. The first \code{sizes[1]} rows of \code{x} are the first sample, the next \code{sizes[2]} rows of \code{x} are the second sample, etc. 
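As a brief sketch of this layout (illustrative only; two three-dimensional samples
of sizes 20 and 30 are assumed), the samples are stacked by rows and their sizes
are passed in \code{sizes}:

\preformatted{
x1 <- matrix(rnorm(20 * 3), nrow = 20)            # first sample, n1 = 20
x2 <- matrix(rnorm(30 * 3, mean = 1), nrow = 30)  # second sample, n2 = 30
pooled <- rbind(x1, x2)
eqdist.etest(pooled, sizes = c(20, 30), R = 199)

## equivalently, supplying a pre-computed distance matrix:
eqdist.etest(as.matrix(dist(pooled)), sizes = c(20, 30), distance = TRUE, R = 199)
}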
The two-sample \eqn{\mathcal{E}}{E}-statistic proposed by Szekely and Rizzo (2004) is the e-distance \eqn{e(S_i,S_j)}, defined for two samples \eqn{S_i, S_j} of size \eqn{n_i, n_j} by \deqn{e(S_i,S_j)=\frac{n_i n_j}{n_i+n_j}[2M_{ij}-M_{ii}-M_{jj}], }{e(S_i, S_j) = (n_i n_j)(n_i+n_j)[2M_(ij)-M_(ii)-M_(jj)],} where \deqn{M_{ij}=\frac{1}{n_i n_j}\sum_{p=1}^{n_i} \sum_{q=1}^{n_j} \|X_{ip}-X_{jq}\|,}{ M_{ij} = 1/(n_i n_j) sum[1:n_i, 1:n_j] ||X_(ip) - X_(jq)||,} \eqn{\|\cdot\|}{|| ||} denotes Euclidean norm, and \eqn{X_{ip}}{ X_(ip)} denotes the p-th observation in the i-th sample. The original (default method) k-sample \eqn{\mathcal{E}}{E}-statistic is defined by summing the pairwise e-distances over all \eqn{k(k-1)/2} pairs of samples: \deqn{\mathcal{E}=\sum_{1 \leq i < j \leq k} e(S_i,S_j). }{\emph{E} = sum[i 0 tests for significance using the test statistic disco "F" ratio (default \code{method="disco"}), or using the between component statistic (\code{method="discoB"}), each implemented by permutation test. If \code{x} is a \code{dist} object, argument \code{distance} is ignored. If \code{x} is a distance matrix, set \code{distance=TRUE}. In the current release \code{disco} computes the decomposition for one-way models only. } \value{ When \code{method="discoF"}, \code{disco} returns a list similar to the return value from \code{anova.lm}, and the \code{print.disco} method is provided to format the output into a similar table. Details: \code{disco} returns a class \code{disco} object, which is a list containing \item{call}{call} \item{method}{method} \item{statistic}{vector of observed statistics} \item{p.value}{vector of p-values} \item{k}{number of factors} \item{N}{number of observations} \item{between}{between-sample distance components} \item{withins}{one-way within-sample distance components} \item{within}{within-sample distance component} \item{total}{total dispersion} \item{Df.trt}{degrees of freedom for treatments} \item{Df.e}{degrees of freedom for error} \item{index}{index (exponent on distance)} \item{factor.names}{factor names} \item{factor.levels}{factor levels} \item{sample.sizes}{sample sizes} \item{stats}{matrix containing decomposition} When \code{method="discoB"}, \code{disco} passes the arguments to \code{disco.between}, which returns a class \code{htest} object. \code{disco.between} returns a class \code{htest} object, where the test statistic is the between-sample statistic (proportional to the numerator of the F ratio of the \code{disco} test. } \references{ M. L. Rizzo and G. J. Szekely (2010). DISCO Analysis: A Nonparametric Extension of Analysis of Variance, Annals of Applied Statistics, Vol. 4, No. 2, 1034-1055. \cr \doi{10.1214/09-AOAS245} } \note{ The current version does all calculations via matrix arithmetic and boot function. Support for more general additive models and a formula interface is under development. \code{disco} methods have been added to the cluster distance summary function \code{edist}, and energy tests for equality of distribution (see \code{eqdist.etest}). } \seealso{ \code{ \link{edist} } \code{ \link{eqdist.e} } \code{ \link{eqdist.etest} } \code{ \link{ksample.e} } } \author{ Maria L. Rizzo \email{mrizzo@bgsu.edu} and Gabor J. 
Szekely } \examples{ ## warpbreaks one-way decompositions data(warpbreaks) attach(warpbreaks) disco(breaks, factors=wool, R=99) ## warpbreaks two-way wool+tension disco(breaks, factors=data.frame(wool, tension), R=0) ## warpbreaks two-way wool*tension disco(breaks, factors=data.frame(wool, tension, wool:tension), R=0) ## When index=2 for univariate data, we get ANOVA decomposition disco(breaks, factors=tension, index=2.0, R=99) aov(breaks ~ tension) ## Multivariate response ## Example on producing plastic film from Krzanowski (1998, p. 381) tear <- c(6.5, 6.2, 5.8, 6.5, 6.5, 6.9, 7.2, 6.9, 6.1, 6.3, 6.7, 6.6, 7.2, 7.1, 6.8, 7.1, 7.0, 7.2, 7.5, 7.6) gloss <- c(9.5, 9.9, 9.6, 9.6, 9.2, 9.1, 10.0, 9.9, 9.5, 9.4, 9.1, 9.3, 8.3, 8.4, 8.5, 9.2, 8.8, 9.7, 10.1, 9.2) opacity <- c(4.4, 6.4, 3.0, 4.1, 0.8, 5.7, 2.0, 3.9, 1.9, 5.7, 2.8, 4.1, 3.8, 1.6, 3.4, 8.4, 5.2, 6.9, 2.7, 1.9) Y <- cbind(tear, gloss, opacity) rate <- factor(gl(2,10), labels=c("Low", "High")) ## test for equal distributions by rate disco(Y, factors=rate, R=99) disco(Y, factors=rate, R=99, method="discoB") ## Just extract the decomposition table disco(Y, factors=rate, R=0)$stats ## Compare eqdist.e methods for rate ## disco between stat is half of original when sample sizes equal eqdist.e(Y, sizes=c(10, 10), method="original") eqdist.e(Y, sizes=c(10, 10), method="discoB") ## The between-sample distance component disco.between(Y, factors=rate, R=0) } \keyword{ htest } \keyword{ multivariate } energy/DESCRIPTION0000644000176200001440000000253414662222332013257 0ustar liggesusersPackage: energy Title: E-Statistics: Multivariate Inference via the Energy of Data Version: 1.7-12 Date: 2024-08-22 Authors@R: c( person("Maria", "Rizzo", , "mrizzo@bgsu.edu", c("aut", "cre")), person("Gabor", "Szekely", , , "aut")) Description: E-statistics (energy) tests and statistics for multivariate and univariate inference, including distance correlation, one-sample, two-sample, and multi-sample tests for comparing multivariate distributions, are implemented. Measuring and testing multivariate independence based on distance correlation, partial distance correlation, multivariate goodness-of-fit tests, k-groups and hierarchical clustering based on energy distance, testing for multivariate normality, distance components (disco) for non-parametric analysis of structured data, and other energy statistics/methods are implemented. Imports: Rcpp (>= 0.12.6), stats, boot, gsl LinkingTo: Rcpp Suggests: MASS, CompQuadForm, knitr, rmarkdown Depends: R (>= 3.1) URL: https://github.com/mariarizzo/energy License: GPL (>= 2) LazyData: true NeedsCompilation: yes Repository: CRAN RoxygenNote: 7.2.3 Packaged: 2024-08-23 23:12:00 UTC; maria Author: Maria Rizzo [aut, cre], Gabor Szekely [aut] Maintainer: Maria Rizzo Date/Publication: 2024-08-24 00:10:02 UTC
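A minimal end-to-end sketch of typical calls (illustrative only; each function used
here is documented in the manual pages above, and the simulated data are arbitrary):

    x <- matrix(rnorm(50 * 3), nrow = 50)
    y <- matrix(rnorm(50 * 3), nrow = 50)
    dcor.test(x, y, R = 199)                               # test of multivariate independence
    eqdist.etest(rbind(x, y), sizes = c(50, 50), R = 199)  # two-sample energy test
    kgroups(x, k = 2, iter.max = 10)                       # energy (k-groups) clustering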