energy/0000755000176200001440000000000013156600704011545 5ustar liggesusersenergy/src/0000755000176200001440000000000013156540276012343 5ustar liggesusersenergy/src/Ecluster.cc0000644000176200001440000000455513156540276014451 0ustar liggesusers/* Ecluster.cc: energy package Author: Maria Rizzo Created: Created: 12 Dec 2002 Revised: 4 Jan 2004 for R-1.8.1 energy package Revised: 28 Jan 2004 */ #include "R.h" #include "Rmath.h" #include "ECl.h" extern "C" { double **alloc_matrix(int r, int c); void free_matrix(double **matrix, int r, int c); void Emin_hclust(double *diss, int *en, int *merge, double *height, int *order); void lower2square(double **dst, double *diss, int n); void Emin_hclust(double *diss, int *en, int *merge, double *height, int *order) { // performs hierarchical E-clustering by minimum cluster E-distance // diss lower.tri of n by n distance matrix, column order // en sample size n // merge (n-1) by 2 array as a vector in col order; see hclust object // height vector length n-1; see hclust object // order vector length n; see hclust object int i, step, I, J; int n = (*en); double e; double *E; double **Edst; double **dst; int *m1, *m2; ECl c; //clustering object c.init(n); dst = alloc_matrix(n, n); Edst = alloc_matrix(n, n); //E dist between clusters E = Calloc(n, double); m1 = Calloc(n-1, int); m2 = Calloc(n-1, int); // convert lower.tri in vector form to square matrix lower2square(dst, diss, n); //E-hierarchical clustering E[0] = c.init_Edst(dst, Edst); step = 0; while (c.len() > 1) { e = c.merge_minEdst(dst, Edst); c.last_pair(&I, &J); height[step] = c.ht(I); step = c.last_merge(m1+step, m2+step); E[step] = e; } //compute the return values for merge and order E[n-1] = 0.0; for (i=0; i using namespace Rcpp; // [[Rcpp::export]] double U_product(NumericMatrix U, NumericMatrix V) { // U and V are U-centered dissimilarity matrices of the two samples int n = U.nrow(); int i, j; double sums = 0.0; for (i = 0; i < n; i++) for (j=0; j using namespace Rcpp; NumericMatrix U_center(NumericMatrix); //[[Rcpp::export]] NumericVector dcovU_stats(NumericMatrix Dx, NumericMatrix Dy) { // x and y must be square distance matrices NumericMatrix A = U_center(Dx); NumericMatrix B = U_center(Dy); double ab = 0.0, aa = 0.0, bb = 0.0; double V, dcorU = 0.0; double eps = std::numeric_limits::epsilon(); //machine epsilon int n = Dx.nrow(); int n2 = n * (n - 3); for (int i=0; i eps) dcorU = ab / sqrt(V); return NumericVector::create( _["dCovU"] = ab, _["bcdcor"] = dcorU, _["dVarXU"] = aa, _["dVarYU"] = bb ); } energy/src/ECl.cc0000644000176200001440000002170613156540276013323 0ustar liggesusers/* ECl.cc: energy package Author: Maria Rizzo Created: 12 Dec 2002 Revised: 4 Jan 2004 for R-1.8.1 energy package Revised: 28 Jan 2004 */ #include "ECl.h" #include extern "C" { //implementation of Cl and ECl classes Cl::~Cl() { //destructor int i; if (isinit==1) { Free(size); Free(step); Free(height); Free(w); for (i=0; i 0) { I = -m1[0]-1; J = -m2[0]-1; combine(I, J); w[0] = J; w[1] = I; i = 1; while (nclus > k) { I = (m1[i]<0)? -m1[i]-1 : w[m1[i]]; J = (m2[i]<0)? -m2[i]-1 : w[m2[i]]; combine(I, J); i++; w[i] = I; } } nclus = clusters(); return nclus; } int Cl::init(int m, int *G, int base) { //initialize cluster with group membership vector G int g, i; init(m); if (base > 0) for (i=0; i0) { cl[k++] = i; m+= size[i]; } if (k!=nclus) error("nclus error"); if (m!=n) error("total size error"); return nclus; } int Cl::clusters() { //count the number of non-empty clusters, and set nclus int i, k=0; for (i=0; i0) k++; if (k>n || k<1) error("nclus error"); nclus = k; return nclus; } int Cl::combine(int I, int J) { //merge Jth row into Ith row //w is preserved int j, m; if (I==J) error("c:I==J"); if (I<0 || J<0 || I>=n || J>=n) error("c:I,J error"); if (size[I]<=0 || size[J]<=0) error("c:empty cluster"); if (nclus < 2) error("c:1 cluster"); m = size[I]; for (j=0; j 0) { for (j=0; j 0) for (i=0; i 0) for (j=0; j 0) for (i=0; i n) return -1; return 0; } int Cl::proximity(int **p) { //p[i][j] is 1 if (i,j) in same cluster //p[i][j] is 0 if (i,j) in different clusters int a, b, i, j, k; for (i=0; i height[J]) { I=w[1]; J=w[0]; } height[I]=Ed[I][J]; combine(I, J); update_Edst(I, J, dst, Ed); return 0.0; } if (nclus == 1) error("last cluster"); if (nclus < 1) error("nclus<1"); I=J=-1; find_minEdst(Ed, &I, &J); if (I>=0) { if (J < I) { p=I; I=J; J=p; } hI = hJ = 0.0; if (step[I] > 0) hI = height[I]; if (step[J] > 0) hJ = height[J]; if (hJ < hI) { p=I; I=J; J=p; } height[I] = Ed[I][J]; d=combine(I,J); if(!d) error("merge_best_pair error"); pE = E; E = update_Edst(I, J, dst, Ed); } return E; } } //end extern "C" energy/src/utilities.c0000644000176200001440000001402413156540276014523 0ustar liggesusers/* utilities.c: some utilities for the energy package Author: Maria L. Rizzo (see energy package on CRAN or at personal.bgsu.edu/~mrizzo) alloc_matrix, alloc_int_matrix, free_matrix, free_int_matrix: use R (Calloc, Free) instead of C (calloc, free) for memory management permute permutes the first n elements of an integer vector row_order converts arg from column order to row order vector2matrix copies double* arg into double** arg distance computes Euclidean distance matrix from double** Euclidean_distance computes Euclidean distance matrix from double* index_distance computes Euclidean distance matrix D then D^index sumdist sums the distance matrix without creating the matrix Notes: 1. index_distance (declaration and body of the function) revised in energy 1.3-0, 2/2011. */ #include #include double **alloc_matrix(int r, int c); int **alloc_int_matrix(int r, int c); void free_matrix(double **matrix, int r, int c); void free_int_matrix(int **matrix, int r, int c); void permute(int *J, int n); void permute_check(int *J, int *N); void roworder(double *x, int *byrow, int r, int c); void vector2matrix(double *x, double **y, int N, int d, int isroworder); void distance(double **bxy, double **D, int N, int d); void Euclidean_distance(double *x, double **Dx, int n, int d); void index_distance(double **Dx, int n, double index); void sumdist(double *x, int *byrow, int *nrow, int *ncol, double *lowersum); double **alloc_matrix(int r, int c) { /* allocate a matrix with r rows and c columns */ int i; double **matrix; matrix = Calloc(r, double *); for (i = 0; i < r; i++) matrix[i] = Calloc(c, double); return matrix; } int **alloc_int_matrix(int r, int c) { /* allocate an integer matrix with r rows and c columns */ int i; int **matrix; matrix = Calloc(r, int *); for (i = 0; i < r; i++) matrix[i] = Calloc(c, int); return matrix; } void free_matrix(double **matrix, int r, int c) { /* free a matrix with r rows and c columns */ int i; for (i = 0; i < r; i++) Free(matrix[i]); Free(matrix); } void free_int_matrix(int **matrix, int r, int c) { /* free an integer matrix with r rows and c columns */ int i; for (i = 0; i < r; i++) Free(matrix[i]); Free(matrix); } void permute(int *J, int n) { /* permute the first n integers of J if n is length(J), equivalent to R: J <- rev(sample(J, length(J), replace=FALSE)) */ int i, j, j0, m=n; for (i=0; i DBL_EPSILON) { for (i=0; i using namespace Rcpp; NumericVector partial_dcor(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz); double partial_dcov(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz); NumericMatrix U_center(NumericMatrix); double U_product(NumericMatrix U, NumericMatrix V); NumericMatrix projection(NumericMatrix Dx, NumericMatrix Dz); // [[Rcpp::export]] NumericVector partial_dcor(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz) { /* partial distance correlation, second formulation Dx, Dy, Dz are symmetric distance or dissimilarity matrices with zero diagonals partial_dcor : vector length 4, partial_dcor[0] is pdcor partial_dcor returns vector [Rxyz, Rxy, Rxz, Ryz] starred versions */ int n = Dx.nrow(); NumericMatrix A(n, n), B(n, n), C(n, n); double Rxy=0.0, Rxz=0.0, Ryz=0.0, Rxyz=0.0, den; double AB, AC, BC, AA, BB, CC, pDCOV; double eps = std::numeric_limits::epsilon(); //machine epsilon A = U_center(Dx); /* U-centering to get A^U etc. */ B = U_center(Dy); C = U_center(Dz); AB = U_product(A, B); AC = U_product(A, C); BC = U_product(B, C); AA = U_product(A, A); BB = U_product(B, B); CC = U_product(C, C); pDCOV = U_product(projection(Dx, Dz), projection(Dy, Dz)); den = sqrt(AA*BB); if (den > eps) Rxy = AB / den; den = sqrt(AA*CC); if (den > eps) Rxz = AC / den; den = sqrt(BB*CC); if (den > eps) Ryz = BC / den; den = sqrt(1 - Rxz*Rxz) * sqrt(1 - Ryz * Ryz); if (den > eps) Rxyz = (Rxy - Rxz * Ryz) / den; else { Rxyz = 0.0; } return NumericVector::create( _["pdcor"] = Rxyz, _["pdcov"] = pDCOV, _["Rxy"] = Rxy, _["Rxz"] = Rxz, _["Ryz"] = Ryz ); } //[[Rcpp::export]] double partial_dcov(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz) { /* pdcov following the definition via projections Dx, Dy, Dz are symmetric distance or dissimilarity matrices with zero diagonals returns pdcov sample coefficient */ int n = Dx.nrow(); int i, j; NumericMatrix A(n, n), B(n, n), C(n, n), Pxz(n, n), Pyz(n, n); double AC, BC, CC, c1, c2; double eps = std::numeric_limits::epsilon(); //machine epsilon A = U_center(Dx); /* U-centering to get A^U etc. */ B = U_center(Dy); C = U_center(Dz); AC = U_product(A, C); BC = U_product(B, C); CC = U_product(C, C); c1 = c2 = 0.0; // if (C,C)==0 then C=0 and both (A,C)=0 and (B,C)=0 if (fabs(CC) > eps) { c1 = AC / CC; c2 = BC / CC; } for (i=0; i Created: 12 Dec 2002 Revised: 4 Jan 2004 for R-1.8.1 energy package Revised: 28 Jan 2004 */ #define DLLEXPORT #include #include #include #include //declarations for class Cl for hierarchical cluster analysis //and ECl for e-clustering #define EPS DBL_EPSILON*20.0 #define ONE 1.0+DBL_EPSILON*20.0 class DLLEXPORT Cl { //for cluster analysis protected: int n; //number of observations int nclus; //number of clusters int it; //number of changes to clusters int pstep1; int pstep2; int psize1; int psize2; int r1; int r2; int c1; int c2; int temp; int isinit; //is memory allocated for arrays int *size; //sizes of clusters int *step; //step when cluster formed double *height; //distance between merging clusters int *w; int **clus; //indices of observations public: Cl(){isinit=0;}; //no memory is allocated, call init(n) ~Cl(); int init(int m); int init(int n, int *m1, int *m2, int k); int init(int m, int *G, int base); int dim() {return n;} int len() {return nclus;} int len(int i) {return size[i];} int obs(int i, int j) {return clus[i][j];} double ht(int i) {return height[i];} int next_cl(int p) {p++;while(p #include #include // for NULL #include /* declarations to register native routines in this package */ /* .C calls */ extern void dCOV(void *, void *, void *, void *, void *, void *, void *); extern void dCOVtest(void *, void *, void *, void *, void *, void *, void *, void *); extern void Emin_hclust(void *, void *, void *, void *, void *); extern void indepE(void *, void *, void *, void *, void *); extern void indepEtest(void *, void *, void *, void *, void *, void *, void *); extern void ksampleEtest(void *, void *, void *, void *, void *, void *, void *, void *, void *); extern void poisMstat(void *, void *, void *); /* .Call calls */ extern SEXP _energy_D_center(SEXP); extern SEXP _energy_dcovU_stats(SEXP, SEXP); extern SEXP _energy_mvnEstat(SEXP); extern SEXP _energy_partial_dcor(SEXP, SEXP, SEXP); extern SEXP _energy_partial_dcov(SEXP, SEXP, SEXP); extern SEXP _energy_projection(SEXP, SEXP); extern SEXP _energy_U_center(SEXP); extern SEXP _energy_U_product(SEXP, SEXP); static const R_CMethodDef CEntries[] = { {"dCOV", (DL_FUNC) &dCOV, 7}, {"dCOVtest", (DL_FUNC) &dCOVtest, 8}, {"Emin_hclust", (DL_FUNC) &Emin_hclust, 5}, {"indepE", (DL_FUNC) &indepE, 5}, {"indepEtest", (DL_FUNC) &indepEtest, 7}, {"ksampleEtest", (DL_FUNC) &ksampleEtest, 9}, {"poisMstat", (DL_FUNC) &poisMstat, 3}, {NULL, NULL, 0} }; static const R_CallMethodDef CallEntries[] = { {"_energy_D_center", (DL_FUNC) &_energy_D_center, 1}, {"_energy_dcovU_stats", (DL_FUNC) &_energy_dcovU_stats, 2}, {"_energy_mvnEstat", (DL_FUNC) &_energy_mvnEstat, 1}, {"_energy_partial_dcor", (DL_FUNC) &_energy_partial_dcor, 3}, {"_energy_partial_dcov", (DL_FUNC) &_energy_partial_dcov, 3}, {"_energy_projection", (DL_FUNC) &_energy_projection, 2}, {"_energy_U_center", (DL_FUNC) &_energy_U_center, 1}, {"_energy_U_product", (DL_FUNC) &_energy_U_product, 2}, {NULL, NULL, 0} }; void R_init_energy(DllInfo *dll) { R_registerRoutines(dll, CEntries, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); } energy/src/mvnorm.cpp0000644000176200001440000000442113156540276014366 0ustar liggesusers// Maria L. Rizzo #include using namespace Rcpp; double mvnEstat(NumericMatrix y); double sumdist(NumericMatrix x); // M_ are constants defined in gnu c math.h //[[Rcpp::export]] double mvnEstat(NumericMatrix y) { // compute E test statistic for multivariate normality // y is a *standardized* multivariate sample int d = y.ncol(), n = y.nrow(); int i, j, k, maxterms=2000; double meanyy, meanyz, meanzz, stat; double delta, eps = 1.0e-7; double normy, yy, dif, sum, sum0, term; double D = (double) d; double lg0 = R::lgammafn(D / (double) 2); double lg1 = R::lgammafn((D+1.0) / (double) 2); double kd, logak, loggk; meanzz = 2.0 * exp(lg1 - lg0); // the second mean // computing the first mean as series meanyz = 0.0; for (i=0; i eps && k < maxterms) { kd = (double) k; sum0 = sum; logak = (kd+1) * log(yy) - R::lgammafn(kd+1) - kd*M_LN2 - log(2*kd+1) - log(2*kd+2); loggk = lg1 + R::lgammafn(kd+1.5) - R::lgammafn(kd+D/2+1); term = exp(logak + loggk); if (k % 2 == 0) sum += term; else sum -= term; delta = fabs(sum - sum0); k++; } if (delta < eps) meanyz += meanzz/M_SQRT2 + M_SQRT_2dPI * sum; else { meanyz += normy; Rf_warning("E|y-Z| did not converge, replaced by %f", normy); } } meanyz /= (double) n; meanyy = sumdist(y); // computing third mean meanyy = (2.0 * meanyy / (double)(n*n)); stat = ((double) n)*(2.0 * meanyz - meanzz - meanyy); return stat; } double sumdist(NumericMatrix x) { // sum the pairwise distances between rows of data matrix x // without storing the distance matrix // lower triangle only // result is equivalent to this in R: sum(dist(x)) int n = x.nrow(), d = x.ncol(); double s = 0.0, dsum, dif; for (int i=1; i using namespace Rcpp; NumericMatrix U_center(NumericMatrix); double U_product(NumericMatrix, NumericMatrix); // [[Rcpp::export]] NumericMatrix projection(NumericMatrix Dx, NumericMatrix Dz) { /* returns the projection of A(x) distance matrix Dx onto the orthogonal complement of C(z) distance matrix; both Dx and Dz are n by n distance or dissimilarity matrices the projection is an n by n matrix */ int n = Dx.nrow(); int i, j; NumericMatrix A(n, n), C(n, n), P(n, n); double AC, CC, c1; double eps = std::numeric_limits::epsilon(); //machine epsilon A = U_center(Dx); // U-centering to get A^U etc. C = U_center(Dz); AC = U_product(A, C); // (A,C) = dcov^U CC = U_product(C, C); c1 = 0.0; // if (C,C)==0 then C==0 so c1=(A,C)=0 if (fabs(CC) > eps) c1 = AC / CC; for (i=0; i do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #include using namespace Rcpp; // D_center NumericMatrix D_center(NumericMatrix Dx); RcppExport SEXP _energy_D_center(SEXP DxSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); rcpp_result_gen = Rcpp::wrap(D_center(Dx)); return rcpp_result_gen; END_RCPP } // U_center NumericMatrix U_center(NumericMatrix Dx); RcppExport SEXP _energy_U_center(SEXP DxSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); rcpp_result_gen = Rcpp::wrap(U_center(Dx)); return rcpp_result_gen; END_RCPP } // dcovU_stats NumericVector dcovU_stats(NumericMatrix Dx, NumericMatrix Dy); RcppExport SEXP _energy_dcovU_stats(SEXP DxSEXP, SEXP DySEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dy(DySEXP); rcpp_result_gen = Rcpp::wrap(dcovU_stats(Dx, Dy)); return rcpp_result_gen; END_RCPP } // mvnEstat double mvnEstat(NumericMatrix y); RcppExport SEXP _energy_mvnEstat(SEXP ySEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type y(ySEXP); rcpp_result_gen = Rcpp::wrap(mvnEstat(y)); return rcpp_result_gen; END_RCPP } // partial_dcor NumericVector partial_dcor(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz); RcppExport SEXP _energy_partial_dcor(SEXP DxSEXP, SEXP DySEXP, SEXP DzSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dy(DySEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dz(DzSEXP); rcpp_result_gen = Rcpp::wrap(partial_dcor(Dx, Dy, Dz)); return rcpp_result_gen; END_RCPP } // partial_dcov double partial_dcov(NumericMatrix Dx, NumericMatrix Dy, NumericMatrix Dz); RcppExport SEXP _energy_partial_dcov(SEXP DxSEXP, SEXP DySEXP, SEXP DzSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dy(DySEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dz(DzSEXP); rcpp_result_gen = Rcpp::wrap(partial_dcov(Dx, Dy, Dz)); return rcpp_result_gen; END_RCPP } // projection NumericMatrix projection(NumericMatrix Dx, NumericMatrix Dz); RcppExport SEXP _energy_projection(SEXP DxSEXP, SEXP DzSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type Dx(DxSEXP); Rcpp::traits::input_parameter< NumericMatrix >::type Dz(DzSEXP); rcpp_result_gen = Rcpp::wrap(projection(Dx, Dz)); return rcpp_result_gen; END_RCPP } // U_product double U_product(NumericMatrix U, NumericMatrix V); RcppExport SEXP _energy_U_product(SEXP USEXP, SEXP VSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type U(USEXP); Rcpp::traits::input_parameter< NumericMatrix >::type V(VSEXP); rcpp_result_gen = Rcpp::wrap(U_product(U, V)); return rcpp_result_gen; END_RCPP } energy/src/centering.cpp0000644000176200001440000000331213156540276015024 0ustar liggesusers// double centering utilities for the energy package // // Maria L. Rizzo // August, 2016 #include using namespace Rcpp; NumericMatrix D_center(NumericMatrix Dx); NumericMatrix U_center(NumericMatrix Dx); // [[Rcpp::export]] NumericMatrix D_center(NumericMatrix Dx) { /* computes the double centered distance matrix for distance matrix Dx for dCov, dCor, etc. a_{ij} - a_{i.}/n - a_{.j}/n + a_{..}/n^2, all i, j */ int j, k; int n = Dx.nrow(); NumericVector akbar(n); NumericMatrix A(n, n); double abar = 0.0; for (k=0; k #include void dCOVtest(double *x, double *y, int *byrow, int *dims, double *index, double *reps, double *DCOV, double *pval); void dCovTest(double *x, double *y, int *byrow, int *dims, double *index, double *reps, double *Dstat, double *pval); void dCOV(double *x, double *y, int *byrow, int *dims, double *index, int *idx, double *DCOV); double Akl(double **akl, double **A, int n); /* functions in utilities.c */ extern double **alloc_matrix(int r, int c); extern int **alloc_int_matrix(int r, int c); extern void free_matrix(double **matrix, int r, int c); extern void free_int_matrix(int **matrix, int r, int c); extern void permute(int *J, int n); extern void roworder(double *x, int *byrow, int r, int c); extern void Euclidean_distance(double *x, double **Dx, int n, int d); extern void index_distance(double **Dx, int n, double index); extern void vector2matrix(double *x, double **y, int N, int d, int isroworder); void dCOVtest(double *x, double *y, int *byrow, int *dims, double *index, double *reps, double *DCOV, double *pval) { /* computes dCov(x,y), dCor(x,y), dVar(x), dVar(y) V-statistic is n*dCov^2 where n*dCov^2 --> Q dims[0] = n (sample size) dims[1] = p (dimension of X) dims[2] = q (dimension of Y) dims[3] = dst (logical, TRUE if x, y are distances) dims[4] = R (number of replicates) index : exponent for distance DCOV : vector [dCov, dCor, dVar(x), dVar(y), mean(A), mean(B)] */ int i, j, k, n, n2, p, q, r, J, K, M, R; int dst; int* perm; double **Dx, **Dy, **A, **B; double dcov, V; n = dims[0]; p = dims[1]; q = dims[2]; dst = dims[3]; R = dims[4]; if (*byrow == FALSE) { /* avoid this step: use as.double(t(x)) in R */ roworder(x, byrow, n, p); *byrow = FALSE; /* false for y */ roworder(y, byrow, n, q); } /* critical to pass correct flag dst from R */ Dx = alloc_matrix(n, n); Dy = alloc_matrix(n, n); if (dst) { vector2matrix(x, Dx, n, n, 1); vector2matrix(y, Dy, n, n, 1); } else { Euclidean_distance(x, Dx, n, p); Euclidean_distance(y, Dy, n, q); } index_distance(Dx, n, *index); index_distance(Dy, n, *index); A = alloc_matrix(n, n); B = alloc_matrix(n, n); Akl(Dx, A, n); Akl(Dy, B, n); free_matrix(Dx, n, n); free_matrix(Dy, n, n); n2 = ((double) n) * n; /* compute dCov(x,y), dVar(x), dVar(y) */ for (k=0; k<4; k++) DCOV[k] = 0.0; for (k=0; k 0) DCOV[k] = sqrt(DCOV[k]); else DCOV[k] = 0.0; } /* compute dCor(x, y) */ V = DCOV[2]*DCOV[3]; if (V > DBL_EPSILON) DCOV[1] = DCOV[0] / sqrt(V); else DCOV[1] = 0.0; if (R > 0) { /* compute the replicates */ if (DCOV[1] > 0.0) { perm = Calloc(n, int); M = 0; for (i=0; i= DCOV[0]) M++; } *pval = (double) (M+1) / (double) (R+1); PutRNGstate(); Free(perm); } else { *pval = 1.0; } } free_matrix(A, n, n); free_matrix(B, n, n); return; } void dCOV(double *x, double *y, int *byrow, int *dims, double *index, int *idx, double *DCOV) { /* computes dCov(x,y), dCor(x,y), dVar(x), dVar(y) V-statistic is n*dCov^2 where n*dCov^2 --> Q dims[0] = n (sample size) dims[1] = p (dimension of X) dims[2] = q (dimension of Y) dims[3] = dst (logical, TRUE if x, y are distances) index : exponent for distance idx : index vector, a permutation of sample indices DCOV : vector [dCov, dCor, dVar(x), dVar(y)] */ int j, k, n, n2, p, q, dst; double **Dx, **Dy, **A, **B; double V; n = dims[0]; p = dims[1]; q = dims[2]; dst = dims[3]; if (*byrow == FALSE) { /* avoid this step: use as.double(t(x)) in R */ roworder(x, byrow, n, p); *byrow = FALSE; /* false for y */ roworder(y, byrow, n, q); } /* critical to pass correct flag dst from R */ Dx = alloc_matrix(n, n); Dy = alloc_matrix(n, n); if (dst) { vector2matrix(x, Dx, n, n, 1); vector2matrix(y, Dy, n, n, 1); } else { Euclidean_distance(x, Dx, n, p); Euclidean_distance(y, Dy, n, q); } index_distance(Dx, n, *index); index_distance(Dy, n, *index); A = alloc_matrix(n, n); B = alloc_matrix(n, n); Akl(Dx, A, n); Akl(Dy, B, n); free_matrix(Dx, n, n); free_matrix(Dy, n, n); n2 = ((double) n) * n; /* compute dCov(x,y), dVar(x), dVar(y) */ for (k=0; k<4; k++) DCOV[k] = 0.0; for (k=0; k 0) DCOV[k] = sqrt(DCOV[k]); else DCOV[k] = 0.0; } /* compute dCor(x, y) */ V = DCOV[2]*DCOV[3]; if (V > DBL_EPSILON) DCOV[1] = DCOV[0] / sqrt(V); else DCOV[1] = 0.0; free_matrix(A, n, n); free_matrix(B, n, n); return; } double Akl(double **akl, double **A, int n) { /* -computes the A_{kl} or B_{kl} distances from the distance matrix (a_{kl}) or (b_{kl}) for dCov, dCor, dVar dCov = mean(Akl*Bkl), dVar(X) = mean(Akl^2), etc. */ int j, k; double *akbar; double abar; akbar = Calloc(n, double); abar = 0.0; for (k=0; k Q dims[0] = n (sample size) dims[1] = p (dimension of X) dims[2] = q (dimension of Y) dims[3] = B (number of replicates, dimension of reps) index : exponent for distance Dstat : the statistic dCov^2 (V_n^2) and S1, S2, S3 */ int b, i, j, k, n, p , q, B, I, J, M; int *perm; double Cx, Cy, Cxy, C3, S1, S2, S3, n2, n3; double **Dx, **Dy; n = dims[0]; p = dims[1]; q = dims[2]; B = dims[3]; if (*byrow == FALSE) { /* avoid this step: use as.double(t(x)) in R */ roworder(x, byrow, n, p); *byrow = FALSE; /* false for y */ roworder(y, byrow, n, q); } Dx = alloc_matrix(n, n); Dy = alloc_matrix(n, n); Euclidean_distance(x, Dx, n, p); Euclidean_distance(y, Dy, n, q); index_distance(Dx, n, *index); index_distance(Dy, n, *index); Cx = Cy = Cxy = C3 = 0.0; n2 = ((double) n) * n; n3 = n2 * n; /* compute observed test statistic */ for (i=0; i 0) { GetRNGstate(); if (Dstat[0] > 0.0) { perm = Calloc(n, int); M = 0; for (i=0; i= (*Dstat)) M++; } *pval = (double) (M+1) / (double) (B+1); PutRNGstate(); Free(perm); } else { *pval = 1.0; } } /* test statistic (the V-statistic) is nV_n^2 = n*Dstat[0] a normalized version is n*Dstat[0]/Dstat[2] */ free_matrix(Dx, n, n); free_matrix(Dy, n, n); return; } energy/src/energy.c0000644000176200001440000002342013156540276014001 0ustar liggesusers/* energy.c: energy package Author: Maria Rizzo Created: 4 Jan 2004 Updated: 2 April 2008 some functions moved to utilities.c Updated: 25 August 2016 mvnEstat converted to c++ in mvnorm.cpp ksampleEtest() performs the multivariate E-test for equal distributions, complete version, from data matrix E2sample() computes the 2-sample E-statistic without creating distance poisMstat() computes the mean distance test of Poissonity */ #include #include void poisMstat(int *x, int *nx, double *stat); void ksampleEtest(double *x, int *byrow, int *nsamples, int *sizes, int *dim, int *R, double *e0, double *e, double *pval); void E2sample(double *x, int *sizes, int *dim, double *stat); double edist(double **D, int m, int n); double multisampleE(double **D, int nsamples, int *sizes, int *perm); double twosampleE(double **D, int m, int n, int *xrows, int *yrows); double E2(double **x, int *sizes, int *start, int ncol, int *perm); double Eksample(double *x, int *byrow, int r, int d, int K, int *sizes, int *ix); void distance(double **bxy, double **D, int N, int d); /* utilities.c */ extern double **alloc_matrix(int r, int c); extern int **alloc_int_matrix(int r, int c); extern void free_matrix(double **matrix, int r, int c); extern void free_int_matrix(int **matrix, int r, int c); extern void permute(int *J, int n); extern void roworder(double *x, int *byrow, int r, int c); extern void vector2matrix(double *x, double **y, int N, int d, int isroworder); extern void distance(double **bxy, double **D, int N, int d); extern void Euclidean_distance(double *x, double **Dx, int n, int d); extern void index_distance(double *x, double **Dx, int n, int d, double index); extern void sumdist(double *x, int *byrow, int *nrow, int *ncol, double *lowersum); void poisMstat(int *x, int *nx, double *stat) { /* computes the Poisson mean distance statistic */ int i, j, k, n=(*nx); double eps=1.0e-10; double cvm, d, lambda, m, q; double Mcdf1, Mcdf0, Mpdf1, cdf1, cdf0; lambda = 0; for (i=0; i 1) Mcdf1 = 1.0; cdf1 = ppois(i, lambda, TRUE, FALSE); /* MLE of F(i) */ d = Mcdf1 - cdf1; cvm += d * d * (cdf1 - cdf0); cdf0 = cdf1; Mcdf0 = Mcdf1; } cvm *= n; *stat = cvm; } void E2sample(double *x, int *sizes, int *dim, double *stat) { /* compute test statistic *stat for testing H:F=G does not store distance matrix x must be in row order: x=as.double(t(x)) where x is pooled sample in matrix sum(en) by dim */ int m=sizes[0], n=sizes[1], d=(*dim); int i, j, k, p, q; double dif, dsum, sumxx, sumxy, sumyy, w; sumxy = 0.0; for (i=0; i 0) { data = alloc_matrix(N, d); /* sample matrix */ vector2matrix(x, data, N, d, *byrow); distance(data, D, N, d); free_matrix(data, N, d); } else vector2matrix(x, D, N, N, *byrow); *e0 = multisampleE(D, K, sizes, perm); /* bootstrap */ if (B > 0) { ek = 0; GetRNGstate(); for (b=0; b Created: June 15, 2004 (development) Last Modified: April 5, 2008 */ #include #include void indepE(double *x, double *y, int *byrow, int *dims, double *Istat); void indepEtest(double *x, double *y, int *byrow, int *dims, double *Istat, double *reps, double *pval); void squared_distance(double *x, double **D, int n, int d); extern double **alloc_matrix(int r, int c); extern int **alloc_int_matrix(int r, int c); extern void free_matrix(double **matrix, int r, int c); extern void free_int_matrix(int **matrix, int r, int c); extern void permute(int *J, int n); extern void roworder(double *x, int *byrow, int r, int c); extern void Euclidean_distance(double *x, double **D, int n, int d); void indepE(double *x, double *y, int *byrow, int *dims, double *Istat) { /* E statistic for multiv. indep. of X in R^p and Y in R^q statistic returned is I_n^2 [nI_n^2 has a limit dist under indep] dims[0] = n (sample size) dims[1] = p (dimension of X) dims[2] = q (dimension of Y) Istat : the statistic I_n (normalized) */ int i, j, k, m, n, p, q; double Cx, Cy, Cz, C3, C4, n2, n3, n4, v; double **D2x, **D2y; n = dims[0]; p = dims[1]; q = dims[2]; if (*byrow == FALSE) { /* avoid this step: use as.double(t(x)) in R */ roworder(x, byrow, n, p); *byrow = FALSE; /* false for y */ roworder(y, byrow, n, q); } D2x = alloc_matrix(n, n); D2y = alloc_matrix(n, n); Euclidean_distance(x, D2x, n, p); Euclidean_distance(y, D2y, n, q); Cx = Cy = Cz = C3 = C4 = 0.0; n2 = ((double) n) * n; n3 = n2 * n; n4 = n2 * n2; /* compute observed test statistic */ for (i=0; i Q dims[0] = n (sample size) dims[1] = p (dimension of X) dims[2] = q (dimension of Y) dims[3] = B (number of replicates, dimension of reps) Istat : the statistic I_n (normalized) */ int b, i, j, k, m, n, p, q, B, M; int *perm; double Cx, Cy, Cz, C3, C4, n2, n3, n4, v; double **D2x, **D2y; n = dims[0]; p = dims[1]; q = dims[2]; B = dims[3]; if (*byrow == FALSE) { /* avoid this step: use as.double(t(x)) in R */ roworder(x, byrow, n, p); *byrow = FALSE; /* false for y */ roworder(y, byrow, n, q); } D2x = alloc_matrix(n, n); D2y = alloc_matrix(n, n); squared_distance(x, D2x, n, p); squared_distance(y, D2y, n, q); Cx = Cy = Cz = C3 = C4 = 0.0; n2 = ((double) n) * n; n3 = n2 * n; n4 = n2 * n2; /* compute observed test statistic */ for (i=0; i 0) { GetRNGstate(); perm = Calloc(n, int); for (i=0; i= (*Istat)) M++; } *pval = (double) M / (double) B; PutRNGstate(); Free(perm); } free_matrix(D2x, n, n); free_matrix(D2y, n, n); return; } void squared_distance(double *x, double **D2, int n, int d) { /* interpret x as an n by d matrix, in row order (n vectors in R^d) compute the squared distance matrix D2 */ int i, j, k, p, q; double dsum, dif; for (i=1; i 0) p <- 1 - mean(bootobj$t < bootobj$t0) else p <- NA names(bootobj$t0) <- "test statistic" names(lambda) <- "mean" e <- list( method = paste("Mean distance test of Poisson distribution", sep = ""), statistic = bootobj$t0, p.value = p, data.name = paste("sample size ", n, ", replicates ", R, sep=""), estimate = lambda) class(e) <- "htest" e } poisson.m<- function(x) { # mean distance statistic for Poissonity n <- length(x) stat <- 0 e <- .C("poisMstat", x = as.integer(x), nx = as.integer(n), stat = as.double(stat), PACKAGE = "energy")$stat e } energy/R/Eeqdist.R0000644000176200001440000000712613144317702013475 0ustar liggesuserseqdist.e <- function(x, sizes, distance = FALSE, method = c("original","discoB","discoF")) { ## multivariate E-statistic for testing equal distributions ## x: matrix of pooled sample or distance matrix ## sizes: vector of sample sizes ## distance: logical, TRUE if x is a distance matrix, otherwise false ## method: original (default) or disco between components, or disco F ratio method <-match.arg(method) if (method=="discoB") { g <- as.factor(rep(1:length(sizes), sizes)) RVAL <- disco(x, factors=g, distance=distance, R=0, method=method) } else { RVAL <- eqdist.etest(x, sizes, distance = distance, R=0, method=method)$statistic } RVAL } eqdist.etest <- function(x, sizes, distance = FALSE, method = c("original","discoB","discoF"), R) { ## multivariate E-test of the multisample hypothesis of equal distributions ## x: matrix of pooled sample or distance matrix ## sizes: vector of sample sizes ## distance: logical, TRUE if x is a distance matrix, otherwise false ## method: original (default) or disco components ## R: number of replicates ## method <-match.arg(method) if (method=="discoB" || method=="discoF") { g <- as.factor(rep(1:length(sizes), sizes)) # for other index use disco() function directly return(disco(x, factors=g, distance=distance, index=1.0, R=R, method=method)) } nsamples <- length(sizes) if (nsamples < 2) return (NA) if (min(sizes) < 1) return (NA) if (!is.null(attr(x, "Size"))) distance <- TRUE x <- as.matrix(x) if (NROW(x) != sum(sizes)) stop("nrow(x) should equal sum(sizes)") if (distance == FALSE && nrow(x) == ncol(x)) warning("square data matrix with distance==FALSE") d <- NCOL(x) if (distance == TRUE) d <- 0 str <- "Multivariate " if (d == 1) str <- "Univariate " if (d == 0) str <- "" e0 <- 0.0 repl <- rep(0, R) pval <- 1.0 b <- .C("ksampleEtest", x = as.double(t(x)), byrow = as.integer(1), nsamples = as.integer(nsamples), sizes = as.integer(sizes), dim = as.integer(d), R = as.integer(R), e0 = as.double(e0), e = as.double(repl), pval = as.double(pval), PACKAGE = "energy") names(b$e0) <- "E-statistic" sz <- paste(sizes, collapse = " ", sep = "") methodname <- paste(str, length(sizes), "-sample E-test of equal distributions", sep = "") dataname <- paste("sample sizes ", sz, ", replicates ", R, sep="") e <- list( call = match.call(), method = methodname, statistic = b$e0, p.value = b$pval, data.name = dataname) class(e) <- "htest" e } ksample.e <- function(x, sizes, distance = FALSE, method = c("original","discoB","discoF"), ix = 1:sum(sizes)) { ## computes k-sample E-statistics for equal distributions ## retained for backward compatibility or use with boot ## (this function simply passes arguments to eqdist.e) ## ## x: pooled sample or distance matrix ## sizes: vector of sample sizes ## distance: TRUE if x is a distance matrix, otherwise FALSE ## method: default (original) or disco between components or disco F ratio ## ix: a permutation of row indices of x ## x <- as.matrix(x) method <- match.arg(method) eqdist.e(x[ix,], sizes=sizes, distance=distance, method=method) } energy/R/Emvnorm.R0000644000176200001440000000342013144317724013517 0ustar liggesusersmvnorm.etest <- function(x, R) { # parametric bootstrap E-test for multivariate normality if (is.vector(x)) { n <- length(x) d <- 1 bootobj <- boot::boot(x, statistic = normal.e, R = R, sim = "parametric", ran.gen = function(x, y) { return(rnorm(n)) }) } else { n <- nrow(x) d <- ncol(x) bootobj <- boot::boot(x, statistic = mvnorm.e, R = R, sim = "parametric", ran.gen = function(x, y) { return(matrix(rnorm(n * d), nrow = n, ncol = d)) }) } if (R > 0) p <- 1 - mean(bootobj$t < bootobj$t0) else p <- NA names(bootobj$t0) <- "E-statistic" e <- list(statistic = bootobj$t0, p.value = p, method = "Energy test of multivariate normality: estimated parameters", data.name = paste("x, sample size ", n, ", dimension ", d, ", replicates ", R, sep = "")) class(e) <- "htest" e } mvnorm.e <- function(x) { # E-statistic for multivariate normality if (is.vector(x)) return(normal.e(x)) n <- nrow(x) d <- ncol(x) if (n < 2) return(normal.e(x)) # subtract column means and compute S^(-1/2) z <- scale(x, scale = FALSE) ev <- eigen(var(x), symmetric = TRUE) P <- ev$vectors lambda <- ev$values y <- z %*% (P %*% diag(1/sqrt(lambda)) %*% t(P)) if (any(!is.finite(y))) return(NA) return(mvnEstat(y)) } normal.e <- function(x) { x <- as.vector(x) y <- sort(x) n <- length(y) if (y[1] == y[n]) return(NA) y <- scale(y) K <- seq(1 - n, n - 1, 2) return(2 * (sum(2 * y * pnorm(y) + 2 * dnorm(y)) - n/sqrt(pi) - mean(K * y))) } energy/R/dcovu.R0000644000176200001440000000202212757710073013214 0ustar liggesusers## dcovu.R ## unbiased dcov^2 and bias-corrected dcor^2 ## bcdcor <- function(x, y) { ## compute bias corrected distance correlation dcorU(x, y) } dcovU <- function(x, y) { ## unbiased dcov^2 if (!(class(x) == "dist")) x <- dist(x) if (!(class(y) == "dist")) y <- dist(y) x <- as.matrix(x) y <- as.matrix(y) n <- nrow(x) m <- nrow(y) if (n != m) stop("sample sizes must agree") if (! (all(is.finite(c(x, y))))) stop("data contains missing or infinite values") estimates <- dcovU_stats(x, y) #RcppExports return (estimates[1]) } dcorU <- function(x, y) { ## unbiased dcov^2 if (!(class(x) == "dist")) x <- dist(x) if (!(class(y) == "dist")) y <- dist(y) x <- as.matrix(x) y <- as.matrix(y) n <- nrow(x) m <- nrow(y) if (n != m) stop("sample sizes must agree") if (! (all(is.finite(c(x, y))))) stop("data contains missing or infinite values") estimates <- dcovU_stats(x, y) #RcppExports return (estimates[2]) } energy/R/dcorT.R0000644000176200001440000000451612757710073013161 0ustar liggesusers### dcorT.R ### implementation of the distance correlation t-test Astar <- function(d) { ## d is a distance matrix or distance object ## modified or corrected doubly centered distance matrices ## denoted A* (or B*) in JMVA t-test paper (2013) d <- as.matrix(d) n <- nrow(d) if (n != ncol(d)) stop("Argument d should be distance") m <- rowMeans(d) M <- mean(d) a <- sweep(d, 1, m) b <- sweep(a, 2, m) A <- b + M #same as plain A #correction to get A^* A <- A - d/n diag(A) <- m - M (n / (n-1)) * A } BCDCOR <- function(x, y, distance=FALSE) { ## compute bias corrected distance correlation ## attempt to check if distance flag is valid if (distance==FALSE) { if (class(x)=="dist" || class(y)=="dist") stop("distance==FALSE but argument is a dist object") x <- as.matrix(dist(x)) y <- as.matrix(dist(y)) } else { x <- as.matrix(x) y <- as.matrix(y) if (distance == TRUE) if (!isSymmetric(x) || !isSymmetric(y)) stop("distance==TRUE but matrices non-symmetric") } n <- NROW(x) AA <- Astar(x) BB <- Astar(y) XY <- sum(AA*BB) - (n/(n-2)) * sum(diag(AA*BB)) XX <- sum(AA*AA) - (n/(n-2)) * sum(diag(AA*AA)) YY <- sum(BB*BB) - (n/(n-2)) * sum(diag(BB*BB)) list(bcR=XY / sqrt(XX*YY), XY=XY/n^2, XX=XX/n^2, YY=YY/n^2, n=n) } dcor.t <- function(x, y, distance=FALSE) { # computes the t statistic for corrected high-dim dCor # should be approximately student T # distance arg is checked in bcdcor r <- BCDCOR(x, y, distance) Cn <- r$bcR n <- r$n M <- n*(n-3)/2 sqrt(M-1) * Cn / sqrt(1-Cn^2) } dcor.ttest <- function(x, y, distance=FALSE) { # x and y are observed samples or distance # distance arg is checked in bcdcor dname <- paste(deparse(substitute(x)),"and", deparse(substitute(y))) stats <- BCDCOR(x, y, distance) bcR <- stats$bcR n <- stats$n M <- n * (n-3) / 2 df <- M - 1 names(df) <- "df" tstat <- sqrt(M-1) * bcR / sqrt(1-bcR^2) names(tstat) <- "T" estimate <- bcR names(estimate) <- "Bias corrected dcor" pval <- 1 - pt(tstat, df=df) method <- "dcor t-test of independence" rval <- list(statistic = tstat, parameter = df, p.value = pval, estimate=estimate, method=method, data.name=dname) class(rval) <- "htest" return(rval) } energy/R/RcppExports.R0000644000176200001440000000136013144146625014366 0ustar liggesusers# Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 D_center <- function(Dx) { .Call(`_energy_D_center`, Dx) } U_center <- function(Dx) { .Call(`_energy_U_center`, Dx) } dcovU_stats <- function(Dx, Dy) { .Call(`_energy_dcovU_stats`, Dx, Dy) } mvnEstat <- function(y) { .Call(`_energy_mvnEstat`, y) } partial_dcor <- function(Dx, Dy, Dz) { .Call(`_energy_partial_dcor`, Dx, Dy, Dz) } partial_dcov <- function(Dx, Dy, Dz) { .Call(`_energy_partial_dcov`, Dx, Dy, Dz) } projection <- function(Dx, Dz) { .Call(`_energy_projection`, Dx, Dz) } U_product <- function(U, V) { .Call(`_energy_U_product`, U, V) } energy/R/dcov.R0000644000176200001440000001313413156537337013041 0ustar liggesusersdcov.test <- function(x, y, index=1.0, R=NULL) { ## check for valid number of replicates R method <- "Specify the number of replicates R (R > 0) to perform the test of independence" if (! is.null(R)) { R <- floor(R) if (R < 1) R <- 0 if (R > 0) method <- "dCov test of independence" } else { R <- 0 } # distance covariance test for multivariate independence if (!(class(x) == "dist")) x <- dist(x) if (!(class(y) == "dist")) y <- dist(y) x <- as.matrix(x) y <- as.matrix(y) dst <- TRUE n <- nrow(x) m <- nrow(y) if (n != m) stop("Sample sizes must agree") if (! (all(is.finite(c(x, y))))) stop("Data contains missing or infinite values") stat <- dcorr <- reps <- 0 dcov <- rep(0, 4) if (R > 0) reps <- rep(0, R) pval <- 1 dims <- c(n, ncol(x), ncol(y), dst, R) # dcov = [dCov,dCor,dVar(x),dVar(y)] a <- .C("dCOVtest", x = as.double(t(x)), y = as.double(t(y)), byrow = as.integer(TRUE), dims = as.integer(dims), index = as.double(index), reps = as.double(reps), DCOV = as.double(dcov), pval = as.double(pval), PACKAGE = "energy") # test statistic is n times the square of dCov statistic stat <- n * a$DCOV[1]^2 dcorr <- a$DCOV V <- dcorr[[1]] names(stat) <- "nV^2" names(V) <- "dCov" dataname <- paste("index ", index, ", replicates ", R, sep="") pval <- ifelse (R < 1, NA, a$pval) e <- list( statistic = stat, method = method, estimate = V, estimates = dcorr, p.value = pval, replicates = n* a$reps^2, n = n, data.name = dataname) class(e) <- "htest" return(e) } dcor.test <- function(x, y, index=1.0, R) { # distance correlation test for multivariate independence # like dcov.test but using dcor as the test statistic if (is.null(R)) R <- 0 R <- ifelse(R > 0, floor(R), 0) RESULT <- dcov.test(x, y, index=1.0, R) # this test statistic is n times the square of dCov statistic DCOVteststat <- RESULT$statistic DCOVreplicates <- RESULT$replicates # RESULT$estimates = [dCov,dCor,dVar(x),dVar(y)] # dVar are invariant under permutation of sample indices DCORteststat <- RESULT$estimates[2] dvarX <- RESULT$estimates[3] dvarY <- RESULT$estimates[4] n <- RESULT$n DCORreps <- sqrt(DCOVreplicates / n) / sqrt(dvarX * dvarY) if (R > 0) p.value <- (1 + sum(DCORreps >= DCORteststat)) / (1 + R) else p.value <- NA names(DCORteststat) <- "dCor" dataname <- paste("index ", index, ", replicates ", R, sep="") method <- ifelse(R > 0, "dCor test of independence", "Specify the number of replicates R>0 to perform the test of independence") e <- list( method = method, statistic = DCORteststat, estimates = RESULT$estimates, p.value = p.value, replicates = DCORreps, n = n, data.name = dataname) class(e) <- "htest" return(e) } .dcov <- function(x, y, index=1.0) { # distance covariance statistic for independence # dcov = [dCov,dCor,dVar(x),dVar(y)] (vector) # this function provides the fast method for computing dCov # it is called by the dcov and dcor functions if (!(class(x) == "dist")) x <- dist(x) if (!(class(y) == "dist")) y <- dist(y) x <- as.matrix(x) y <- as.matrix(y) dst <- TRUE n <- nrow(x) m <- nrow(y) if (n != m) stop("Sample sizes must agree") if (! (all(is.finite(c(x, y))))) stop("Data contains missing or infinite values") dims <- c(n, NCOL(x), NCOL(y), dst) idx <- 1:dims[1] DCOV <- numeric(4) a <- .C("dCOV", x = as.double(t(x)), y = as.double(t(y)), byrow = as.integer(TRUE), dims = as.integer(dims), index = as.double(index), idx = as.double(idx), DCOV = as.double(DCOV), PACKAGE = "energy") return(a$DCOV) } dcov <- function(x, y, index=1.0) { # distance correlation statistic for independence return(.dcov(x, y, index)[1]) } dcor <- function(x, y, index=1.0) { # distance correlation statistic for independence return(.dcov(x, y, index)[2]) } DCOR <- function(x, y, index=1.0) { # distance covariance and correlation statistics # alternate method, implemented in R without .C call # this method is usually slower than the C version if (!(class(x) == "dist")) x <- dist(x) if (!(class(y) == "dist")) y <- dist(y) x <- as.matrix(x) y <- as.matrix(y) n <- nrow(x) m <- nrow(y) if (n != m) stop("Sample sizes must agree") if (! (all(is.finite(c(x, y))))) stop("Data contains missing or infinite values") if (index < 0 || index > 2) { warning("index must be in [0,2), using default index=1") index=1.0} stat <- 0 dims <- c(n, ncol(x), ncol(y)) Akl <- function(x) { d <- as.matrix(x)^index m <- rowMeans(d) M <- mean(d) a <- sweep(d, 1, m) b <- sweep(a, 2, m) return(b + M) } A <- Akl(x) B <- Akl(y) dCov <- sqrt(mean(A * B)) dVarX <- sqrt(mean(A * A)) dVarY <- sqrt(mean(B * B)) V <- sqrt(dVarX * dVarY) if (V > 0) dCor <- dCov / V else dCor <- 0 return(list(dCov=dCov, dCor=dCor, dVarX=dVarX, dVarY=dVarY)) } energy/R/edist.R0000644000176200001440000000460712757710073013217 0ustar liggesusersedist <- function(x, sizes, distance = FALSE, ix = 1:sum(sizes), alpha = 1, method = c("cluster","discoB","discoF")) { # computes the e-dissimilarity matrix between k samples or clusters # x: pooled sample or Euclidean distances # sizes: vector of sample (cluster) sizes # distance: TRUE if x is a distance matrix, otherwise FALSE # ix: a permutation of row indices of x # alpha: distance exponent # method: cluster distances or disco statistics # k <- length(sizes) if (k == 1) return (as.dist(0.0)) if (k < 1) return (NA) e <- matrix(nrow=k, ncol=k) n <- cumsum(sizes) m <- 1 + c(0, n[1:(k-1)]) if (distance == FALSE) { if (is.vector(x)) x <- matrix(x, nrow = length(x), ncol = 1) dst <- as.matrix(dist(x)) } else dst <- as.matrix(x) if (alpha != 1) { if (alpha <= 0 || alpha > 2) warning("exponent alpha should be in (0,2]") dst <- dst^alpha } type <- match.arg(method) if (type == "cluster") { for (i in 1:(k - 1)) { e[i, i] <- 0.0 for (j in (i + 1):k) { n1 <- sizes[i] n2 <- sizes[j] ii <- ix[m[i]:n[i]] jj <- ix[m[j]:n[j]] w <- n1 * n2 / (n1 + n2) m11 <- sum(dst[ii, ii]) / (n1 * n1) m22 <- sum(dst[jj, jj]) / (n2 * n2) m12 <- sum(dst[ii, jj]) / (n1 * n2) e[i, j] <- e[j, i] <- w * ((m12 + m12) - (m11 + m22)) } } } if (type == "discoF" || type == "discoB") { #disco statistics for testing F=G for (i in 1:(k - 1)) { e[i, i] <- 0.0 for (j in (i + 1):k) { n1 <- sizes[i] n2 <- sizes[j] ii <- ix[m[i]:n[i]] jj <- ix[m[j]:n[j]] J <- c(ii,jj) d <- dst[J, J] N <- NROW(d) total <- sum(d) / (2*N) trt <- factor(c(rep(1,n1),rep(2,n2))) y <- as.vector(d[,1]) M <- model.matrix(y ~ 0 + trt) G <- t(M) %*% d %*% M withins <- diag(G) / (2*c(n1,n2)) W <- sum(withins) B <- total - W ifelse (type == "discoF", e[i,j] <- e[j,i] <- B / (W/(N-2)), e[i,j] <- e[j,i] <- B) } } } e <- as.dist(e) attr(e,"method") <- paste(method,": index= ", alpha) e } energy/R/Ecluster.R0000644000176200001440000000225112757710073013666 0ustar liggesusers energy.hclust <- function(dst, alpha = 1) { if (!(class(dst) == "dist")) stop("The first argument must be a dist object.") d <- dst n <- attr(d, "Size") if (!isTRUE(all.equal(alpha, 1))) { if (alpha > 2) warning("Exponent alpha should be in (0,2]") if (alpha < 0) stop("Cannot use negative exponent on distance.") d <- d^alpha } labels <- attr(d, "Labels") if (is.null(labels)) labels <- paste(1:n) merge <- integer(2 * (n - 1)) height <- double(n - 1) order <- integer(n) ecl <- .C("Emin_hclust", diss = as.double(d), en = as.integer(n), merge = as.integer(merge), height = as.double(height), order = as.integer(order), PACKAGE = "energy") merge <- matrix(ecl$merge, nrow = n - 1, ncol = 2) e <- list(merge = merge, height = ecl$height, order = ecl$order, labels = labels, method = "e-distance", call = match.call(), dist.method = attr(dst, "method")) class(e) <- "hclust" e } energy/R/pdcov-test.R0000644000176200001440000000441612757710073014175 0ustar liggesuserspdcov.test <- function(x, y, z, R) { ## x, y, z must be dist. objects or data matrices (no dist matrix) if (!(class(x) == "dist")) x <- dist(x) if (!(class(y) == "dist")) y <- dist(y) if (!(class(z) == "dist")) z <- dist(z) Dx <- as.matrix(x) Dy <- as.matrix(y) Dz <- as.matrix(z) n <- nrow(Dx) Pxz <- projection(Dx, Dz) #U-center and compute projections Pyz <- projection(Dy, Dz) #PxzU <- U_center(Pxz) #not necessary, because of invariance #PyzU <- U_center(Pyz) teststat <- n * U_product(Pxz, Pyz) ## calc. pdcor den <- sqrt(U_product(Pxz, Pxz) * U_product(Pyz, Pyz)) if (den > 0.0) { estimate <- teststat / (n * den) } else estimate <- 0.0 bootfn <- function(Pxz, i, Pyz) { # generate the permutation replicates of dcovU(Pxz, Pyz) # PxzU and PyzU are the U-centered matrices U_product(Pxz[i, i], Pyz) #RcppExports } reps <- replicate(R, expr= { i <- sample(1:n) bootfn(Pxz, i, Pyz=Pyz) }) replicates <- n * reps pval <- (1 + sum(replicates > teststat)) / (1 + R) #df <- n * (n-3) / 2 - 2 dataname <- paste("replicates ", R, sep="") names(estimate) <- "pdcor" names(teststat) <- "n V^*" e <- list( call = match.call(), method = paste("pdcov test", sep = ""), statistic = teststat, estimate = estimate, p.value = pval, n = n, replicates = replicates, data.name = dataname) class(e) <- "htest" return(e) } pdcor.test <- function(x, y, z, R) { ## x, y, z must be dist. objects or data matrices (no dist matrix) ## all required calc. done in pdcov.test result <- pdcov.test(x, y, z, R) reps <- result$replicates teststat <- result$estimate estimate <- result$estimate n <- result$n if (estimate > 0.0) { nRootV <- result$statistic / result$estimate pdcor_reps <- reps / nRootV } else { pdcor_reps <- reps } names(estimate) <- names(teststat) <- "pdcor" pval <- (1 + sum(pdcor_reps > teststat)) / (1 + R) e <- list( call = match.call(), method = paste("pdcor test", sep = ""), statistic = teststat, estimate = estimate, p.value = pval, n = n, replicates = pdcor_reps, data.name = result$dataname) class(e) <- "htest" return(e) } energy/R/disco.R0000644000176200001440000001450513145560515013202 0ustar liggesusers ### disco tests - implementation of DIStance COmponents methods in: ### ### Rizzo, M.L. and Szekely, G.J. (2010) "DISCO Analysis: A Nonparametric ### Extension of Analysis of Variance, Annals of Applied Statistics ### Vol. 4, No. 2, 1034-1055. ### ### Sept 2010 parts of disco package merged into energy package ### this release supports one way models ### this version does not use the C library ### ### disco: computes the decomposition and test using F ratio ### disco.between: statistic and test using between component ### .disco1: internal computations for one factor ### .disco1stat, .disco1Bstat: internal for boot function ### ### disco <- function(x, factors, distance = FALSE, index = 1, R, method = c("disco", "discoB", "discoF")) { ## x is response or Euclidean distance matrix or dist() object factors ## is a matrix or data frame of group labels distance=TRUE if x is ## distance, otherwise FALSE index is the exponent on distance, in (0,2] ## R is number of replicates for test method: use F ratio (default) or ## between component (discoB) disco method is currently alias for discoF method <- match.arg(method) factors <- data.frame(factors) if (method == "discoB") return(disco.between(x, factors = factors, distance = distance, index = index, R = R)) nfactors <- NCOL(factors) if (distance) dst <- as.matrix(x) else dst <- as.matrix(dist(x)) N <- NROW(dst) if (NCOL(dst) != N) stop("distance==TRUE but first argument is not distance") if (!isTRUE(all.equal(index, 1))) dst <- dst^index stats <- matrix(0, nfactors, 6) colnames(stats) <- c("Trt", "Within", "df1", "df2", "Stat", "p-value") for (j in 1:nfactors) { trt <- factors[, j] stats[j, 1:4] <- .disco1(trt = trt, dst = dst) if (R > 0) { b <- boot::boot(data = dst, statistic = .disco1stat, sim = "permutation", R = R, trt = trt) stats[j, 5] <- b$t0 stats[j, 6] <- (sum(b$t > b$t0) + 1)/(R + 1) } else { stats[j, 5] <- .disco1stat(dst, i = 1:nrow(dst), trt = trt) stats[j, 6] <- NA } } methodname <- "DISCO (F ratio)" dataname <- deparse(substitute(x)) total <- sum(stats[1, 1:2]) within <- total - sum(stats[, 1]) Df.trt <- stats[, 3] factor.names <- names(factors) factor.levels <- sapply(factors, nlevels) sizes <- sapply(factors, tabulate) e <- list(call = match.call(), method = methodname, statistic = stats[, 5], p.value = stats[, 6], k = nfactors, N = N, between = stats[, 1], withins = stats[, 2], within = within, total = total, Df.trt = Df.trt, Df.e = nrow(dst) - sum(Df.trt) - 1, index = index, factor.names = factor.names, factor.levels = factor.levels, sample.sizes = sizes, stats = stats) class(e) <- "disco" e } disco.between <- function(x, factors, distance = FALSE, index = 1, R) { ## disco test based on the between-sample component similar to disco ## except that 'disco' test is based on the F ratio disco.between test ## for one factor (balanced) is asymptotically equivalent to k-sample E ## test (test statistics are proportional in that case but not in ## general). x is response or Euclidean distance matrix or dist() ## object factors is a matrix or data frame of group labels ## distance=TRUE if x is distance, otherwise FALSE index is the exponent ## on distance, in (0,2] factors <- data.frame(factors) nfactors <- NCOL(factors) if (nfactors > 1) stop("More than one factor is not implemented in disco.between") if (distance) dst <- as.matrix(x) else dst <- as.matrix(dist(x)) N <- NROW(dst) if (NCOL(dst) != N) stop("distance==TRUE but first argument is not distance") if (!isTRUE(all.equal(index, 1))) dst <- dst^index trt <- factors[, 1] if (R > 0) { b <- boot::boot(data = dst, statistic = .disco1Bstat, sim = "permutation", R = R, trt = trt) between <- b$t0 reps <- b$t pval <- mean(reps >= between) } else { between <- .disco1Bstat(dst, i = 1:nrow(dst), trt = trt) pval <- NA } if (R == 0) return(between) methodname <- "DISCO (Between-sample)" dataname <- deparse(substitute(x)) names(between) <- "DISCO between statistic" e <- list(call = match.call(), method = methodname, statistic = between, p.value = pval, data.name = dataname) class(e) <- "htest" e } .disco1 <- function(trt, dst) { ## dst is Euclidean distance matrix or power of it trt is the treatment, ## a factor trt <- factor(trt) k <- nlevels(trt) n <- tabulate(trt) N <- sum(n) total <- sum(dst)/(2 * N) y <- as.vector(dst[, 1]) M <- model.matrix(y ~ 0 + trt) G <- t(M) %*% dst %*% M withins <- diag(G)/(2 * n) W <- sum(withins) B <- total - W c(B, W, k - 1, N - k) } .disco1stat <- function(dst, i, trt) { ## i is permuation vector supplied by bootstrap dst is Euclidean ## distance matrix or power of it trt is the treatment, a factor returns ## the disco 'F' ratio idx <- 1:nrow(dst) d <- .disco1(trt = trt[idx[i]], dst = dst) statistic <- (d[1]/d[3])/(d[2]/d[4]) } .disco1Bstat <- function(dst, i, trt) { ## i is permuation vector supplied by bootstrap dst is Euclidean ## distance matrix or power of it trt is the treatment, a factor returns ## the between-sample component (for one factor) idx <- 1:nrow(dst) .disco1(trt = trt[idx[i]], dst = dst)[1] } print.disco <- function(x, ...) { k <- x$k md1 <- x$between/x$Df.trt md2 <- x$within/x$Df.e f0 <- x$statistic print(x$call) cat(sprintf("\nDistance Components: index %5.2f\n", x$index)) cat(sprintf("%-20s %4s %10s %10s %10s %10s\n", "Source", "Df", "Sum Dist", "Mean Dist", "F-ratio", "p-value")) for (i in 1:k) { fname <- x$factor.names[i] cat(sprintf("%-20s %4d %10.5f %10.5f %10.3f %10s\n", fname, x$Df.trt[i], x$between[i], md1[i], f0[i], format.pval(x$p.value[i]))) } cat(sprintf("%-20s %4d %10.5f %10.5f\n", "Within", x$Df.e, x$within, md2)) cat(sprintf("%-20s %4d %10.5f\n", "Total", x$N - 1, x$total)) } energy/R/pdcor.R0000644000176200001440000000106312757710073013207 0ustar liggesusers## pdcor.R ## ## pdcor <- function(x, y, z) { if (!(class(x) == "dist")) x <- dist(x) if (!(class(y) == "dist")) y <- dist(y) if (!(class(z) == "dist")) z <- dist(z) x <- as.matrix(x) y <- as.matrix(y) z <- as.matrix(z) partial_dcor(x, y, z)["pdcor"] } pdcov <- function(x, y, z) { if (!(class(x) == "dist")) x <- dist(x) if (!(class(y) == "dist")) y <- dist(y) if (!(class(z) == "dist")) z <- dist(z) x <- as.matrix(x) y <- as.matrix(y) z <- as.matrix(z) partial_dcov(x, y, z) } energy/R/Eindep.R0000644000176200001440000001026313144320252013271 0ustar liggesusersindep.test<- function(x, y, method = c("dcov","mvI"), index = 1, R) { # two energy tests for multivariate independence type <- match.arg(method) if (type == "dcov") return(dcov.test(x, y, index, R)) else if (type == "mvI") return(mvI.test(x, y, R)) } mvI <- function(x, y) { # energy statistic for multivariate independence # returns dependence coefficient I_n x <- as.matrix(x) y <- as.matrix(y) n <- nrow(x) m <- nrow(y) if (n != m || n < 2) stop("Sample sizes must agree") if (! (all(is.finite(c(x, y))))) stop("Data contains missing or infinite values") stat <- 0 dims <- c(n, ncol(x), ncol(y)) e <- .C("indepE", x = as.double(t(x)), y = as.double(t(y)), byrow = as.integer(TRUE), dims = as.integer(dims), stat = as.double(stat), PACKAGE = "energy") sqrt(e$stat) } mvI.test<- function(x, y, R) { # energy test for multivariate independence x <- as.matrix(x) y <- as.matrix(y) n <- nrow(x) m <- nrow(y) if (n != m || n < 2) stop("Sample sizes must agree") if (! (all(is.finite(c(x, y))))) stop("Data contains missing or infinite values") stat <- reps <- 0 if (R > 0) reps <- rep(0, R) pval <- 1 dims <- c(n, ncol(x), ncol(y), R) a <- .C("indepEtest", x = as.double(t(x)), y = as.double(t(y)), byrow = as.integer(TRUE), dims = as.integer(dims), stat = as.double(stat), reps = as.double(reps), pval = as.double(pval), PACKAGE = "energy") stat <- n*a$stat est <- sqrt(a$stat) names(est) <- "I" names(stat) <- "nI^2" dataname <- paste("x (",n," by ",ncol(x), "), y(",n," by ", ncol(y), "), replicates ", R, sep="") if (R > 0) p.value = a$pval else p.value = NA e <- list( method = "mvI energy test of independence", statistic = stat, estimate = est, replicates = n*reps, p.value = p.value, data.name = dataname) class(e) <- "htest" e } indep.e<- function(x, y) { # energy statistic for multivariate independence (deprecated) .Deprecated(new = "mvI", package = "energy") x <- as.matrix(x) y <- as.matrix(y) n <- nrow(x) m <- nrow(y) if (n != m || n < 2) stop("Sample sizes must agree") if (! (all(is.finite(c(x, y))))) stop("Data contains missing or infinite values") stat <- 0 dims <- c(n, ncol(x), ncol(y)) e <- .C("indepE", x = as.double(t(x)), y = as.double(t(y)), byrow = as.integer(TRUE), dims = as.integer(dims), stat = as.double(stat), PACKAGE = "energy") sqrt(e$stat) } indep.etest<- function(x, y, R) { # energy test for multivariate independence (deprecated) .Deprecated(new = "indep.test", package = "energy", msg = "indep.etest will become defunct in future release. Use indep.test with method mvI.") x <- as.matrix(x) y <- as.matrix(y) n <- nrow(x) m <- nrow(y) if (n != m || n < 2) stop("Sample sizes must agree") if (! (all(is.finite(c(x, y))))) stop("Data contains missing or infinite values") stat <- reps <- 0 if (R > 0) reps <- rep(0, R) pval <- 1 dims <- c(n, ncol(x), ncol(y), R) a <- .C("indepEtest", x = as.double(t(x)), y = as.double(t(y)), byrow = as.integer(TRUE), dims = as.integer(dims), stat = as.double(stat), reps = as.double(reps), pval = as.double(pval), PACKAGE = "energy") stat <- sqrt(a$stat) names(stat) <- "I" dataname <- paste("x (",n," by ",ncol(x), "), y(",n," by ", ncol(y), "), replicates ", R, sep="") if (R > 0) p.value <- a$pval else p.value <- NA e <- list( method = paste("Energy test of independence", sep = ""), statistic = stat, p.value = p.value, data.name = dataname) class(e) <- "htest" e } energy/MD50000644000176200001440000000452313156600704012061 0ustar liggesusers65a6927f246f71c78ca2940088747d97 *DESCRIPTION 06ac70e0e00da5ebc888991d170fedc9 *NAMESPACE 6ca0ab4c1370e6aff3a350cf6b1debb4 *NEWS 60c944921e1c68681babcce658ef512c *R/Ecluster.R 8b428dedf82ffb7e15422ff9f49addf4 *R/Eeqdist.R 9af28751dd6b8ed9afdc45ed3fec5cff *R/Eindep.R a0d3a418969b10f0312159869dfae2ae *R/Emvnorm.R ee422cbbaab6e95389cbf3c90db51441 *R/Epoisson.R df14913c2355403486761709b1d0571e *R/RcppExports.R f09a74b96ef85633d5a8f3a875ef84db *R/centering.R 0c3566c2bc56e52b05e733e2ef9e6c69 *R/dcorT.R ae0b5579d71db5b253d1ece0801b02fa *R/dcov.R aa0cd56cb4314d09fa6fe7381e2732f0 *R/dcovu.R d0e402aef2cafec0f7fad15a91e6835e *R/disco.R ab6155ea80b037f78dfbee4e179d4c4f *R/edist.R b59b194449218e1668ddbdc5811d010c *R/pdcor.R 99d4997266887901ee6a62763e5ecfb8 *R/pdcov-test.R 5bb9ea0c6b352a1e910817d4a2b2a997 *man/U_product.Rd 0106d2186ba21e28b9eef5f747c224bb *man/centering.Rd 21887eadf04293b14c48b56186a9a7ab *man/dcor.ttest.Rd c3c4f6a98c07966047d0114a96dbebc4 *man/dcov.Rd 7abdb2f8b9fff293f984ba7c9e4b1f61 *man/dcov.test.Rd 20cc1dbf1690caba9464df08f1b542d5 *man/dcovu.Rd 681fa084592005760e5fd97e550c70e6 *man/disco.Rd bc9faea760233d2ff7c38702cf061d4c *man/edist.Rd e4429a99d2add1f38a742331f6d578d9 *man/energy-deprecated.Rd 97bdc56c7e04ba90c96f18ab4e9311b9 *man/energy-package.Rd b918f4703f7ff7a298dce7ed3e324477 *man/energy.hclust.Rd 296495b40f99e3d4264f8285f2072ac4 *man/eqdist.etest.Rd ab71ed8778965add0a0a215d3e6e177e *man/indep.test.Rd f0a60791a303c425609a68651c549e26 *man/mvI.test.Rd ec44802e62875b25b1b167756d09075a *man/mvnorm-etest.Rd 6e62676733a8a8cdbaed67b8d0c00048 *man/pdcor.Rd ee92472463df0214c11d4e2d825987bc *man/poisson.mtest.Rd 727842fcced81550ab308552381b5b1b *src/ECl.cc 60f2661bf08b2295e345cf799b8bb28e *src/ECl.h f93deb94d53b0be60435ac02150a8009 *src/Ecluster.cc 2a0d7e13abeedc9d58e8c8b438d93f50 *src/Eindep.c ca127e0cff65055beff4b60d5fb5b8d3 *src/RcppExports.cpp c560954ffd313a1b0aeb9ede7f73c4d1 *src/U-product.cpp 932a45c4fe940a27be100c2c5ca3468c *src/centering.cpp da0b2398098bc08ae54f647b29cf60b5 *src/dcov.c fe3940cdfcef45def8b3aec42b66c1cd *src/dcovU.cpp aff0782a38beac801051d6f741566a1a *src/energy.c 345eebf91336a105c74f7c99a4c321a5 *src/energy_init.c 663ee0e9767da39329a65a8bd86b7134 *src/mvnorm.cpp 6f5eba31cc00fe9daecaabfcfa182975 *src/partial-dcor.cpp 991223944feae331d405ae200899f71a *src/projection.cpp 9ad5c89ba30013de49a6cd0fd94c7396 *src/utilities.c energy/DESCRIPTION0000644000176200001440000000213513156600704013254 0ustar liggesusersPackage: energy Title: E-Statistics: Multivariate Inference via the Energy of Data Version: 1.7-2 Date: 2017-09-14 Author: Maria L. Rizzo and Gabor J. Szekely Description: E-statistics (energy) tests and statistics for multivariate and univariate inference, including distance correlation, one-sample, two-sample, and multi-sample tests for comparing multivariate distributions, are implemented. Measuring and testing multivariate independence based on distance correlation, partial distance correlation, multivariate goodness-of-fit tests, clustering based on energy distance, testing for multivariate normality, distance components (disco) for non-parametric analysis of structured data, and other energy statistics/methods are implemented. Maintainer: Maria Rizzo Imports: Rcpp (>= 0.12.6), stats, boot LinkingTo: Rcpp Suggests: MASS URL: https://github.com/mariarizzo/energy License: GPL (>= 2) NeedsCompilation: yes Repository: CRAN Packaged: 2017-09-14 17:47:42 UTC; Maria Date/Publication: 2017-09-14 22:25:08 UTC energy/man/0000755000176200001440000000000013144312721012314 5ustar liggesusersenergy/man/dcov.Rd0000644000176200001440000001271212757710073013554 0ustar liggesusers\name{distance correlation} \alias{dcor} \alias{dcov} \alias{DCOR} \title{ Distance Correlation and Covariance Statistics} \description{ Computes distance covariance and distance correlation statistics, which are multivariate measures of dependence. } \usage{ dcov(x, y, index = 1.0) dcor(x, y, index = 1.0) DCOR(x, y, index = 1.0) } \arguments{ \item{x}{ data or distances of first sample} \item{y}{ data or distances of second sample} \item{index}{ exponent on Euclidean distance, in (0,2]} } \details{ \code{dcov} and \code{dcor} or \code{DCOR} compute distance covariance and distance correlation statistics. \code{DCOR} is a self-contained R function returning a list of statistics. \code{dcor} execution is faster than \code{DCOR} (see examples). The sample sizes (number of rows) of the two samples must agree, and samples must not contain missing values. Arguments \code{x}, \code{y} can optionally be \code{\link{dist}} objects; otherwise these arguments are treated as data. Distance correlation is a new measure of dependence between random vectors introduced by Szekely, Rizzo, and Bakirov (2007). For all distributions with finite first moments, distance correlation \eqn{\mathcal R}{R} generalizes the idea of correlation in two fundamental ways: (1) \eqn{\mathcal R(X,Y)}{R(X,Y)} is defined for \eqn{X} and \eqn{Y} in arbitrary dimension. (2) \eqn{\mathcal R(X,Y)=0}{R(X,Y)=0} characterizes independence of \eqn{X} and \eqn{Y}. Distance correlation satisfies \eqn{0 \le \mathcal R \le 1}{0 \le R \le 1}, and \eqn{\mathcal R = 0}{R = 0} only if \eqn{X} and \eqn{Y} are independent. Distance covariance \eqn{\mathcal V}{V} provides a new approach to the problem of testing the joint independence of random vectors. The formal definitions of the population coefficients \eqn{\mathcal V}{V} and \eqn{\mathcal R}{R} are given in (SRB 2007). The definitions of the empirical coefficients are as follows. The empirical distance covariance \eqn{\mathcal{V}_n(\mathbf{X,Y})}{V_n(X,Y)} with index 1 is the nonnegative number defined by \deqn{ \mathcal{V}^2_n (\mathbf{X,Y}) = \frac{1}{n^2} \sum_{k,\,l=1}^n A_{kl}B_{kl} }{ V^2_n (X,Y) = (1/n^2) sum_{k,l=1:n} A_{kl}B_{kl} } where \eqn{A_{kl}} and \eqn{B_{kl}} are \deqn{ A_{kl} = a_{kl}-\bar a_{k.}- \bar a_{.l} + \bar a_{..} } \deqn{ B_{kl} = b_{kl}-\bar b_{k.}- \bar b_{.l} + \bar b_{..}. } Here \deqn{ a_{kl} = \|X_k - X_l\|_p, \quad b_{kl} = \|Y_k - Y_l\|_q, \quad k,l=1,\dots,n, }{ a_{kl} = ||X_k - X_l||_p, b_{kl} = ||Y_k - Y_l||_q, k,l=1,\dots,n, } and the subscript \code{.} denotes that the mean is computed for the index that it replaces. Similarly, \eqn{\mathcal{V}_n(\mathbf{X})}{V_n(X)} is the nonnegative number defined by \deqn{ \mathcal{V}^2_n (\mathbf{X}) = \mathcal{V}^2_n (\mathbf{X,X}) = \frac{1}{n^2} \sum_{k,\,l=1}^n A_{kl}^2. }{ V^2_n (X) = V^2_n (X,X) = (1/n^2) sum_{k,l=1:n} A_{kl}^2. } The empirical distance correlation \eqn{\mathcal{R}_n(\mathbf{X,Y})}{R(\mathbf{X,Y})} is the square root of \deqn{ \mathcal{R}^2_n(\mathbf{X,Y})= \frac {\mathcal{V}^2_n(\mathbf{X,Y})} {\sqrt{ \mathcal{V}^2_n (\mathbf{X}) \mathcal{V}^2_n(\mathbf{Y})}}. }{ R^2_n(X,Y)= V^2_n(X,Y) / sqrt(V^2_n (X) V^2_n(Y)). } See \code{\link{dcov.test}} for a test of multivariate independence based on the distance covariance statistic. } \value{ \code{dcov} returns the sample distance covariance and \code{dcor} returns the sample distance correlation. \code{DCOR} returns a list with elements \item{dCov}{sample distance covariance} \item{dCor}{sample distance correlation} \item{dVarX}{distance variance of x sample} \item{dVarY}{distance variance of y sample} } \note{ Two methods of computing the statistics are provided. \code{DCOR} is a stand-alone R function that returns a list of statistics. \code{dcov} and \code{dcor} provide R interfaces to the C implementation, which is usually faster. \code{dcov} and \code{dcor} call an internal function \code{.dcov}. Note that it is inefficient to compute dCor by: square root of \code{dcov(x,y)/sqrt(dcov(x,x)*dcov(y,y))} because the individual calls to \code{dcov} involve unnecessary repetition of calculations. For this reason, \code{DCOR} computes and returns all four statistics. } \seealso{ \code{\link{bcdcor}} \code{\link{dcovU}} \code{\link{pdcor}} \code{\link{dcov.test}} \code{\link{dcor.ttest}} \code{\link{pdcor.test}} } \references{ Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007), Measuring and Testing Dependence by Correlation of Distances, \emph{Annals of Statistics}, Vol. 35 No. 6, pp. 2769-2794. \cr \url{http://dx.doi.org/10.1214/009053607000000505} Szekely, G.J. and Rizzo, M.L. (2009), Brownian Distance Covariance, \emph{Annals of Applied Statistics}, Vol. 3, No. 4, 1236-1265. \cr \url{http://dx.doi.org/10.1214/09-AOAS312} Szekely, G.J. and Rizzo, M.L. (2009), Rejoinder: Brownian Distance Covariance, \emph{Annals of Applied Statistics}, Vol. 3, No. 4, 1303-1308. } \author{ Maria L. Rizzo \email{mrizzo @ bgsu.edu} and Gabor J. Szekely } \examples{ x <- iris[1:50, 1:4] y <- iris[51:100, 1:4] dcov(x, y) dcov(dist(x), dist(y)) #same thing ## C implementation dcov(x, y, 1.5) dcor(x, y, 1.5) ## R implementation DCOR(x, y, 1.5) } \keyword{ multivariate } \concept{ independence } \concept{ distance correlation } \concept{ distance covariance } \concept{ energy statistics } energy/man/dcovu.Rd0000644000176200001440000000537712757710073013752 0ustar liggesusers\name{Unbiased distance covariance} \alias{bcdcor} \alias{dcovU} \alias{dcovU_stats} \title{Unbiased dcov and bias-corrected dcor statistics} \description{ These functions compute unbiased estimators of squared distance covariance, distance variance, and a bias-corrected estimator of (squared) distance correlation. } \usage{ bcdcor(x, y) dcovU(x, y) dcovU_stats(Dx, Dy) } \arguments{ \item{x}{ data or dist object of first sample} \item{y}{ data or dist object of second sample} \item{Dx}{ distance matrix of first sample} \item{Dy}{ distance matrix of second sample} } \details{ The unbiased (squared) dcov is inner product definition of dCov, in the Hilbert space of U-centered distance matrices. The sample sizes (number of rows) of the two samples must agree, and samples must not contain missing values. Arguments \code{x}, \code{y} can optionally be \code{\link{dist}} objects; otherwise these arguments are treated as data. } \value{ \code{dcovU} returns the unbiased estimator of squared dcov. \code{bcdcor} returns a bias-corrected estimator of squared dcor. \code{dcovU_stats} returns a vector of the components of bias-corrected dcor: [dCovU, bcdcor, dVarXU, dVarYU]. } \note{ Unbiased distance covariance (SR2014) corresponds to the biased (original) \eqn{\mathrm{dCov^2}}{dCov^2}. Since \code{dcovU} is an unbiased statistic, it is signed and we do not take the square root. For the original distance covariance test of independence (SRB2007, SR2009), the distance covariance test statistic is the V-statistic \eqn{\mathrm{n\, dCov^2} = n \mathcal{V}_n^2}{n V_n^2} (not dCov). Similarly, \code{bcdcor} is bias-corrected, so we do not take the square root as with dCor. } \references{ Szekely, G.J. and Rizzo, M.L. (2014), Partial Distance Correlation with Methods for Dissimilarities. \emph{Annals of Statistics}, Vol. 42 No. 6, 2382-2412. Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007), Measuring and Testing Dependence by Correlation of Distances, \emph{Annals of Statistics}, Vol. 35 No. 6, pp. 2769-2794. \cr \url{http://dx.doi.org/10.1214/009053607000000505} Szekely, G.J. and Rizzo, M.L. (2009), Brownian Distance Covariance, \emph{Annals of Applied Statistics}, Vol. 3, No. 4, 1236-1265. \cr \url{http://dx.doi.org/10.1214/09-AOAS312} } \author{ Maria L. Rizzo \email{mrizzo @ bgsu.edu} and Gabor J. Szekely } \examples{ x <- iris[1:50, 1:4] y <- iris[51:100, 1:4] Dx <- as.matrix(dist(x)) Dy <- as.matrix(dist(y)) dcovU(x, y) bcdcor(x, y) dcovU_stats(Dx, Dy) } \keyword{ multivariate } \keyword{ nonparametric } \concept{ independence } \concept{ multivariate } \concept{ distance correlation } \concept{ distance covariance } \concept{ energy statistics } energy/man/eqdist.etest.Rd0000644000176200001440000001375413144317640015236 0ustar liggesusers\name{eqdist.etest} \alias{eqdist.etest} \alias{eqdist.e} \alias{ksample.e} \title{Multisample E-statistic (Energy) Test of Equal Distributions} \description{ Performs the nonparametric multisample E-statistic (energy) test for equality of multivariate distributions. } \usage{ eqdist.etest(x, sizes, distance = FALSE, method=c("original","discoB","discoF"), R) eqdist.e(x, sizes, distance = FALSE, method=c("original","discoB","discoF")) ksample.e(x, sizes, distance = FALSE, method=c("original","discoB","discoF"), ix = 1:sum(sizes)) } \arguments{ \item{x}{ data matrix of pooled sample} \item{sizes}{ vector of sample sizes} \item{distance}{logical: if TRUE, first argument is a distance matrix} \item{method}{ use original (default) or distance components (discoB, discoF)} \item{R}{ number of bootstrap replicates } \item{ix}{ a permutation of the row indices of x } } \details{ The k-sample multivariate \eqn{\mathcal{E}}{E}-test of equal distributions is performed. The statistic is computed from the original pooled samples, stacked in matrix \code{x} where each row is a multivariate observation, or the corresponding distance matrix. The first \code{sizes[1]} rows of \code{x} are the first sample, the next \code{sizes[2]} rows of \code{x} are the second sample, etc. The test is implemented by nonparametric bootstrap, an approximate permutation test with \code{R} replicates. The function \code{eqdist.e} returns the test statistic only; it simply passes the arguments through to \code{eqdist.etest} with \code{R = 0}. The k-sample multivariate \eqn{\mathcal{E}}{E}-statistic for testing equal distributions is returned. The statistic is computed from the original pooled samples, stacked in matrix \code{x} where each row is a multivariate observation, or from the distance matrix \code{x} of the original data. The first \code{sizes[1]} rows of \code{x} are the first sample, the next \code{sizes[2]} rows of \code{x} are the second sample, etc. The two-sample \eqn{\mathcal{E}}{E}-statistic proposed by Szekely and Rizzo (2004) is the e-distance \eqn{e(S_i,S_j)}, defined for two samples \eqn{S_i, S_j} of size \eqn{n_i, n_j} by \deqn{e(S_i,S_j)=\frac{n_i n_j}{n_i+n_j}[2M_{ij}-M_{ii}-M_{jj}], }{e(S_i, S_j) = (n_i n_j)(n_i+n_j)[2M_(ij)-M_(ii)-M_(jj)],} where \deqn{M_{ij}=\frac{1}{n_i n_j}\sum_{p=1}^{n_i} \sum_{q=1}^{n_j} \|X_{ip}-X_{jq}\|,}{ M_{ij} = 1/(n_i n_j) sum[1:n_i, 1:n_j] ||X_(ip) - X_(jq)||,} \eqn{\|\cdot\|}{|| ||} denotes Euclidean norm, and \eqn{X_{ip}}{ X_(ip)} denotes the p-th observation in the i-th sample. The original (default method) k-sample \eqn{\mathcal{E}}{E}-statistic is defined by summing the pairwise e-distances over all \eqn{k(k-1)/2} pairs of samples: \deqn{\mathcal{E}=\sum_{1 \leq i < j \leq k} e(S_i,S_j). }{\emph{E} = sum[i=0] (\hat F(j) - F(j; \hat \lambda))^2 f(j; \hat \lambda).} The test is implemented by parametric bootstrap with \code{R} replicates. } \value{ The function \code{poisson.m} returns the test statistic. The function \code{poisson.mtest} returns a list with class \code{htest} containing \item{method}{Description of test} \item{statistic}{observed value of the test statistic} \item{p.value}{approximate p-value of the test} \item{data.name}{description of data} \item{estimate}{sample mean} } \references{ Szekely, G. J. and Rizzo, M. L. (2004) Mean Distance Test of Poisson Distribution, \emph{Statistics and Probability Letters}, 67/3, 241-247. \url{http://dx.doi.org/10.1016/j.spl.2004.01.005}. } \author{ Maria L. Rizzo \email{mrizzo @ bgsu.edu} and Gabor J. Szekely } \examples{ x <- rpois(20, 1) poisson.m(x) poisson.mtest(x, R = 199) } \keyword{ htest } energy/man/energy-deprecated.Rd0000644000176200001440000000412013144354300016166 0ustar liggesusers\name{indep.etest} \alias{indep.e} \alias{indep.etest} \title{ Energy Statistic Test of Independence} \description{Deprecated: use \code{indep.test} with \code{method = mvI}. Computes a multivariate nonparametric E-statistic and test of independence.} \usage{ indep.e(x, y) indep.etest(x, y, R) } \arguments{ \item{x}{ matrix: first sample, observations in rows} \item{y}{ matrix: second sample, observations in rows} \item{R}{ number of replicates} } \details{ Computes the coefficient \eqn{\mathcal I}{I_n} and performs a nonparametric \eqn{\mathcal E}{E}-test of independence. The test decision is obtained via bootstrap, with \code{R} replicates. The sample sizes (number of rows) of the two samples must agree, and samples must not contain missing values. The statistic \eqn{\mathcal E = n \mathcal I^2}{E = I^2} is a ratio of V-statistics based on interpoint distances \eqn{\|x_{i}-y_{j}\|}{||x_{i}-y_{j}||}. See the reference below for details. } \value{ The sample coefficient \eqn{\mathcal I}{I} is returned by \code{indep.e}. The function \code{indep.etest} returns a list with class \code{htest} containing \item{method}{description of test} \item{statistic}{observed value of the coefficient \eqn{\mathcal I}{I}} \item{p.value}{approximate p-value of the test} \item{data.name}{description of data} } \references{ Bakirov, N.K., Rizzo, M.L., and Szekely, G.J. (2006), A Multivariate Nonparametric Test of Independence, \emph{Journal of Multivariate Analysis} 93/1, 58-80, \cr \url{http://dx.doi.org/10.1016/j.jmva.2005.10.005} } \author{ Maria L. Rizzo \email{mrizzo @ bgsu.edu} and Gabor J. Szekely } \examples{ \dontrun{ ## independent univariate data x <- sin(runif(30, 0, 2*pi) * 2) y <- sin(runif(30, 0, 2*pi) * 4) indep.etest(x, y, R = 99) ## dependent multivariate data Sigma <- matrix(c(1, .1, 0, 0 , 1, 0, 0 ,.1, 1), 3, 3) x <- mvrnorm(30, c(0, 0, 0), diag(3)) y <- mvrnorm(30, c(0, 0, 0), Sigma) * x indep.etest(x, y, R = 99) } } \keyword{ htest } \keyword{ multivariate } \concept{ energy statistics } energy/man/mvnorm-etest.Rd0000644000176200001440000000604513144317600015251 0ustar liggesusers\name{mvnorm.etest} \alias{mvnorm.etest} \alias{mvnorm.e} \alias{normal.e} \title{E-statistic (Energy) Test of Multivariate Normality} \description{ Performs the E-statistic (energy) test of multivariate or univariate normality. } \usage{ mvnorm.etest(x, R) mvnorm.e(x) normal.e(x) } \arguments{ \item{x}{ data matrix of multivariate sample, or univariate data vector} \item{R}{ number of bootstrap replicates } } \details{ If \code{x} is a matrix, each row is a multivariate observation. The data will be standardized to zero mean and identity covariance matrix using the sample mean vector and sample covariance matrix. If \code{x} is a vector, the univariate statistic \code{normal.e(x)} is returned. If the data contains missing values or the sample covariance matrix is singular, NA is returned. The \eqn{\mathcal{E}}{E}-test of multivariate normality was proposed and implemented by Szekely and Rizzo (2005). The test statistic for d-variate normality is given by \deqn{\mathcal{E} = n (\frac{2}{n} \sum_{i=1}^n E\|y_i-Z\| - E\|Z-Z'\| - \frac{1}{n^2} \sum_{i=1}^n \sum_{j=1}^n \|y_i-y_j\|), }{E = n((2/n) sum[1:n] E||y_i-Z|| - E||Z-Z'|| - (1/n^2) sum[1:n,1:n] ||y_i-y_j||),} where \eqn{y_1,\ldots,y_n} is the standardized sample, \eqn{Z, Z'} are iid standard d-variate normal, and \eqn{\| \cdot \|}{|| ||} denotes Euclidean norm. The \eqn{\mathcal{E}}{E}-test of multivariate (univariate) normality is implemented by parametric bootstrap with \code{R} replicates. If \code{R=0} the summary for the test gives the test statistic only (no p-value). } \value{ The value of the \eqn{\mathcal{E}}{E}-statistic for univariate normality is returned by \code{normal.e}. The value of the \eqn{\mathcal{E}}{E}-statistic for multivariate normality is returned by \code{mvnorm.e}. \code{mvnorm.etest} returns a list with class \code{htest} containing \item{method}{description of test} \item{statistic}{observed value of the test statistic} \item{p.value}{approximate p-value of the test} \item{data.name}{description of data} } \references{ Szekely, G. J. and Rizzo, M. L. (2005) A New Test for Multivariate Normality, \emph{Journal of Multivariate Analysis}, 93/1, 58-80, \url{http://dx.doi.org/10.1016/j.jmva.2003.12.002}. Rizzo, M. L. (2002). A New Rotation Invariant Goodness-of-Fit Test, Ph.D. dissertation, Bowling Green State University. Szekely, G. J. (1989) Potential and Kinetic Energy in Statistics, Lecture Notes, Budapest Institute of Technology (Technical University). } \author{ Maria L. Rizzo \email{mrizzo @ bgsu.edu} and Gabor J. Szekely } \examples{ ## compute normality test statistics for iris Setosa data data(iris) mvnorm.e(iris[1:50, 1:4]) normal.e(iris[1:50, 1]) ## test if the iris Setosa data has multivariate normal distribution mvnorm.etest(iris[1:50,1:4], R = 199) ## test a univariate sample for normality x <- runif(50, 0, 10) mvnorm.etest(x, R = 199) } \keyword{ multivariate } \keyword{ htest } \concept{ energy statistics } energy/man/indep.test.Rd0000644000176200001440000001115213144320271014657 0ustar liggesusers\name{indep.test} \alias{indep.test} \title{ Energy-tests of Independence} \description{ Computes a multivariate nonparametric test of independence. The default method implements the distance covariance test \code{\link{dcov.test}}. } \usage{ indep.test(x, y, method = c("dcov","mvI"), index = 1, R) } \arguments{ \item{x}{ matrix: first sample, observations in rows} \item{y}{ matrix: second sample, observations in rows} \item{method}{ a character string giving the name of the test} \item{index}{ exponent on Euclidean distances} \item{R}{ number of replicates} } \details{ \code{indep.test} with the default \code{method = "dcov"} computes the distance covariance test of independence. \code{index} is an exponent on the Euclidean distances. Valid choices for \code{index} are in (0,2], with default value 1 (Euclidean distance). The arguments are passed to the \code{dcov.test} function. See the help topic \code{\link{dcov.test}} for the description and documentation and also see the references below. \code{indep.test} with \code{method = "mvI"} computes the coefficient \eqn{\mathcal I_n}{I_n} and performs a nonparametric \eqn{\mathcal E}{E}-test of independence. The arguments are passed to \code{mvI.test}. The \code{index} argument is ignored (\code{index = 1} is applied). See the help topic \code{\link{mvI.test}} and also see the reference (2006) below for details. The test decision is obtained via bootstrap, with \code{R} replicates. The sample sizes (number of rows) of the two samples must agree, and samples must not contain missing values. These energy tests of independence are based on related theoretical results, but different test statistics. The \code{dcov} method is faster than \code{mvI} method by approximately a factor of O(n). } \value{ \code{indep.test} returns a list with class \code{htest} containing \item{ method}{description of test} \item{ statistic}{observed value of the test statistic \eqn{n \mathcal V_n^2}{n V_n^2} or \eqn{n \mathcal I_n^2}{n I_n^2}} \item{ estimate}{ \eqn{\mathcal V_n}{V_n} or \eqn{\mathcal I_n}{I_n}} \item{ estimates}{ a vector [dCov(x,y), dCor(x,y), dVar(x), dVar(y)] (method dcov)} \item{ replicates}{ replicates of the test statistic} \item{ p.value}{approximate p-value of the test} \item{ data.name}{description of data} } \note{As of energy-1.1-0, \code{indep.etest} is deprecated and replaced by \code{indep.test}, which has methods for two different energy tests of independence. \code{indep.test} applies the distance covariance test (see \code{dcov.test}) by default (\code{method = "dcov"}). The original \code{indep.etest} applied the independence coefficient \eqn{\mathcal I_n}{I_n}, which is now obtained by \code{method = "mvI"}. } \seealso{ \code{ \link{dcov.test} } \code{ \link{mvI.test} } \code{ \link{dcov} } \code{ \link{mvI} } } \references{ Szekely, G.J. and Rizzo, M.L. (2009), Brownian Distance Covariance, \emph{Annals of Applied Statistics}, Vol. 3 No. 4, pp. 1236-1265. (Also see discussion and rejoinder.) \cr \url{http://dx.doi.org/10.1214/09-AOAS312} Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007), Measuring and Testing Dependence by Correlation of Distances, \emph{Annals of Statistics}, Vol. 35 No. 6, pp. 2769-2794. \cr \url{http://dx.doi.org/10.1214/009053607000000505} Bakirov, N.K., Rizzo, M.L., and Szekely, G.J. (2006), A Multivariate Nonparametric Test of Independence, \emph{Journal of Multivariate Analysis} 93/1, 58-80, \cr \url{http://dx.doi.org/10.1016/j.jmva.2005.10.005} } \author{ Maria L. Rizzo \email{mrizzo @ bgsu.edu} and Gabor J. Szekely } \examples{ ## independent multivariate data x <- matrix(rnorm(60), nrow=20, ncol=3) y <- matrix(rnorm(40), nrow=20, ncol=2) indep.test(x, y, method = "dcov", R = 99) indep.test(x, y, method = "mvI", R = 99) \dontrun{ ## dependent multivariate data if (require(MASS)) { Sigma <- matrix(c(1, .1, 0, 0 , 1, 0, 0 ,.1, 1), 3, 3) x <- mvrnorm(30, c(0, 0, 0), diag(3)) y <- mvrnorm(30, c(0, 0, 0), Sigma) * x indep.test(x, y, R = 99) #dcov method indep.test(x, y, method = "mvI", R = 99) } } \dontrun{ ## compare the computing time x <- mvrnorm(50, c(0, 0, 0), diag(3)) y <- mvrnorm(50, c(0, 0, 0), Sigma) * x set.seed(123) system.time(indep.test(x, y, method = "dcov", R = 1000)) set.seed(123) system.time(indep.test(x, y, method = "mvI", R = 1000)) } } \keyword{ htest } \keyword{ multivariate } \keyword{ nonparametric } \concept{ independence } \concept{ energy statistics } energy/man/energy-package.Rd0000644000176200001440000000301613145565237015502 0ustar liggesusers\name{energy-package} \alias{energy-package} \alias{energy} \docType{package} \title{ E-statistics: Multivariate Inference via the Energy of Data } \description{ Description: E-statistics (energy) tests and statistics for multivariate and univariate inference, including distance correlation, one-sample, two-sample, and multi-sample tests for comparing multivariate distributions, are implemented. Measuring and testing multivariate independence based on distance correlation, partial distance correlation, multivariate goodness-of-fit tests, clustering based on energy distance, testing for multivariate normality, distance components (disco) for non-parametric analysis of structured data, and other energy statistics/methods are implemented. } \author{ Maria L. Rizzo and Gabor J. Szekely } \references{ G. J. Szekely and M. L. Rizzo (2013). Energy statistics: A class of statistics based on distances, \emph{Journal of Statistical Planning and Inference}, \url{http://dx.doi.org/10.1016/j.jspi.2013.03.018} M. L. Rizzo and G. J. Szekely (2016). Energy Distance, \emph{WIRES Computational Statistics}, Wiley, Volume 8 Issue 1, 27-38. Available online Dec., 2015, \url{http://dx.doi.org/10.1002/wics.1375}. G. J. Szekely and M. L. Rizzo (2017). The Energy of Data. \emph{The Annual Review of Statistics and Its Application} 4:447-79. \url{10.1146/annurev-statistics-060116-054026} } \keyword{ package } \keyword{ multivariate } energy/man/centering.Rd0000644000176200001440000000456312757710073014604 0ustar liggesusers\name{centering distance matrices} \alias{Ucenter} \alias{Dcenter} \alias{U_center} \alias{D_center} \title{ Double centering and U-centering } \description{ Stand-alone double centering and U-centering functions that are applied in unbiased distance covariance, bias corrected distance correlation, and partial distance correlation. } \usage{ Dcenter(x) Ucenter(x) U_center(Dx) D_center(Dx) } \arguments{ \item{x}{ dist object or data matrix} \item{Dx}{ distance or dissimilarity matrix} } \details{ In \code{Dcenter} and \code{Ucenter}, \code{x} must be a \code{dist} object or a data matrix. Both functions return a doubly centered distance matrix. Note that \code{pdcor}, etc. functions include the centering operations (in C), so that these stand alone versions of centering functions are not needed except in case one wants to compute just a double-centered or U-centered matrix. \code{U_center} is the Rcpp export of the cpp function. \code{D_center} is the Rcpp export of the cpp function. } \value{ All functions return a square symmetric matrix. \code{Dcenter} returns a matrix \deqn{A_{ij}=a_{ij} - \bar a_{i.} - \bar a_{.j} + \bar a_{..}} as in classical multidimensional scaling. \code{Ucenter} returns a matrix \deqn{\tilde A_{ij}=a_{ij} - \frac{a_{i.}}{n-2} - \frac{a_{.j}}{n-2} + \frac{a_{..}}{(n-1)(n-2)},\quad i \neq j,} with zero diagonal, and this is the double centering applied in \code{pdcov} and \code{pdcor} as well as the unbiased dCov and bias corrected dCor statistics. } \note{ The c++ versions \code{D_center} and \code{U_center} should typically be faster. R versions are retained for historical reasons. } \references{ Szekely, G.J. and Rizzo, M.L. (2014), Partial Distance Correlation with Methods for Dissimilarities, \emph{Annals of Statistics}, Vol. 42, No. 6, pp. 2382-2412. DOI \url{dx.doi.org/10.1214/14-AOS1255} \cr \url{http://projecteuclid.org/euclid.aos/1413810731} } \author{ Maria L. Rizzo \email{mrizzo @ bgsu.edu} and Gabor J. Szekely } \examples{ x <- iris[1:10, 1:4] dx <- dist(x) Dx <- as.matrix(dx) M <- U_center(Dx) all.equal(M, U_center(M)) #idempotence all.equal(M, D_center(M)) #invariance } \keyword{ multivariate } \concept{ multivariate } \concept{ distance correlation } \concept{ distance covariance } \concept{ energy statistics } energy/man/energy.hclust.Rd0000644000176200001440000001266212757710073015417 0ustar liggesusers\name{energy.hclust} \alias{energy.hclust} \title{ Hierarchical Clustering by Minimum (Energy) E-distance } \description{ Performs hierarchical clustering by minimum (energy) E-distance method. } \usage{ energy.hclust(dst, alpha = 1) } \arguments{ \item{dst}{\code{dist} object} \item{alpha}{distance exponent} } \details{ Dissimilarities are \eqn{d(x,y) = \|x-y\|^\alpha}{||x-y||^a}, where the exponent \eqn{\alpha}{a} is in the interval (0,2]. This function performs agglomerative hierarchical clustering. Initially, each of the n singletons is a cluster. At each of n-1 steps, the procedure merges the pair of clusters with minimum e-distance. The e-distance between two clusters \eqn{C_i, C_j} of sizes \eqn{n_i, n_j} is given by \deqn{e(C_i, C_j)=\frac{n_i n_j}{n_i+n_j}[2M_{ij}-M_{ii}-M_{jj}], } where \deqn{M_{ij}=\frac{1}{n_i n_j}\sum_{p=1}^{n_i} \sum_{q=1}^{n_j} \|X_{ip}-X_{jq}\|^\alpha,}{ M_{ij} = 1/(n_i n_j) sum[1:n_i, 1:n_j] ||X_(ip) - X_(jq)||^a,} \eqn{\|\cdot\|}{|| ||} denotes Euclidean norm, and \eqn{X_{ip}}{ X_(ip)} denotes the p-th observation in the i-th cluster. The return value is an object of class \code{hclust}, so \code{hclust} methods such as print or plot methods, \code{plclust}, and \code{cutree} are available. See the documentation for \code{hclust}. The e-distance measures both the heterogeneity between clusters and the homogeneity within clusters. \eqn{\mathcal E}{E}-clustering (\eqn{\alpha=1}{a=1}) is particularly effective in high dimension, and is more effective than some standard hierarchical methods when clusters have equal means (see example below). For other advantages see the references. \code{edist} computes the energy distances for the result (or any partition) and returns the cluster distances in a \code{dist} object. See the \code{edist} examples. } \value{ An object of class \code{hclust} which describes the tree produced by the clustering process. The object is a list with components: \item{merge:}{ an n-1 by 2 matrix, where row i of \code{merge} describes the merging of clusters at step i of the clustering. If an element j in the row is negative, then observation -j was merged at this stage. If j is positive then the merge was with the cluster formed at the (earlier) stage j of the algorithm.} \item{height:}{the clustering height: a vector of n-1 non-decreasing real numbers (the e-distance between merging clusters)} \item{order:}{ a vector giving a permutation of the indices of original observations suitable for plotting, in the sense that a cluster plot using this ordering and matrix \code{merge} will not have crossings of the branches.} \item{labels:}{ labels for each of the objects being clustered.} \item{call:}{ the call which produced the result.} \item{method:}{ the cluster method that has been used (e-distance).} \item{dist.method:}{ the distance that has been used to create \code{dst}.} } \note{ Currently \code{stats::hclust} implements Ward's method by \code{method="ward.D2"}, which applies the squared distances. That method was previously \code{"ward"}. Because both \code{hclust} and energy use the same type of Lance-Williams recursive formula to update cluster distances, now with the additional option \code{method="ward.D"} in \code{hclust}, the energy distance method is easily implemented by \code{hclust}. (Some "Ward" algorithms do not use Lance-Williams, however). Energy clustering (with \code{alpha=1}) and "ward.D" now return the same result, except that the cluster heights of {energy.hclust} with \code{alpha=1} are two times the heights from \code{hclust}. However, the implementation in the energy package is more than 100 times faster than \code{hclust}. in a recent benchmark. } \references{ Szekely, G. J. and Rizzo, M. L. (2005) Hierarchical Clustering via Joint Between-Within Distances: Extending Ward's Minimum Variance Method, \emph{Journal of Classification} 22(2) 151-183. \cr \url{http://dx.doi.org/10.1007/s00357-005-0012-9} Szekely, G. J. and Rizzo, M. L. (2004) Testing for Equal Distributions in High Dimension, \emph{InterStat}, November (5). Szekely, G. J. (2000) Technical Report 03-05: \eqn{\mathcal{E}}{E}-statistics: Energy of Statistical Samples, Department of Mathematics and Statistics, Bowling Green State University. } \author{ Maria L. Rizzo \email{mrizzo @ bgsu.edu} and Gabor J. Szekely } \seealso{ \code{\link{edist}} \code{\link{ksample.e}} \code{\link{eqdist.etest}} \code{hclust}} \examples{ \dontrun{ library(cluster) data(animals) plot(energy.hclust(dist(animals))) } data(USArrests) ecl <- energy.hclust(dist(USArrests)) print(ecl) plot(ecl) cutree(ecl, k=3) cutree(ecl, h=150) ## compare performance of e-clustering, Ward's method, group average method ## when sampled populations have equal means: n=200, d=5, two groups z <- rbind(matrix(rnorm(1000), nrow=200), matrix(rnorm(1000, 0, 5), nrow=200)) g <- c(rep(1, 200), rep(2, 200)) d <- dist(z) e <- energy.hclust(d) a <- hclust(d, method="average") w <- hclust(d^2, method="ward.D2") list("E" = table(cutree(e, k=2) == g), "Ward" = table(cutree(w, k=2) == g), "Avg" = table(cutree(a, k=2) == g)) } \keyword{ multivariate } \keyword{ cluster } \concept{ energy statistics } energy/man/disco.Rd0000644000176200001440000001120713145562044013713 0ustar liggesusers\name{disco} \alias{disco} \alias{disco.between} \alias{print.disco} \title{ distance components (DISCO)} \description{ E-statistics DIStance COmponents and tests, analogous to variance components and anova. } \usage{ disco(x, factors, distance, index=1.0, R, method=c("disco","discoB","discoF")) disco.between(x, factors, distance, index=1.0, R) } \arguments{ \item{x}{ data matrix or distance matrix} \item{factors}{ matrix of factor labels or integers (not design matrix)} \item{distance}{ logical, TRUE if x is distance matrix} \item{index}{ exponent on Euclidean distance in (0,2]} \item{R}{ number of replicates for a permutation test} \item{method}{ test statistic } } \details{ \code{disco} calculates the distance components decomposition of total dispersion and if R > 0 tests for significance using the test statistic disco "F" ratio (default \code{method="disco"}), or using the between component statistic (\code{method="discoB"}), each implemented by permutation test. In the current release \code{disco} computes the decomposition for one-way models only. } \value{ When \code{method="discoF"}, \code{disco} returns a list similar to the return value from \code{anova.lm}, and the \code{print.disco} method is provided to format the output into a similar table. Details: \code{disco} returns a class \code{disco} object, which is a list containing \item{call}{call} \item{method}{method} \item{statistic}{vector of observed statistics} \item{p.value}{vector of p-values} \item{k}{number of factors} \item{N}{number of observations} \item{between}{between-sample distance components} \item{withins}{one-way within-sample distance components} \item{within}{within-sample distance component} \item{total}{total dispersion} \item{Df.trt}{degrees of freedom for treatments} \item{Df.e}{degrees of freedom for error} \item{index}{index (exponent on distance)} \item{factor.names}{factor names} \item{factor.levels}{factor levels} \item{sample.sizes}{sample sizes} \item{stats}{matrix containing decomposition} When \code{method="discoB"}, \code{disco} passes the arguments to \code{disco.between}, which returns a class \code{htest} object. \code{disco.between} returns a class \code{htest} object, where the test statistic is the between-sample statistic (proportional to the numerator of the F ratio of the \code{disco} test. } \references{ M. L. Rizzo and G. J. Szekely (2010). DISCO Analysis: A Nonparametric Extension of Analysis of Variance, Annals of Applied Statistics, Vol. 4, No. 2, 1034-1055. \cr \url{http://dx.doi.org/10.1214/09-AOAS245} } \note{ The current version does all calculations via matrix arithmetic and boot function. Support for more general additive models and a formula interface is under development. \code{disco} methods have been added to the cluster distance summary function \code{edist}, and energy tests for equality of distribution (see \code{eqdist.etest}). } \seealso{ \code{ \link{edist} } \code{ \link{eqdist.e} } \code{ \link{eqdist.etest} } \code{ \link{ksample.e} } } \author{ Maria L. Rizzo \email{mrizzo @ bgsu.edu} and Gabor J. Szekely } \examples{ ## warpbreaks one-way decompositions data(warpbreaks) attach(warpbreaks) disco(breaks, factors=wool, R=99) ## When index=2 for univariate data, we get ANOVA decomposition disco(breaks, factors=tension, index=2.0, R=99) aov(breaks ~ tension) ## Multivariate response ## Example on producing plastic film from Krzanowski (1998, p. 381) tear <- c(6.5, 6.2, 5.8, 6.5, 6.5, 6.9, 7.2, 6.9, 6.1, 6.3, 6.7, 6.6, 7.2, 7.1, 6.8, 7.1, 7.0, 7.2, 7.5, 7.6) gloss <- c(9.5, 9.9, 9.6, 9.6, 9.2, 9.1, 10.0, 9.9, 9.5, 9.4, 9.1, 9.3, 8.3, 8.4, 8.5, 9.2, 8.8, 9.7, 10.1, 9.2) opacity <- c(4.4, 6.4, 3.0, 4.1, 0.8, 5.7, 2.0, 3.9, 1.9, 5.7, 2.8, 4.1, 3.8, 1.6, 3.4, 8.4, 5.2, 6.9, 2.7, 1.9) Y <- cbind(tear, gloss, opacity) rate <- factor(gl(2,10), labels=c("Low", "High")) ## test for equal distributions by rate disco(Y, factors=rate, R=99) disco(Y, factors=rate, R=99, method="discoB") ## Just extract the decomposition table disco(Y, factors=rate, R=0)$stats ## Compare eqdist.e methods for rate ## disco between stat is half of original when sample sizes equal eqdist.e(Y, sizes=c(10, 10), method="original") eqdist.e(Y, sizes=c(10, 10), method="discoB") ## The between-sample distance component disco.between(Y, factors=rate, R=0) } \keyword{ htest } \keyword{ multivariate }