kSamples/0000755000176200001440000000000013470620413012031 5ustar liggesuserskSamples/inst/0000755000176200001440000000000013452475503013016 5ustar liggesuserskSamples/inst/NEWS.Rd0000644000176200001440000000163713470612016014060 0ustar liggesusers\name{NEWS} \encoding{UTF-8} \title{News for Package \pkg{kSamples}} \section{Changes in version 1.2-8 (2018-06-02)}{ \subsection{Bug Removal}{ \itemize{ \item Fixed a bug in data processing into list format, relevant to blocked data. \item Added some references to tests of similar nature and added a note about it. } } } \section{Changes in version 1.2-8 (2018-06-02)}{ \subsection{Version Synchronization}{ \itemize{ \item Fixed version synchronization in documentation. } } } \section{Changes in version 1.2-7 (2017-08-10)}{ \subsection{Bug report}{ \itemize{ \item Fixed a bug in formula input to list conversion. Anomaly pointed out by Jose Ferreira. } } } \section{Changes in version 1.2-6 (2017-06-14)}{ \subsection{Memory management}{ \itemize{ \item In ad.test stack memory management was changed to heap memory, as suggested by Tomasz Melcer. } } } kSamples/src/0000755000176200001440000000000013470613013012616 5ustar liggesuserskSamples/src/kSamples.c0000644000176200001440000020673713470613013014560 0ustar liggesusers/* helper functions */ #include #include #include #include /* for random number generator in R */ #include #include #include #include "myfuns.h" /* Code based to a large extent on Angie Zhu's original version, last modified 06/11/2015 (FWS)*/ /* dynamically allocates memory for a double array of length n and returns the pointer; prints out error message if the allocation is unsuccessful */ double *dmalloc(unsigned long n) { double *x; x = (double*) malloc((size_t) n * sizeof(double)); /* if (x == NULL) { printf("Error: Could not allocate %ld doubles\n", n); } */ return(x); } /* counts and returns the number of occurrence of a given double number z in a double array dat of length n */ int getCount(double z, double *dat, int n) { int i; int count = 0; for (i = 0; i < n; i++) { if (dat[i] == z) { count++; } } return(count); } /* computes and returns the sum of elements in a given integer array x of length n */ int getSum(int *x, int n) { int i; int sum = 0; for (i = 0; i < n; i++) { sum += x[i]; } return(sum); } /* dynamically allocates memory for an integer array with length n and returns its pointer; prints out error message if the allocation is unsuccessful */ int *imalloc(unsigned long n) { int *x; x = (int*) malloc((size_t) n * sizeof(int)); /* if (x == NULL) { printf("Error: Could not allocate %ld ints\n", n); } */ return(x); } /* produces a copy of double n*m matrix x */ void mcopy(double *x, double *copy, int n, int m) { int i; n *= m; for (i = 0; i < n; i++) { *(copy + i) = *(x + i); } } /* produces a copy of int n*m matrix X */ void imcopy(int *x, int *copy, int n, int m) { int i; n *= m; for (i = 0; i < n; i++) { *(copy + i) = *(x + i); } } /* dynamically allocates memory for an array of pointers to double arrays with length n and returns the pointer; prints out error message if the allocation is unsuccessful */ double **pdmalloc(unsigned long n) { double **x; x = (double**) malloc((size_t) n * sizeof(double*)); /* if (x == NULL) { printf("Error: Could not allocate %ld pointers to double arrays\n", n); } */ return(x); } /*************************** * Project: k-Sample Anderson-Darling Tests * adkTestStat * Last modified: 07.21.2015 (FWS) ***************************/ /* static variables */ static int k; static int L; static int nsum; static int ncomb; static int getSmat; /* logical value to indicate if smat will be generated */ static int count; /* keeps track of how many combinations have been visited */ static int *ns; static int *ivec; static int dimst; static double *xvec; static double *zstar; static double *teststat; static double *pval; static double *smat; /* statistic matrix, recording the statistics*/ /* initializes static variables */ void initvals(int kk, double *xxvec, int *nns, double *zzstar, int LL, int *iivec, double *tteststat, double *ppval, int nnsum, int nncomb, int ggetSmat, double *ssmat, int ddimst) { k = kk; xvec = xxvec; ns = nns; zstar = zzstar; L = LL; ivec = iivec; teststat = tteststat; pval = ppval; nsum = nnsum; ncomb = nncomb; getSmat = ggetSmat; smat = ssmat; count = 0; dimst = ddimst; } /* initializes static variables */ void initvals1(int kk, double *xxvec, int *nns, double *zzstar, int LL){ k = kk; xvec = xxvec; ns = nns; zstar = zzstar; L = LL; } /* computes the k-sample Anderson-Darling test statistics in both original and alternative versions for the nonparametric (rank) test described in Scholz F.W. and Stephens M.A. (1987), K-sample Anderson-Darling Tests, Journal of the American Statistical Association, Vol 82, No. 399, pp. 918-924 Arguments: adk: double array of length 2, stores AkN2 and AakN2 k: integer, number of samples being compared x: double array storing the concatenated samples in the same order as in ns ns: integer array storing the k sample sizes, corresponding to x zstar: double array storing the l distinct ordered observations in the pooled sample L: integer, length of zstar Outputs: when the computation ends, AkN2 and AakN2 are stored in the given memory pointed to by adk */ void adkTestStat(double *adk, int k, double *x, int *ns){ int i; int j; /* fij records the number of observations in the ith sample coinciding with zstar[j], where i = 1, ..., k, and j = 1, ..., L */ /* int fij[k*L]; replaced my next line as per Tomasz Melcer*/ int *fij = calloc(k*L, sizeof *fij); /* lvec is an integer vector with length L, whose jth entry = \sum_{i=1}^{k} f_{ij}, i.e., the multiplicity of zstar[j] */ /* int lvec[L]; replaced my next line as per Tomasz Melcer */ int *lvec = calloc(L, sizeof *lvec); /* for computation */ double mij; double maij; double innerSum; double aInnerSum; double bj; double baj; double tmp; /* samples is a two-dimensional double array with length k; it stores an array of k pointers to double arrays which are the k samples being compared */ double **samples; /* dynamically allocate memory */ samples = pdmalloc(k); nsum = 0; for (i = 0; i < k; i++) { samples[i] = dmalloc(ns[i]); for (j = 0; j < ns[i]; j++) { samples[i][j] = x[nsum + j]; } nsum += ns[i]; } /* fij: k*L integer matrix, where L is the length of zstar and k is the number of samples being compared lvec: integer vector of length L, records the multiplicity of each element of zstar */ for (j = 0; j < L; j++) { lvec[j] = 0; for (i = 0; i < k; i++) { fij[i + j*k] = getCount(zstar[j], samples[i], ns[i]); lvec[j] += fij[i + j*k]; } } adk[0] = adk[1] = 0; for (i = 0; i < k; i++) { mij = 0; maij = 0; innerSum = 0; aInnerSum = 0; for (j = 0; j < L; j++) { mij += fij[i + j*k]; maij = mij - (double) fij[i + j*k] / 2.0; bj = getSum(lvec, j + 1); baj = bj - (double) lvec[j] / 2.0; if (j < L - 1) { tmp = (double) nsum * mij - (double) ns[i] * bj; innerSum = innerSum + (double) lvec[j] * tmp * tmp / (bj * ((double) nsum - bj)); } tmp = (double) nsum * maij - (double) ns[i] * baj; aInnerSum = aInnerSum + (double) lvec[j] * tmp * tmp / (baj * (nsum - baj) - nsum * (double) lvec[j] / 4.0); } adk[0] = adk[0] + innerSum / ns[i]; /* AkN2*/ adk[1] = adk[1] + aInnerSum / ns[i]; /* AakN2 */ } /* k-sample Anderson-Darling test statistics in both original and alternative versions, AkN2 and AakN2, are stored in the given double array adk */ adk[0] = adk[0] / (double) nsum; /* AkN2*/ adk[1] = (nsum - 1) * adk[1] / ((double) nsum * (double) nsum); /* AakN2 */ /* free pointers */ for (i = 0; i < k; i++) { free(samples[i]); } free(samples); free(lvec); free(fij); } /* ***************************************** * Project: k-Sample Anderson-Darling Tests * functions initvals, exactcomb * Last modified: 07.28.2015 (FWS) *******************************************/ /* The algorithm of generating all possible combinations using Chase's sequence is written by Donald Knuth (TAOCP V.4A, 7.2.1.3 Algorithm C) */ /* uses recursive backtracking to find all possible ways/combinations to divide n elements into k subcollections, where the size of each subcollection is fixed; for each combination, the desired test statistics are computed and compared with the observed values. The recursion arises because at each iteration of exactcomb a combination of ns[i] is split off from what is left over, until nothing more can be split off because only ns[k-1] are left over */ int exactcomb(int now, int *position, int m, int dimst, void(*testStatFun)(double *teststat, int k, double *x, int *ns)) { /* exactcomb in its initial call starts with now = 0, m = nsum, and position filled with i = 0, 1, 2, ..., nsum-1. */ int i; int j; if (now == k - 1) { /* here we have arrived at the end stage when nothing more can be split off */ double xc[nsum]; double teststatcomb[dimst]; double *pt; /* fills the remaining m=ns[k-1] positions of ivec with now = k-1 (FWS) */ for (i = 0; i < m; i++) { ivec[position[i]] = now; } /* here we equate the pointer pt with that of xc and by filling the associated array pt we also fill the array xc. (FWS) */ pt = xc; /* here we fill pt=xc first with all the xvec[j] which are supposed to belong to sample 0 according to ivec, then with those belonging to sample 1, and so on. This will give us the full permuted sample sequence (FWS) */ for (i = 0; i < k; i++) { for (j = 0; j < nsum; j++) { if (ivec[j] == i) { *pt = xvec[j]; pt++; } } } /* get test statistics for this combination xc */ (*testStatFun)(teststatcomb, k, xc, ns); /* compares test statistics for this combination with observed ones */ for (j = 0; j < dimst; j++) { if (getSmat) { /* records the test statistics for each combination */ smat[count + ncomb * j] = teststatcomb[j]; } /* compares teststatcomb for this combination with observed one */ if (teststatcomb[j] >= teststat[j]) { pval[j] = pval[j] + 1; } } count++; return(2); /* this return gets us back to just beyond the point of the last previous call to exactcomb, to find the next combination at that stage of now (FWS) */ } else { /* Algorithm using Chase's sequence by Donald Knuth (TAOCP V.4A, 7.2.1.3 Algorithm C) */ int s = m - ns[now]; /* s represents the size of the remainder after the ns[now] sample values for the current combination have be chosen. The meaning of the variables a, w, r is pretty much explained in Knuth, p. 367. In particular, a[i] = 1 means that the element with index i is designated as part of the chosen combination. (FWS) */ int r; int *a; int *w; int newposition[s]; /* this newposition array is meant to replace the position array in the recursive call to exactcomb, to get to the next combination of the k combinations to be chosen. It is set up below, right after while (1) {.... (FWS) */ int *tmp; /* this pointer is equated to the pointer of the newposition array and is used to fill that array. (FWS) */ /* initializes variables */ /* this is the start of the C1 step in Knuth's algorithm C (FWS) */ a = imalloc(m); w = imalloc(m + 1); for (j = 0; j < s; j++) { a[j] = 0; w[j] = 1; } for (j = s; j < m; j++) { a[j] = w[j] = 1; } w[m] = 1; if (s > 0) { r = s; } else { r = ns[now]; } /* this is the end of the C1 step in Knuth's algorithm C (FWS) */ j = r; /* the setup of this function assures that j != m at this point since ns[now] > 0 and ns[now] != m */ while (1) { /* visits current combination */ /* here we equate the pointers tmp and newposition and by filling tmp we fill newposition. (FWS) */ /* visits current combination */ tmp = newposition; /* If indicated by a[i]=1 (relative to the current position array and w.r.t. the array a in that context), we fill ivec at index position[i] with the sample index now, that is under discussion here. All other position indices are collected inside the array newposition, by assignment via tmp. It amounts to splitting the m position elements into two groups of size ns[now] (the chosen combination for the now sample) and s = m-ns[now], the remainder. (FWS) */ for (i = 0; i < m; i++) { if (a[i]) { ivec[position[i]] = now; } else { *tmp = position[i]; tmp++; } } /* recursive function call */ /* to get the next combination, as indicated by now+1, using the residual position vector newposition, but when understanding what happens to it, that newposition vector is referred to as position inside the algorithm exactcomb. (FWS) */ exactcomb(now + 1, newposition, s, dimst, testStatFun); /* finds j and branches */ j = r; while(w[j] == 0) { w[j] = 1; j++; } /* Here we find out whether we have encountered the last combination already, and whether we should step back prior to the last invocation of exactcomb, possibly leading to further stepping back, until there is no more stepping back, i.e., we have traversed all combination splits. If we do not terminate here, we generate the next step in the array generation, according to Knuth's C2-C7. (FWS) */ if (j == m) { /* terminate point of this algorithm */ return(1); } else { w[j] = 0; } if (a[j] == 1) { if (j % 2 == 0 && a[j-2] == 0) { a[j-2] = 1; a[j] = 0; if (r == j) { if (j - 2 > 1) { r = j - 2; } else { r = 1; } } else if (r == j - 2) { r = j - 1; } } else { a[j-1] = 1; a[j] = 0; if (r == j && j > 1) { r = j - 1; } else if (r == j - 1) { r = j; } } } else { /* a[j] == 0 */ if (j % 2 == 1 && a[j-1] == 0) { a[j] = 1; a[j-2] = 0; if (r == j - 2) { r = j; } else if (r == j - 1) { r = j - 2; } } else { a[j] = 1; a[j-1] = 0; if (r == j && j > 1) { r = j - 1; } else if (r == j - 1) { r = j; } } } } /* This return gets us back to just past the last invocation of exactcomb. We either arrive at now = k-1 or need to split off further combinations as needed. (FWS) */ return(0); } } /******************************************* * Project: k-Sample Anderson-Darling Tests * function adkPVal * Last modified: 07.21.2015 (FWS) *******************************************/ /* computes or estimates (depends on Nsim and the total number of all possible combinations) p-values for the observed k-sample Anderson-Darling test statistics in both original and alternative versions for the nonparametric (rank)test described in Scholz F.W. and Stephens M.A. (1987), K-sample Anderson-Darling Tests, Journal of the American Statistical Association, Vol 82, No. 399, pp. 918-924 Arguments: pval: double array of length 2, stores estimated p-values for the observed AkN2 and AakN2 Nsim: integer, number of simulations k: integer, number of samples being compared x: double array storing the concatenated samples in the same order as in ns ns: integer array storing the k sample sizes, corresponding to x zstar: double array storing the l distinct ordered observations in the pooled sample L: integer, length of zstar useExact: integer, 0: not, 1: yes; indicates whether the p-value will be computed via examining all possible combinations (this occurs when ncomb < Nsim, i.e., the total number of possible combinations is less than Nsim and the user chooses the exact approach, see R function getAdkPVal for details) getA2mat: logical, to indicate if a2mat, a double matrix storing the test statistics of all exact or simulated combinations, will be returned as part of the output ncomb: integer, number of all possible combinations a2mat: double matrix, either ncomb * 2 or Nsim * 2, depending on which approach is used, stores the test statistics of all exact or simulated combinations Outputs: when the computation ends, p-values of the observed AkN2 and AakN2 are stored in the given memory pointed to by pval and the test statistics of all exact or simulated combinations are stored in the given memory pointed to by a2mat (1st column: AkN2, 2nd column: AakN2) */ void adkPVal(double *pval, int Nsim, int k, double *x, int *ns, double *zstar, int L, int useExact, int getA2mat, double ncomb, double *a2mat) { int i; int j; int nsum = getSum(ns, k); /* total sample size = n_1 + ... + n_k */ int index; double adk[2]; /* initializes static variables */ initvals1(k, x, ns, zstar, L); /* gets observed AkN2 and AakN2 */ (*adkTestStat)(adk, k, x, ns); pval[0] = pval[1] = 0; /* uses R random number generator */ GetRNGstate(); if (useExact) { /* goes through all possible combinations */ int ivec[nsum]; int position[nsum]; for (i = 0; i < nsum; i++) { position[i] = i; } initvals(k, x, ns, zstar, L, ivec, adk, pval, nsum, (int) ncomb, getA2mat, a2mat,2); exactcomb(0, position, nsum, 2, adkTestStat); /* gets exact p-values */ pval[0] = pval[0] / (double) ncomb; pval[1] = pval[1] / (double) ncomb; } else { /* uses Nsim simulations to get p-value */ double randy; double temp; double adksim[2]; double xc[nsum]; /* copy of x */ for (i = 0; i < Nsim; i++) { /* gets random permutation xc of x */ randPerm(nsum, x, xc, ns); /* gets simulated AkN2 and AakN2 */ (*adkTestStat)(adksim, k, xc, ns); /* compares simulated AkN2 and AakN2 with observed ones */ for (j = 0; j < 2; j++) { if (getA2mat) { /* records the AD test statistics for each simulated combination */ a2mat[i + Nsim * j] = adksim[j]; } if (adksim[j] >= adk[j]) { pval[j] = pval[j] + 1; } } } /* estimates p-values */ pval[0] = pval[0] / (double) Nsim; pval[1] = pval[1] / (double) Nsim; } /* finishes using R random number generator */ PutRNGstate(); } /*************************** * Project: k-Sample Anderson-Darling Tests * Function adkPVal0 * Last modified: 07.21.2015 ***************************/ /* wrapper function for function adkPVal to enable R calls to it */ void adkPVal0(double *pval, int *Nsim, int *k, double *x, int *ns, double *zstar, int *L, int *useExact, int * getA2mat, double *ncomb, double *a2mat){ adkPVal(pval, *Nsim, *k, x, ns, zstar, *L, *useExact, *getA2mat, *ncomb, a2mat); } /* wrapper function for function adkTestStat to enable R calls to it */ void adkTestStat0(double *ans, int *k, double *x, int *ns, double *zstar, int *L){ initvals1(*k, x, ns, zstar, *L); adkTestStat(ans, *k, x, ns); } /*********************************************** * End Project: k-Sample Anderson-Darling Tests ************************************************/ /*********************************** * Project: 2*t Contingency Table * Function contingency2xtExact * Last modified: 07.21.2015 ***********************************/ /* computes the exact null distribution of the Kruskal-Wallis statistics in a 2 x t contingency table, which is \bar{K}^* = N * (N - 1) * (\sum (A_i^2 / d_i) - m^2 / N) / (m * n ) Define delta = \sum (A_i^2 / d_i) # Treatment | 1 2 ... t | Total # --------------------------------------- # Response | | # a | A_1 A_2 ... A_t | m # b | B_1 B_2 ... B_t | n # --------------------------------------- # Total | d_1 d_2 ... d_t | N Arguments: Avec: integer array of length tnum, storing the column counts with Response "a" Bvec: integer array of length tnum, storing the column counts with Response "b" tnum: integer, number of columns in the contingency table ncomb: integer, number of possible splits of m into the sum of tnum nonnegative integers, i.e., choose(m + tnum - 1, tnum - 1) results: double vector of length (2 + 2 * ncomb), whose first two entries contain the observed value of delta and its p-value, followed by the ncomb delta values for each possible split, where delta is defined to be \sum (A_i^2 / d_i), followed by the corresponding ncomb probabilities (note that some splits have 0 probability since a_i <= d_i) Output: the values of delta, \sum (A_i^2 / d_i), observed and for all splits, and their corresponding p-value and probabilities are stored in the given memory pointed by results */ void contingency2xtExact(int *Avec, int *Bvec, int tnum, int ncomb, int getDist, double *results) { int dvec[tnum]; /* tnum column sums */ int m = 0; /* row sum for response "a" */ int n = 0; /* row sum for response "b" */ int i; int j; int sum; int count; int boolean; int flag; int uvec[tnum - 1]; /* index vector */ /* xvec: numbers of units with Response "a" in each treatment group */ int xvec[tnum]; double delta; double deltaObserved = 0; double prob; /* get m, n, dvec, and the observed delta */ for(i = 0; i < tnum; i++){ m = m + Avec[i]; n = n + Bvec[i]; dvec[i] = Avec[i] + Bvec[i]; deltaObserved = deltaObserved + Avec[i] * Avec[i] / (double) dvec[i]; } results[0] = deltaObserved; results[1] = 0; /* Algorithm using Chase's sequence by Donald Knuth (TAOCP V.4A, 7.2.1.3 Algorithm C); goes through all combinations of choosing (tnum-1) from (m + tnum - 1) distinct objects */ int *a; int *w; int *pt; int mt1 = m + tnum - 1; int s = m; int r; /* initializes variables */ /* this is the start of the C1 step in Knuth's algorithm C (FWS) */ a = imalloc(mt1); w = imalloc(mt1 + 1); for (j = 0; j < s; j++) { a[j] = 0; w[j] = 1; } for (j = s; j < mt1; j++) { a[j] = w[j] = 1; } w[mt1] = 1; if (s > 0) { r = s; } else { r = tnum - 1; } /* this is the end of the C1 step in Knuth's algorithm C (FWS) */ j = r; count = 2; boolean = 1; while (boolean) { /* visits current combination */ /* sets up the (tnum - 1) indices, where 1 <= uvec[0] < ... < uvec[tnum - 2] <= mt1 */ pt = uvec; for (i = 0; i < mt1; i++) { if (a[i]) { *pt = i + 1; pt++; } } /* computes x_i's , the number of units with response "a" in each of the treatment groups, which are stored in xvec */ sum = 0; delta = 0; for (i = 0; i < tnum - 1; i++) { xvec[i] = uvec[i] - i - 1 - sum; sum = sum + xvec[i]; delta = delta + xvec[i] * xvec[i] / (double) dvec[i]; } xvec[tnum-1] = m - sum; delta += xvec[tnum-1] * xvec[tnum-1] / (double) dvec[tnum-1]; /* store delta for this split*/ if(getDist){ results[count] = delta; } /* computes the probability associated with current combination */ prob = 1; /* initializes probability */ flag = 1; i = 0; while (flag && i < tnum) { if (xvec[i] > dvec[i]) { prob = 0; flag = 0; /* gets out of while loop early */ } else { prob *= choose(dvec[i], xvec[i]); i++; } } if (flag) { prob = prob / choose(m + n, m); } /* updating p-value */ if(delta >= deltaObserved) results[1] += prob; /* store probability for this split */ if(getDist){ results[count + ncomb] = prob; } count++; /* end of visiting current combination */ /* finds j and branches */ j = r; while(w[j] == 0) { w[j] = 1; j++; } if (j == mt1) { /* terminate point of this algorithm */ boolean = 0; /* gets out of while loop */ } else { /* continue */ w[j] = 0; if (a[j] == 1) { if (j % 2 == 0 && a[j-2] == 0) { a[j-2] = 1; a[j] = 0; if (r == j) { if (j - 2 > 1) { r = j - 2; } else { r = 1; } } else if (r == j - 2) { r = j - 1; } } else { a[j-1] = 1; a[j] = 0; if (r == j && j > 1) { r = j - 1; } else if (r == j - 1) { r = j; } } } else { /* a[j] == 0 */ if (j % 2 == 1 && a[j-1] == 0) { a[j] = 1; a[j-2] = 0; if (r == j - 2) { r = j; } else if (r == j - 1) { r = j - 2; } } else { a[j] = 1; a[j-1] = 0; if (r == j && j > 1) { r = j - 1; } else if (r == j - 1) { r = j; } } } } } /* end of while(boolean) */ /* frees the pointers */ free(a); free(w); } /* wrapper function for function table2xtExact to enable R calls to it */ void contingency2xtExact0(int *Avec, int *Bvec, int *tnum, int *ncomb, int *getDist, double *ans){ contingency2xtExact(Avec, Bvec, *tnum, *ncomb, *getDist, ans); } /* simulates the null distribution of the Kruskal-Wallis statistics in a 2 x t contingency table, which is \bar{K}^* = N * (N - 1) * (\sum (A_i^2 / d_i) - m^2 / N) / (m * n ) Define delta = \sum (A_i^2 / d_i) # Treatment | 1 2 ... t | Total # --------------------------------------- # Response | | # a | A_1 A_2 ... A_t | m # b | B_1 B_2 ... B_t | n # --------------------------------------- # Total | d_1 d_2 ... d_t | N Arguments: dvec: integer array of length tnum, storing the column totals of the 2 x t contingency table m: integer, the total number of units with Response "a" n: integer, the total number of units with Response "b" ncomb: integer, number of possible splits of m into the sum of tnum nonnegative integers, i.e., choose(m + tnum - 1, tnum - 1) nsim: integer, number of simulations results: double array, storing the simulated values of delta, where delta = \sum (A_i^2 / d_i) Output: the simulated values of delta, \sum (A_i^2 / d_i), are stored in the given memory pointed to by results */ void contingency2xtSim(int *Avec, int *Bvec, int tnum, int nsim, int getDist, double *results) { int dvec[tnum]; /* tnum column sums */ int m = 0; /* row sum for response "a" */ int n = 0; /* row sum for response "b" */ int i; int j; int a; int nb; int k; int sum; double delta; double deltaObserved = 0; int pval = 0; for(i = 0; i < tnum; i++){ m = m + Avec[i]; n = n + Bvec[i]; dvec[i] = Avec[i] + Bvec[i]; deltaObserved = deltaObserved + Avec[i] * Avec[i] / (double) dvec[i]; } results[0] = deltaObserved; results[1] = 0; /* uses R random number generator */ GetRNGstate(); for (i = 0; i < nsim; i++) { /* initializes variables */ nb = m + n; k = m; delta = 0; sum = 0; for (j = 0; j < tnum - 1; j++) { nb = nb - dvec[j]; /* function rhyper in Rmath.h: random generation for the hypergeometric distribution */ a = (int) rhyper(dvec[j], nb, k); delta = delta + a * a / (double) dvec[j]; sum = sum + a; k = k - a; } a = m - sum; delta = delta + a * a / (double) dvec[tnum - 1]; if(delta >= deltaObserved) pval += 1; if(getDist){ results[i+2] = delta; } } results[1] = (double) (pval) / (double) nsim; /* finishes using R random number generator */ PutRNGstate(); } /* wrapper function for function contingency2xtSim to enable R calls it */ void contingency2xtSim0(int *Avec, int *Bvec, int *tnum, int *nsim, int *getDist, double *ans){ contingency2xtSim(Avec, Bvec, *tnum, *nsim, *getDist, ans); } /*********************************** * End of 2*t Contingency Table * Last modified: 07.21.2015 ***********************************/ /*************************** * Project: k-Sample Kruskal-Wallis Test * Functions QNTestStat, QNinitvals, QNexact * based on KWexact by Angie Zhu * modified 06/11/2015 Fritz Scholz ***************************/ static int getQNdist; /* logical value to indicate whether generated QN values are recorded in QNvec */ static double *QNvec; /* initializes static variables */ void QNinitvals(int kk, double *xxvec, int *nns, int *iivec, double *tteststat, double *ppval, int nnsum, int ggetQNdist, double *QQNvec, int ddimst) { k = kk; xvec = xxvec; ns = nns; ivec = iivec; teststat = tteststat; pval = ppval; nsum = nnsum; getSmat = ggetQNdist; smat = QQNvec; count = 0; dimst = ddimst; } void QNinitvals1(int kk, double *xxvec, int *nns){ k = kk; xvec = xxvec; ns = nns; } /* computes the non-normalized k-sample rank score test statistics Arguments: QN: double array with length 1, stores the non-normalized QN test statistic k: integer, number of samples being compared rx: double array storing the concatenated average rank scores of the k samples in the same order as in ns ns: integer array storing the k sample sizes, corresponding to rx nsum: integer, number of all observations ns[0]+...+ns[k-1] Outputs: when the computation ends, the non-normalized QN statistic is stored in the given memory pointed to by QN */ void QNTestStat(double *QN, int k, double *rx, int *ns) { int i; int j; double Ri; int istart, iend; QN[0] = 0.0; istart = 0; for(i = 0; i < k; i++){ Ri = 0.0; iend = istart+ns[i]; for(j = istart; j < iend;j++){ Ri += rx[j]; } QN[0] += Ri * Ri / ns[i]; istart = iend; } QN[0] = round(1e8*QN[0])/1e8; /* this avoids computational quirks due to machine representation of numbers*/ } /*************************** * Project: k-Sample QN Test * Function QNpvalue * 06/11/2015 Fritz Scholz ***************************/ /* for random number generator in R */ /* estimates p-values for the observed k-sample Kruskal-Wallis test statistics. Arguments: pval: double array with length 1, storing the estimated p-value for the observed QN value Nsim: integer, number of simulations k: integer, number of samples being compared rx: double array storing the average rank scores of the concatenated samples in the same order as in ns ns: integer array, storing the k sample sizes, corresponding to rx useExact: integer, 0: not, 1: yes; indicates if the p-value will be computed via examining all possible combinations (this occurs when ncomb < Nsim, i.e., the total number of possible combinations is less than Nsim and the user chooses the exact approach, see R function getQNPVal for details) getQNdist: logical, to indicate whether the exact or simulated QNvec will be returned as part of the output ncomb: double, number of all possible combinations Outputs: when the computation ends, p-values of the observed, non-normalized QN is stored in the memory pointed at by pval and the distribution of the non-normalized QN values of all exact or simulated combinations is stored in array QNvec. The observed non-normalized QN is stored in memory pointed at by QNobs. */ void QNpvalue(double *pval, int Nsim, int k, double *rx, int *ns, int useExact, int getQNdist, double ncomb, double *QNobs, double *QNvec) { int i; int j; int Ri; int isim; int nsum = getSum(ns, k); /* total sample size = n_1 + ... + n_k */ int index; QNinitvals1(k,rx,ns); /* get observed test statistic for the average rank score vector rx in non-normalized form */ QNTestStat(QNobs, k, rx, ns); pval[0] = 0.0; /* uses R random number generator */ GetRNGstate(); if (useExact) { /* goes through all possible combinations */ int ivec[nsum]; int position[nsum]; for (i = 0; i < nsum; i++) { position[i] = i; } /* initializes static variables */ QNinitvals(k, rx, ns, ivec, QNobs, pval, nsum, getQNdist, QNvec,1); exactcomb(0,position,nsum,1,QNTestStat); /* gets exact p-values */ pval[0] = pval[0] / ncomb; } else { /* uses Nsim simulations to get p-value */ double randy; double temp; double QNsim[1]; double rc[nsum]; /* copy of rx */ for (isim = 0; isim < Nsim; isim++) { randPerm(nsum, rx, rc, ns); /* gets simulated QN */ QNTestStat(QNsim, k, rc, ns); /* compares simulated QN with observed one */ if (QNsim[0] >= QNobs[0]) { pval[0] = pval[0] + 1.0; } if (getQNdist) { QNvec[isim] = QNsim[0]; } } /* estimates p-values */ pval[0] = pval[0] / (double) Nsim; } /* finishes using R random number generator */ PutRNGstate(); } /*************************** * Project: k-Sample QN Test * Function QNtest * Fritz Scholz, last modified: 06.11.2015 ***************************/ /* wrapper function for function QNpvalue to enable R calls to it */ void QNtest(double *pval, int *Nsim, int *k, double *rx, int *ns, int *useExact, int *getQNdist, double *ncomb, double *QNobs, double *QNvec){ QNpvalue(pval,*Nsim,*k,rx,ns,*useExact,*getQNdist, *ncomb,QNobs,QNvec); } /**************************** * End of k-Sample QN Test ***************************/ /*************************** * Project: Steel Mutiple Wilcoxon Test * Functions SteelTestStat, Steelinitvals, * Steelinitvals1, Steelexact, Steelpvalue * based on KWexact by Angie Zhu * last modified: 06.11.2015 ***************************/ /* static variables */ static int k; /* number of samples */ static double *xvec; /* vector of averaged rank scores for all observations */ static int *ns; /* sample sizes for the k samples */ static int *ivec; /* this vector will hold the indices 0, 1, ..., k-1, indicating the sample associated with the respective positions. It indicates the final set of combinations (FWS) */ static double *teststat; /* observed Steel statistic (Steel.obs), not normalized */ static double *pval; /* number of cases where Steel.comb >= (or <=) Steel.obs */ static int nsum; /* total number of observations on all k samples */ static int alt; /* indicating the type & direction of the test statistic */ static double *mu; /* array of length k-1 for Wilcoxon means for standardization */ static double *tau; /* array of length k-1 for Wilcoxon std. devs. for standardization */ static int getSteeldist; /* logical value to indicate whether generated Steel values are recorded in Steelvec*/ static int count; /* keeps track of how many combinations have been visited */ /* initializes static variables */ void Steelinitvals(int kk, double *xxvec, int *nns, int *iivec, double *tteststat, double *ppval, int nnsum, int aalt, double *mmu, double *ttau, int ggetSteeldist, double *SSteelvec, int ddimst) { k = kk; xvec = xxvec; ns = nns; ivec = iivec; teststat = tteststat; pval = ppval; nsum = nnsum; alt = aalt; mu = mmu; tau = ttau; getSmat = ggetSteeldist; smat = SSteelvec; dimst = ddimst; count = 0; } void Steelinitvals1(int kk, double *xxvec, int *nns, int aalt, double *mmu, double *ttau){ k = kk; xvec = xxvec; ns = nns; alt = aalt; mu = mmu; tau = ttau; } /* computes the maximum/minimum/max-absolute pairwise standardized Wilcoxon test statistics when comparing k-1 samples against the same control sample. Arguments: Steel: double array with length 1, stores the standardized version of the Steel test statistic k: integer, number of samples being compared, including the control sample rx: double array storing the concatenated double midrank scores of the k samples in the same order as ns, where the first ns[0] are the controls ns: integer array storing the k sample sizes, corresponding to rx nsum: integer, number of all observations ns[0]+... +ns[k-1] alt: integer with values -1, 0 or 1, indicating which type of test statistic to compute. For alt = 1 it computes the maximum standardized Wilcoxon statistic when comparing each treatment sample with the control sample. Standardization is done by mu[i-1] and tau[i-1] as mean and standard deviation when dealing with treatment i. For alt = -1 it computes the minimum standardized Wilcoxon statistic when comparing each treatment sample with the control sample. For alt = 0 it computes the maximum absolute standardized Wilcoxon statistic when comparing each treatment sample with the control sample. mu: double array of length k-1, holding the means of the Wilcoxon (Mann-Whitney form) test statistics, i.e., mu[i] = ns[0]*ns[i+1]/2, i=0,...,k-2. tau: double array of length k-1, holding the standard deviations of the Wilcoxon (Mann-Whitney form) test statistics, in the randomization distribution conditioned on the tie pattern in all the data, as expressed in rx. Outputs: when the computation ends, the Steel statistic based on standardized Wilcoxon statistics is stored in the given memory pointed to by Steel */ void SteelTestStat(double *Steel, int k, double *rx, int *ns){ int i; int j; int m; double Ri; double maxR; int istart, iend; istart = ns[0]; Ri = 0.0; for(i = 1; i < k; i++){ iend = istart+ns[i]; for(j = istart; j < iend;j++){ for( m = 0; m < ns[0]; m++){ if(rx[m] <= rx[j]){ if(rx[m] == rx[j]){ Ri += .5; }else{ Ri += 1; } } } } Ri = (Ri-mu[i-1])/tau[i-1]; if(alt == 1){ if(i ==1){ Steel[0] = Ri;}else{ if(Ri > Steel[0]) Steel[0] = Ri; } } if(alt == -1){ if(i ==1){ Steel[0]= Ri;}else{ if(Ri < Steel[0]) Steel[0] = Ri; } } if(alt == 0){ Ri = fabs(Ri); if(i ==1){ Steel[0] = Ri;}else{ if(Ri > Steel[0]) Steel[0] = Ri; } } istart = iend; Ri = 0.0; } } /* estimates p-values for the observed Steel test statistic. Arguments: pval: double array with length 1, storing the estimated p-value for the observed Steel value Nsim: integer, number of simulations k: integer, number of samples being compared and control, (k-1) comparisons rx: double array storing the midranks of the concatenated samples in the same order as ns, the control corresponds to ns[0] ns: integer array, storing the k sample sizes, corresponding to rx useExact: integer, 0: not, 1: yes; indicates if the p-value will be computed via examining all possible combinations (this occurs when ncomb <= Nsim, i.e., the total number of possible combinations is <= Nsim and the user chooses the exact approach) getSteeldist: logical, to indicate whether the exact or simulated Steelvec will be returned as part of the output ncomb: double, number of all possible combinations alt: integer -1, 1, or 0 indicating which one-sided or two-sided statistic to use. alt = 1, use maximum standardized Wilcoxon statistics alt = -1, use minimum standardized Wilcoxon statistics alt = 0, use maximum absolute standardized Wilcoxon statistics Outputs: when the computation ends, p-values of the observed Steel statistic is stored in the memory pointed at by pval and the distribution of the standardized Steel statistic values of all exact or simulated combination splits is stored in array Steelvec. The observed standardized Steel statistic is stored in memory pointed at by Steelobs. */ void Steelpvalue(double *pval, int Nsim, int k, double *rx, int *ns, int useExact, int getSteeldist, double ncomb, int alt, double *mu, double *tau, double *Steelobs, double *Steelvec) { int i; int j; int Ri; int isim; int nsum = getSum(ns, k); /* total sample size = n_1 + ... + n_k */ int index; Steelinitvals1(k, rx, ns, alt, mu, tau); /* get observed test statistic for the average rank score vector rx in standardized form */ SteelTestStat(Steelobs, k, rx, ns); pval[0] = 0.0; /* uses R random number generator */ GetRNGstate(); if (useExact) { /* goes through all possible combinations */ int ivec[nsum]; int position[nsum]; for (i = 0; i < nsum; i++) { position[i] = i; } /* initializes static variables */ Steelinitvals(k, rx, ns, ivec, Steelobs, pval, nsum, alt, mu, tau, getSteeldist, Steelvec,1); exactcomb(0, position, nsum, 1, SteelTestStat ); /* gets exact p-values */ pval[0] = pval[0] / ncomb; } else { /* uses Nsim simulations to get p-value */ double randy; double temp; double Steelsim[1]; double rc[nsum]; /* copy of rx */ for (isim = 0; isim < Nsim; isim++) { /* gets random permutation rc of rx */ randPerm(nsum, rx, rc, ns); /* gets simulated Steel */ SteelTestStat(Steelsim, k, rc, ns); /* compares simulated Steel with observed one */ if(alt != -1){ if (Steelsim[0] >= Steelobs[0]) { pval[0] = pval[0] + 1.0; } }else{ if (Steelsim[0] <= Steelobs[0]) { pval[0] = pval[0] + 1.0; } } if (getSteeldist) { Steelvec[isim] = Steelsim[0]; } } /* estimates p-values */ pval[0] = pval[0] / (double) Nsim; } /* finishes using R random number generator */ PutRNGstate(); } /*************************************************************** * Project: Steel Multiple Wilcoxon Test * Function Steeltest.c * last modified: 06.11.2015 **************************************************************/ /* wrapper function for function Steelpvalue to enable R calls to it */ void Steeltest(double *pval, int *Nsim, int *k, double *rx, int *ns, int *useExact, int *getSteeldist, double *ncomb, int *alt, double *mu, double *tau, double *Steelobs, double *Steelvec){ Steelpvalue(pval,*Nsim,*k,rx,ns,*useExact,*getSteeldist, *ncomb,*alt,mu,tau,Steelobs,Steelvec); } /*************************************** * End of Steel Multiple Wilcoxon Test *****************************************/ /************************************************************* * Project: Steel Multiple Wilcoxon Test Confidence Intervals * Function SteelTestStatVec, SteelinitvalsVec, SteelexactVec, * SteelVec * 06/11/2015 *************************************************************/ /* computes the pairwise Wilcoxon test statistics when comparing k-1 treatment samples against the same control sample. Arguments: SteelVec: double array of length k-1, stores the Mann-Whitney statistics of the k-1 treatment samples when comparing each with the same control sample. k: integer, number of samples being compared, including the control sample rx: double array storing the concatenated double midrank scores of the k samples in the same order as ns, where the first ns[0] are the controls ns: integer array storing the k sample sizes, corresponding to rx nsum: integer, number of all observations ns[0]+... +ns[k-1] Outputs: when the computation ends, the k-1 Mann-Whitney statistics are stored in the given memory pointed to by SteelVec */ void SteelTestStatVec(double *SteelVec, int k, double *rx, int *ns){ int i; int j; int m; double Ri; int istart, iend; istart = ns[0]; Ri = 0.0; for(i = 1; i < k; i++){ iend = istart+ns[i]; for(j = istart; j < iend; j++){ for( m = 0; m < ns[0]; m++){ if(rx[m] <= rx[j]){ if(rx[m] == rx[j]){ Ri += .5; }else{ Ri += 1; } } } } SteelVec[i-1] = Ri; istart = iend; Ri = 0.0; } } /* The algorithm of generating all possible combinations using Chase's sequence is written by Donald Knuth (TAOCP V.4A, 7.2.1.3 Algorithm C) */ /* static variables */ static int k; /* number of samples */ static double *xvec; /* vector of averaged rank scores for all observations */ static int *ns; /* sample sizes for the k samples */ static int *ivec; /* this vector will hold the indices 0, 1, ..., k-1, indicating the sample associated with the respective positions. It indicates the final set of combinations (FWS) */ static int nsum; /* total number of observations on all k samples */ static double *MannWhitneyStats; /* holds the Mann-Whitney statistics for all visited combinations, stacked on top of each other in groups of k-1 */ static int count; /* keeps track of how many combinations have been visited */ /* initializes static variables */ void SteelinitvalsVec(int kk, double *xxvec, int *nns, int *iivec, int nnsum, double *MMannWhitneyStats) { k = kk; xvec = xxvec; ns = nns; ivec = iivec; nsum = nnsum; MannWhitneyStats = MMannWhitneyStats; count = 0; } /* uses recursive backtracking to find all possible ways/combinations to divide nsum elements into k subcollections, where the size of each subcollection is fixed; for each combination, the desired test statistics are computed */ int SteelexactVec(int now, int *position, int m) { int i; int ix; int ccount; int j; int k1 = k - 1; if (now == k - 1) { double rc[nsum]; double teststatscomb[k1]; double *pt; /* fills the remaining m=ns[k-1] positions of ivec with now = k-1 (FWS) */ for (i = 0; i < m; i++) { ivec[position[i]] = now; } /* here we equate the pointer pt with that of rc and by filling the associated array pt we also fill the array rc. (FWS) */ pt = rc; /* here we fill pt=rc first with all the xvec[j] which are supposed to belong to sample 0 according to ivec, then with those belonging to sample 1, and so on. This will give us the full permuted sample sequence (FWS) */ for (i = 0; i < k; i++) { for (j = 0; j < nsum; j++) { if (ivec[j] == i) { *pt = xvec[j]; pt++; } } } /* get test statistic for this combination rc */ SteelTestStatVec(teststatscomb, k, rc, ns); /* records the test statistics for each combination */ ccount = count * k1; for( ix = 0; ix < k1; ix++){ MannWhitneyStats[ccount+ix] = teststatscomb[ix]; } count++; return(2); /* this return gets us back to just beyond the point of the last previous call to SteelexactVec, to find the next combination at that stage of now (FWS) */ } else { /* Algorithm using Chase's sequence by Donald Knuth (TAOCP V.4A, 7.2.1.3 Algorithm C) */ int s = m - ns[now]; /* s represents the size of the remainder after the ns[now] sample values for the current combination have been chosen. The meaning of the variables a, w, r are pretty much explained in Knuth, p. 367. In particular, a[i] = 1 means that the element with index i is designated as part of the chosen combination. (FWS) */ int r; int *a; int *w; int newposition[s]; /* this newposition array is meant to replace the position array in the recursive call SteelexactVec, to get to the next combination of the k combinations to be chosen. It is set up below, right after while (1) {.... (FWS) */ int *tmp; /* this pointer is equated to the pointer of the newposition array and is used to fill that array. (FWS) */ /* initializes variables */ /* this is the start of the C1 step in Knuth's algorithm C (FWS) */ a = imalloc(m); w = imalloc(m + 1); for (j = 0; j < s; j++) { a[j] = 0; w[j] = 1; } for (j = s; j < m; j++) { a[j] = w[j] = 1; } w[m] = 1; if (s > 0) { r = s; } else { r = ns[now]; } /* this is the end of the C1 step in Knuth's algorithm C (FWS) */ j = r; /* the setup of this function assures that j != m at this point since ns[now] > 0 and ns[now] != m */ while (1) { /* visits current combination */ /* here we equate the pointers tmp and newposition and by filling tmp we fill newposition. (FWS) */ tmp = newposition; /* If indicated by a[i]=1 (relative to the current position array and w.r.t. the array a in that context), we fill ivec at index position[i] with the sample index now, that is under discussion here. All other position indices are collected inside the array newposition, by assignment via tmp. It amounts to splitting the m position elements into two groups of size ns[now] (the chosen combination for the now sample) and s = m-ns[now], the remainder. (FWS) */ for (i = 0; i < m; i++) { if (a[i]) { ivec[position[i]] = now; } else { *tmp = position[i]; tmp++; } } /* recursive function call */ /* to get the next combination, as indicated by now+1, using the residual position vector newposition, but when understanding what happens to it, that newposition vector is referred to as position inside the algorithm SteelexactVec. (FWS) */ SteelexactVec(now + 1, newposition, s); /* finds j and branches */ j = r; while(w[j] == 0) { w[j] = 1; j++; } /* Here we find out whether we have encountered the last combination already, and whether we should step back prior to the last invocation of SteelexactVec, possibly leading to further stepping back, until there is no more stepping back, i.e., we have traversed all combination splits. If we do not terminate here, we generate the next step in the array generation, according to Knuth's C2-C7. (FWS) */ if (j == m) { /* terminate point of this algorithm */ return(1); } else { w[j] = 0; } if (a[j] == 1) { if (j % 2 == 0 && a[j-2] == 0) { a[j-2] = 1; a[j] = 0; if (r == j) { if (j - 2 > 1) { r = j - 2; } else { r = 1; } } else if (r == j - 2) { r = j - 1; } } else { a[j-1] = 1; a[j] = 0; if (r == j && j > 1) { r = j - 1; } else if (r == j - 1) { r = j; } } } else { /* a[j] == 0 */ if (j % 2 == 1 && a[j-1] == 0) { a[j] = 1; a[j-2] = 0; if (r == j - 2) { r = j; } else if (r == j - 1) { r = j - 2; } } else { a[j] = 1; a[j-1] = 0; if (r == j && j > 1) { r = j - 1; } else if (r == j - 1) { r = j; } } } } /* This return gets us back to just past the last invocation of SteelexactVec. We either arrive at now = k-1 or need to split off further combinations as needed. (FWS) */ return(0); } } /* estimates p-values for the observed Steel test statistic. Arguments: Nsim: integer, number of simulations k: integer, number of samples being compared and control, (k-1) comparisons rx: double array storing the midranks of the concatenated samples in the same order as ns, the control corresponds to ns[0] ns: integer array, storing the k sample sizes, corresponding to rx useExact: integer, 0: not, 1: yes; 1 indicates that full enumeration of all ncomb combination splits is used in getting the Mann-Whitney statistics. This should occur only when ncomb <= Nsim. Otherwise Nsim random combination splits are used. Outputs: when the computation ends, the double array MannWhitneyStats will contain in stacked form the enumerated or simulated Mann-Whitney statistics in groups of k-1. It will have length ncomb*(k-1) or Nsim*(k-1). */ void SteelVec(int Nsim, int k, double *rx, int *ns, int useExact, double *MannWhitneyStats) { int i; int ix; int ccount; int j; int Ri; int isim; int k1=k-1; int nsum = getSum(ns, k); /* total sample size = n_1 + ... + n_k */ int index; /* uses R random number generator */ GetRNGstate(); if (useExact) { /* goes through all possible combinations */ int ivec[nsum]; int position[nsum]; for (i = 0; i < nsum; i++) { position[i] = i; } /* initializes static variables */ SteelinitvalsVec(k, rx, ns, ivec, nsum, MannWhitneyStats); SteelexactVec(0, position, nsum); } else { /* uses Nsim simulations to get p-value */ double randy; double temp; double Steelsim[k1]; double rc[nsum]; /* copy of rx */ for (isim = 0; isim < Nsim; isim++) { randPerm(nsum, rx, rc, ns); /* gets simulated Steel */ SteelTestStatVec(Steelsim, k, rc, ns); ccount = k1*isim; for(ix=0; ix= k; i--){ values[i+1] = values[i]; probs[i+1] = probs[i]; } } values[k] = xnew; probs[k] = pnew; *Lt = *Lt+1; } void convaddtotable(double xnew, double pnew, int *Lt, int M, double *values, double *probs){ /* this function adds a value xnew to a table of ordered values[0] < ... < values[*Lt-1], increasing *Lt by 1 if a new value is inserted at values[k] with probs[k] set to pnew, shifting all other values and probs for indices >= k by one. If the value is not new, then only the corresponding probs value is incremented by pnew. It requires that *Lt < M. */ int k1, k2, kk, k; if(*Lt > 2){ k1 = 0; k2 = *Lt-1; if(xnew < values[k1]){ k = 0; insertxp(xnew,pnew,k,Lt,values,probs); }else if(xnew > values[k2]){ k = *Lt; insertxp(xnew,pnew,k,Lt,values,probs); }else if(xnew == values[k2]){ probs[k2] += pnew; }else{ while(k1+1 < k2){ kk = (int) floor((double)(k2- k1)/2) +k1; if(xnew < values[kk]){ k2 = kk; } else { k1 = kk; } } if(xnew == values[k1]){ probs[k1] += pnew; }else{ k = k2; insertxp(xnew,pnew,k,Lt,values,probs); } } }else if(*Lt <= 0){ values[0] = xnew; probs[0] = pnew; *Lt = 1; }else if (*Lt ==1){ if(xnew < values[0]){ k = 0; insertxp(xnew,pnew, k,Lt,values,probs); }else if(xnew > values[0]){ k = 1; insertxp(xnew,pnew,k,Lt,values,probs); }else if(xnew == values[0]){ probs[0] += pnew; } }else if(*Lt == 2){ if(xnew < values[0]){ k = 0; insertxp(xnew,pnew,k,Lt,values,probs); }else if(xnew > values[1]){ k = *Lt; insertxp(xnew,pnew,k,Lt,values,probs); }else if(xnew == values[0]){ probs[0] += pnew; }else if(xnew == values[1]){ probs[1] += pnew; }else{ k = 1; insertxp(xnew,pnew,k,Lt,values,probs); } } } /* convolutes two distributions given by supports x1 and x2 and corresponding probabilities p1 and p2, returning x and p as the resulting distribution */ void conv(double *x1, double *p1, int *n1, double *x2, double *p2, int *n2, double *x, double *p, int *n) { int i,j, M; double xij, pij; M = n1[0]*n2[0]; n[0] = 0; for(i=0; i < n1[0]; i++){ for(j=0; j < n2[0]; j++){ xij = x1[i]+x2[j]; pij = p1[i]*p2[j]; xij = round(1e8*xij)/1e8; convaddtotable(xij,pij, n, M, x, p); } } } /* convolutes two vectors x1 and x2 of respective lengths n1 and and n2 and produces the vector of length n = n1*n2 of all possible sums x1[i]+x2[j] */ void convvec(double *x1, int *n1, double *x2, int *n2, double *x, int *n) { int i,j, M; double xij, pij; n[0] = 0; for(i=0; i < n1[0]; i++){ for(j=0; j < n2[0]; j++){ x[j+i*n2[0]] = x1[i]+x2[j]; } } } void Harding(int k, int L1, int *nn, int *nvec, double *freq){ int L, M, i, ii, m, n, P, Q, t, u, s; double mnm ; L = L1-1; M = L/2; freq[0] = 1; for( i = 1; i < L1; i++ ){ freq[i] = 0; } for( i = 1; i <= (k-1); i++ ){ m = nvec[i-1] - nvec[i]; n = nvec[i]; if( n+1 <= M){ P = m + n; if( M < P ) P = M; for( t = n+1; t <= P; t++){ for( u = M; u >= t; u--){ freq[u] = freq[u] - freq[u-t]; } } } Q = M; if(m < M) Q = m; for(s = 1; s <= Q; s++){ for(u = s; u <= M; u++){ freq[u] = freq[u] + freq[u-s]; } } mnm = (double) choose(m+n,m); for(ii=0; ii < L1; ii++){ freq[ii] = freq[ii]/mnm; } } if( L % 2 == 0 ){ for( i = 1; i <= M; i++) freq[M + i] = freq[M - i]; } else { for( i = 1; i <= (M+1); i++) freq[M+i] = freq[M+1-i]; } } /* wrapper function for function Harding to enable R calls to it */ void Harding0(int *k, int *L1, int *nn, int *nvec, double *freq){ Harding(*k, *L1, nn, nvec, freq); } /*************************** * Project: k-Sample Jonckhere-Terpstra Test * Functions JTTestStat, JTinitvals, JTexact * based on KWexact by Angie Zhu * modified 08/25/2015 Fritz Scholz ***************************/ static int getJTdist; /* logical value to indicate whether generated JT values are recorded in JTvec */ static double *JTvec; /* initializes static variables */ void JTinitvals(int kk, double *xxvec, int *nns, int *iivec, double *tteststat, double *ppval, int nnsum, int ggetJTdist, double *JJTvec, int ddimst) { k = kk; xvec = xxvec; ns = nns; ivec = iivec; teststat = tteststat; pval = ppval; nsum = nnsum; getSmat = ggetJTdist; smat = JJTvec; count = 0; dimst = ddimst; } void JTinitvals1(int kk, double *xxvec, int *nns){ k = kk; xvec = xxvec; ns = nns; } /* computes the non-normalized k-sample rank score test statistics Arguments: JT: double array with length 1, stores the JT test statistic k: integer, number of samples being compared rx: double array storing the concatenated scores of the k samples in the same order as in ns ns: integer array storing the k sample sizes, corresponding to rx nsum: integer, number of all observations ns[0]+...+ns[k-1] Outputs: when the computation ends, the JT statistic is stored in the given memory pointed to by JT */ void JTTestStat(double *JT, int k, double *rx, int *ns) { int i, j; int m, n; int mstart, mend, nstart, nend; mstart = 0.0; JT[0] = 0.0; for(i = 0; i < k-1; i++){ mend = mstart + ns[i]; nstart = mend; for(j = i+1; j < k; j++){ nend = nstart+ns[j]; for(n = nstart; n < nend; n++){ for( m = mstart; m < mend; m++){ if(rx[m] <= rx[n]){ if(rx[m] == rx[n]){ JT[0] += .5; }else{ JT[0] += 1; } } } } nstart = nend; } mstart = mend; } } /* wrapper function for function JTTestStat to enable R calls to it */ void JTTestStat0(double *JT, int *k, double *rx, int *ns){ JTTestStat(JT, *k, rx, ns); } /*************************** * Project: k-Sample JT Test * Function JTpvalue * 08/25/2015 Fritz Scholz ***************************/ /* for random number generator in R */ /* estimates p-values for the observed k-sample Jonckheere-Terpstra test statistics. Arguments: pval: double array with length 1, storing the estimated p-value for the observed JT value Nsim: integer, number of simulations k: integer, number of samples being compared rx: double array storing the scores of the concatenated samples in the same order as in ns ns: integer array, storing the k sample sizes, corresponding to rx useExact: integer, 0: not, 1: yes; indicates if the p-value will be computed via examining all possible combinations (this occurs when ncomb < Nsim, i.e., the total number of possible combinations is less than Nsim and the user chooses the exact approach, see R function getJTPVal for details) getJTdist: logical, to indicate whether the exact or simulated JTvec will be returned as part of the output ncomb: double, number of all possible combinations Outputs: when the computation ends, the p-value of the observed, JT is stored in the memory pointed at by pval and the distribution of the JT values of all exact or simulated combinations is stored in array JTvec, provided getJTdist is not 0. The observed JT is stored in memory pointed at by JTobs. */ void JTpvalue(double *pval, int Nsim, int k, double *rx, int *ns, int useExact, int getJTdist, double ncomb, double *JTobs, double *JTvec) { int i; int j; int Ri; int isim; int nsum = getSum(ns, k); /* total sample size = n_1 + ... + n_k */ int index; JTinitvals1(k,rx,ns); /* get observed test statistic for the score vector rx */ JTTestStat(JTobs, k, rx, ns); pval[0] = 0.0; /* uses R random number generator */ GetRNGstate(); if (useExact) { /* goes through all possible combinations */ int ivec[nsum]; int position[nsum]; for (i = 0; i < nsum; i++) { position[i] = i; } /* initializes static variables */ JTinitvals(k, rx, ns, ivec, JTobs, pval, nsum, getJTdist, JTvec,1); exactcomb(0,position,nsum,1,JTTestStat); /* gets exact p-values */ pval[0] = pval[0] / ncomb; } else { /* uses Nsim simulations to get p-value */ double randy; double temp; double JTsim[1]; double rc[nsum]; /* copy of rx */ for (isim = 0; isim < Nsim; isim++) { randPerm(nsum, rx, rc, ns); /* gets simulated JT */ JTTestStat(JTsim, k, rc, ns); /* compares simulated JT with observed one */ if (JTsim[0] >= JTobs[0]) { pval[0] = pval[0] + 1.0; } if (getJTdist) { JTvec[isim] = JTsim[0]; } } /* estimates p-values */ pval[0] = pval[0] / (double) Nsim; } /* finishes using R random number generator */ PutRNGstate(); } /*************************** * Project: k-Sample JT Test * Function JTtest * Fritz Scholz, last modified: 08.26.2015 ***************************/ /* wrapper function for function JTpvalue to enable R calls to it */ void JTtest(double *pval, int *Nsim, int *k, double *rx, int *ns, int *useExact, int *getJTdist, double *ncomb, double *JTobs, double *JTvec){ JTpvalue(pval,*Nsim,*k,rx,ns,*useExact,*getJTdist, *ncomb,JTobs,JTvec); } /**************************** * End of k-Sample JT Test ***************************/ void randPerm(int nsum, double *rx, double *rc, int *ns){ /* Prior to using function randPerm, need to execute GetRNGstate(). After being done with all usages of randPerm during a simulation run, need to execute PutRNGstate() */ double randy; double temp; int j, index; mcopy(rx, rc, nsum, 1); /* gets a copy of rx */ /* generate a random permutation of rc by randomly interchanging values on positions nsum - 1, nsum - 2, ..., ns[0] (C uses 0-based indexing; elements 0, ... , ns[0] - 1 all belong to the first sample; for details of this algorithm, see "Simulation" by Sheldon M. Ross, e.g. 4b, p.51-52.) */ for (j = nsum; j > ns[0]; j--) { randy = runif(0, 1); while(1 <= randy ) { /* to eliminate rare event randy = 1 */ randy = runif(0, 1); } /* index is an random integer between 0 and j-1 (discrete uniform) */ index = (int) floor(randy * (double) (j)); /* interchanges the values at positions j-1 and index */ temp = rc[j-1]; rc[j-1] = rc[index]; rc[index] = temp; } } static const R_CMethodDef cMethods[] = { {"convvec", (DL_FUNC) &convvec, 6}, {"adkTestStat0", (DL_FUNC) &adkTestStat0, 6}, {"adkPVal0", (DL_FUNC) &adkPVal0, 11}, {"contingency2xtExact0", (DL_FUNC) &contingency2xtExact0, 6}, {"contingency2xtSim0", (DL_FUNC) &contingency2xtSim0, 6}, {"convC", (DL_FUNC) &conv, 9}, {"Harding0", (DL_FUNC) &Harding0, 5}, {"JTtest", (DL_FUNC) &JTtest, 10}, {"QNtest", (DL_FUNC) &QNtest, 10}, {"Steeltest", (DL_FUNC) &Steeltest, 13}, {"SteelConf", (DL_FUNC) &SteelConf, 6}, {NULL, NULL, 0} }; void R_init_kSamples(DllInfo *info) { R_registerRoutines(info, cMethods, NULL, NULL, NULL); R_useDynamicSymbols(info, TRUE); } kSamples/src/myfuns.h0000644000176200001440000000733013470613013014313 0ustar liggesusers/* functions prototypes */ double *dmalloc(unsigned long n); int getCount(double z, double *dat, int n); int getSum(int *x, int n); int *imalloc(unsigned long n); void mcopy(double *x, double *copy, int n, int m); void imcopy(int *x, int *copy, int n, int m); double **pdmalloc(unsigned long n); int exactcomb(int now, int *position, int m, int dimst, void(*testStatFun)(double *teststat, int k, double *x, int *ns)); void initvals(int kk, double *xx, int *nns, double *zzstar, int ll, int *iivec, double *tteststat, double *ppval, int nnsum, int nncomb, int ggetSmat, double *ssmat, int ddimst); void initvals1(int kk, double *xx, int *nns, double *zzstar, int LL); int runCount(void); void adkTestStat(double *adk, int k, double *x, int *ns); void adkPVal(double *pval, int Nsim, int k, double *x, int *ns, double *zstar, int l, int useExact, int getA2mat, double ncomb, double *a2mat); void contingency2xtExact(int *Avec, int *Bvec, int tnum, int ncomb, int getDist, double *results); void contingency2xtSim(int *Avec, int *Bvec, int tnum, int nsim, int getDist, double *results); void QNraw(double *QN, int k, double *rx, int *ns); void QNinitvals(int kk, double *rrvec, int *nns, int *iivec, double *tteststat, double *ppval, int nnsum, int ggetQNdist, double *QQNvec, int ddimst); void QNinitvals1(int kk, double *rrvec, int *nns); // int QNexact(int now, int *position, int m); void QNpvalue(double *pval, int Nsim, int k, double *rx, int *ns, int useExact, int getQNdist, double ncomb, double *QNobs, double *QNvec); void SteelTestStat(double *Steel, int k, double *rx, int *ns); // int alt, double *mu, double *tau); void Steelinitvals(int kk, double *rrvec, int *nns, int *iivec, double *tteststat, double *ppval, int nnsum, int aalt, double *mmu, double *ttau, int ggetSteeldist, double *SSteelvec, int ddimst); void Steelinitvals1(int kk, double *xxvec, int *nns, int aalt, double *mmu, double *ttau); int Steelexact(int now, int *position, int m); void Steelpvalue(double *pval, int Nsim, int k, double *rx, int *ns, int useExact, int getSteeldist, double ncomb, int alt, double *mu, double *tau, double *Steelobs, double *Steelvec); void SteelVec(int Nsim, int k, double *rx, int *ns, int useExact, double *MannWhitneyStats); void SteelrawVec(double *SteelVec, int k, double *rx, int *ns); int SteelexactVec(int now, int *position, int m); void SteelinitvalsVec(int k, double *rvec, int *ns, int *ivec, int nsum, double *MMannWhitneyStats); void insertxp(double xnew, double pnew, int k, int *Lt, double *values, double *probs); void convaddtotable(double xnew, double pnew, int *Lt, int M, double *values, double *probs); void conv(double *x1, double *p1, int *n1, double *x2, double *p2, int *n2, double *x, double *p, int *n); void convvec(double *x1, int *n1, double *x2, int *n2, double *x, int *n); void Harding(int k, int L1, int *nn, int *nvec, double *freq); void JTinitvals(int kk, double *xxvec, int *nns, int *iivec, double *tteststat, double *ppval, int nnsum, int ggetJTdist, double *JJTvec, int ddimst); void JTinitvals1(int kk, double *xxvec, int *nns); void JTTestStat(double *JT, int k, double *rx, int *ns); void JTpvalue(double *pval, int Nsim, int k, double *rx, int *ns, int useExact, int getJTdist, double ncomb, double *JTobs, double *JTvec); void randPerm(int nsum, double *rx, double *rc, int *ns); kSamples/NAMESPACE0000644000176200001440000000075413120314014013243 0ustar liggesusersuseDynLib("kSamples", .registration = TRUE) export(ad.pval,ad.test,ad.test.combined,contingency2xt,contingency2xt.comb) export(conv,pp.kSamples,qn.test,qn.test.combined) export(Steel.test,SteelConfInt,djt,pjt,qjt,jt.test) S3method(print,kSamples) importFrom("SuppDists","normOrder") importFrom("graphics","abline","par","plot","text") importFrom("methods","is") importFrom("stats","dnorm","integrate","model.response","pchisq", "pnorm","predict","qnorm","smooth.spline","uniroot","var") kSamples/data/0000755000176200001440000000000013452475503012752 5ustar liggesuserskSamples/data/ShorelineFireEMS.rda0000644000176200001440000000355612005352424016543 0ustar liggesusers‹íÝënÜDði×P >ð¡BªBA[(ˆÛ)´¡\ %I¡ÜÝ–„l’©¼Ày€÷8â'¶ÙÿLñÉNÆc/»K¥ä·sñØ©'»¶çeóòÍsOÝ|ʳfÖž9eÖ™¾\_›~;eÖÍ“SOoíý~çprogcrçʵ­i×gúÏ\Ÿ~==ýz•ÌìŸó*Ïü~¿Fû&Ê›2såm”o |Cæû Aöˆ~ß²G©zÇu“çûý¿‡?b»Ÿ”?Ã_”·0ÎílæÔßÉ!Ú÷LEsôÿüÈŸ„Âà>1RO|Îãóð < _‚ qÕKÊ ôÛ@ù”?áù^Eû§Z®ú™Qò|?7y¾_˜€èwÍäª_\õ+ãí×M@Š”*õ´ïo^É#{”vnššRM9Riæ–i)EÊ+íÜ6©¡Ü±RÏ&‘Ô³P†ñÓ³”Xn©Ôó[3”XŽTšé®û–É#Q†õ;Ó³Ô³œX‡ß›ÄRÇrOJZ0‰¥ŽäŽ”8íóäRK9±2_ýÜ'ZJ,'RÒjŸ‡5–"åH¥™úù^r) 7TÚ™›žÌ:’™'’;¶¨©´´Lã-0 HódEM%±åÉÞ65Í"¥Hsì±PŠGüœný¥®Y@ªiî‘•…RY¦ñW³HÉcî‘EYÎwÇ(³†R@X,OvDZ‹ýí²ãìÊÌߌ2ó˜C»]YU­W>4Sâ¸öX‰q'¤Dûíû¤Äþfes´Núø_¶´òµÎÒ·ë[—õy¦¦<ëÑ®ó¾ _Q¾ª|]ù<ÏC»nü–ò"|¾߃ïÃxæÇ(ŒòeåSÕ®?ûtëÒ!¹žv;Z®zlÜ'ÇZ7oªw½=•ÜÎàú~S¹[ƒñZ>Yoê÷±ŸÔ`»ì÷ã¢ÿ¡­/«ÞÍ”÷î¬ÝåK?ú¡ð¿‰ åQíé–¦ÊãÖê¼í¾äñe¥Îç~ ¾Ïy´ùÙïÂà‡ðÏ´yÚ—áÔo%ê}yÖWQ¯ó£½yÒ\O›Ê3î:?8u~oë<_>ÙTyµ×M¤”HŽTêéÍÛÔR@ŽTâì=ß{RÚ9ºü5n©,¦ ŸÆ=+ií,ÿi(©cy eX&ÿ‰{RÒºty35o)GZÔT–Íl7jÍzïÖÝ?¦/N?hAåúÖö›¾~ëü^_øÏë7§þmÌÚ³GÛTšuwâï@ûA†áèwÁ–º?¨ƒþöÛ ï~ñs%Úõ±ý ñ^ `û[Ø^_0¹ LŒçn0Ðno¤Ü'Êîƒ-c;û@Î=XÅ8î7êÝÚÝ‚ÚÝ‚!Úí¾ ±8.ˆ«ÚÀ%¸eÀlà› ”;kì¿çéK:Ìgõ.ðÚ_ ý‘ìy´çϾºóhËh·wDY@´» Ô» Œsì‚ÇÌ×]ˆa<ïüÈ”ØÞÝ  ¿»¡Áq¸y#U݈£¿}€à}°v=¯ÜBêÝŒã»öÍ÷` ýÜû ú»í¶Çï4¨ª^¨Óv¾ØyaßOl`²=ædg0kÿ˜VŸ)•1ÖÐþØ•k[Û_m£ôøÆäþ΃âÑæïÆ4Õ‹kSamples/R/0000755000176200001440000000000013470613013012230 5ustar liggesuserskSamples/R/djt.R0000644000176200001440000000056012606574772013157 0ustar liggesusersdjt <- function (x, nn) { dist <- Harding(nn) if (is.nan(dist[1])) stop("can't compute due to numerical instability\n") nn <- sort(as.integer(nn)) nvec <- rev(cumsum(rev(nn))) k <- length(nn) L <- sum(nn[1:(k - 1)] * nvec[2:k]) pos <- match(x, 0:L) d <- numeric(length(x)) d[!is.na(pos)] <- dist[pos[!is.na(pos)]] d } kSamples/R/test.list.R0000644000176200001440000000023212575371442014315 0ustar liggesuserstest.list <- function(x){ sel <- unlist(lapply(x,length)) x <- x[sel > 1] if(length(x) <= 1) stop("need more than one block with at least 2 samples") x } kSamples/R/contingency2xt.comb.R0000644000176200001440000002155112536365334016270 0ustar liggesuserscontingency2xt.comb <- function (..., method=c("asymptotic","simulated","exact"), dist=FALSE,Nsim=10000) { ################################################################# # This function contingency2xt.comb combines several 2 x t # contingency table analyses over M blocks (possibly different t # across blocks) by adding the Kruskal-Wallis test criteria for # each of the blocks. # This follows the same pattern as generally combinging # Kruskal-Wallis tests across blocks as suggested in # Lehmann, E.L. (2006), Nonparametrics, Statistical Methods Based # on Ranks, Ch. 6, Sec. 5D. # See also the documentation of contingency2xt for the analysis # of a single 2 x t table of counts. # This combined version tests the hypothesis that all the # hypotheses underlying the individual contingency tables are # true simultaneously and is relevant when randomizations or # samples are independent from block to block # # Input: ... # can take the form of several lists, # say L.1,...,L.M, where list L.i contains # 2 sample vectors of length t > 1 each, but the # t may vary from list to list, # # or a single list of such lists. # # method # can take one of three values "asymptotic", "simulated", # and "exact", # which determines the mode of P-value calculation. # The asymptotic P-value, based on the chi-square # approximation, is always returned. # The simulated P-value simulates counts for the tables # conditioned on the observed marginal totals # (see contingency2xt), doing this independently across # tables, and computing the corresponding # Kruskal-Wallis statistics KW.i, for i = 1,...,M. # Adding the KW.i yields the combined statistic # KW.combined. # This is repeated Nsim times and the simulated P-value is # the proportion of these values that are >= the observed # combined value. # The exact P-value should only be attempted for small M # and small marginal totals and requires that Nsim be set # to >= the total number of KW.combined enumerations. # Otherwise Nsim simulations are run to get a simulated # P-value, as described above. # As example consider: M=2 with t.1 = 3 columns in the # first table and row totals m.1 = 40, n.1[2] = 60, while # the second table has t.2 = 4 columns, with row totals # m.2 = 30 and n.2 = 25. # Then we would have choose(40+2,2) = 861 possible counts # for table 1 # and choose(30+3,3) = 5456 counts for table 2, thus # 861*5456 = 4697616 possible count configurations for both # tables jointly. Thus one should choose Nsim >= 4697616. # However, the ultimate distributions KW.combined may have # far fewer unique values. # # dist # takes values FALSE (default) or TRUE, where TRUE enables # the return of the simulated or exact distributions of # KW.combined. Otherwise NULL is returned for both versions # # Nsim # = 10000 (default), number of simulations as discussed # above. # # # # An example: # contingency2xt.comb(list(c(15,12,25),c(12,5,7)), # list(c(12,6,4),c(6,12,3)), # method="exact",dist=F, Nsim=1e6) # produces the outcome below. ################################################################# # # Combined Kruskal-Wallis Tests for 2 x t Contingency Tables # # for data set 1 we get # observed KW asympt. P-value exact P-Value # 3.4539090 0.1778252 0.1786888 # # for data set 2 we get # observed KW asympt. P-value exact P-Value # 4.0259740 0.1335890 0.1673558 # # Combined Criterion: KW.combined = KW.1+KW.2 # # KW.combined asympt. P-value exact P-Value # 7.4798830 0.1125996 0.1099804 # # ################################################################# # For # out <- contingency2xt.comb(list(c(15,12,25),c(12,5,7)), # list(c(12,6,4),c(6,12,3)), # method="exact",dist=F, Nsim=1e6) # we get the object out of class ksamples with the following # components # names(out) # [1] "test.name" "t" "M" "kw.list" "kw.c" "null.dist" # [7] "method" "Nsim" # where # test.name = "Combined 2 x t Contingency Tables" # t = vector giving the number of columns for each table # M = number of tables # kw.list = list of M vectors holding the results for each of # the tests corresponding to the M blocks # kw.c = 2 (or 3) vector containing the observed KW.combined, # asymptotic P-value, (and simulated or exact P-value) # for the combined test. # null.dist = L x 2 matrix (if dist = TRUE), with first # column holding the unique, simulated or fully # numerated KW statistics, and the second column # holding the corresponding relative frequencies # or probabilities. # If dist = FALSE we get null.dist = NULL. # method one of the following values: "asymptotic", # "simulated", "exact" as it was ultimately used. # Nsim number of simulations used, when applicable. # # Fritz Scholz, April 2012 ################################################################# # the following converts individual data set lists into a list of # such, if not already in this form. if(length(list(...)) == 1) { if(is.list(...) & is.list(...[[1]])){ data.sets <- list(...)[[1]]}else{ stop("you need more than 1 block of data sets\n") } }else { data.sets <- list(...) } # end of data.sets list conversion method <- match.arg(method) M <- length(data.sets) # number of data sets if(M < 2) stop("To combine test results you must have at least two data sets.") tvec <- numeric(M) ncomb <- 1 for(i in 1:M){ n.sample <- sapply(data.sets[[i]], length) if(n.sample[1] != n.sample[2]) stop("Not all count vectors have the same length.") tvec[i] <- n.sample[1] m <- sum(data.sets[[i]][[1]]) ncomb <- ncomb * choose(m+tvec[i] - 1,m) } if(ncomb > Nsim & method == "exact") method <- "simulated" # Initializing output objects KWobs <- 0 null.dist <- NULL kw.list <- list() if(method == "asymptotic"){ dist0 <- FALSE for(i in 1:M){ out <- contingency2xt(data.sets[[i]][[1]],data.sets[[i]][[2]], method=method,dist=dist0,tab0=FALSE,Nsim=Nsim) kw.list[[i]] <- out$KW.cont KWobs <- KWobs + out$KW.cont[1] # aggregates combined KW stats } } # the following loops aggregate the (estimated or exact) # convolution distribution of the combined KW statistics if(method == "simulated"){ dist0 <- TRUE for(i in 1:M){ out <- contingency2xt(data.sets[[i]][[1]],data.sets[[i]][[2]], method=method,dist=dist0,tab0=FALSE,Nsim=Nsim) if(i == 1){ null.dist <- out$null.dist }else{ null.dist <- null.dist + out$null.dist } kw.list[[i]] <- out$KW.cont KWobs <- KWobs + out$KW.cont[1] # aggregates combined KW stats } } if(method == "exact"){ dist0 <- TRUE for(i in 1:M){ out <- contingency2xt( data.sets[[i]][[1]],data.sets[[i]][[2]], method=method,dist=dist0,Nsim=Nsim) if(i == 1){ null.dist <- out$null.dist }else{ null.dist <- conv(null.dist[,1],null.dist[,2], out$null.dist[,1],out$null.dist[,2]) } kw.list[[i]] <- out$KW.cont KWobs <- KWobs + out$KW.cont[1] # aggregates combined KW stats } } # get exact or simulated P-value if(method == "simulated"){ pval <- sum(null.dist >= KWobs)/Nsim } if(method == "exact"){ pval <- sum(null.dist[null.dist[,1] >= KWobs,2]) } # get asymptotic P-value pval.asympt <- 1-pchisq(KWobs, sum(tvec) - M) if(method=="asymptotic"){ kw.c <- c(KWobs,pval.asympt) }else{ kw.c <- c(KWobs,pval.asympt,pval) } if(method=="asymptotic"){ names(kw.c) <- c("KW.combined"," asympt. P-value") } if(method=="exact"){ names(kw.c) <- c("KW.combined"," asympt. P-value","exact P-Value") } if(method=="simulated"){ names(kw.c) <- c("KW.combined"," asympt. P-value","sim. P-Value") } if(dist==FALSE){ null.dist <- NULL } if(method== "simulated" & dist==TRUE){ out <- table(round(null.dist,6)) null.dist <- cbind(as.numeric(names(out)),as.numeric(out)/Nsim) dimnames(null.dist) <- list(NULL,c("KW","prob")) } object <- list(test.name =paste( "Combined 2 x t Contingency Tables"), t = tvec, M = M, kw.list = kw.list, kw.c = kw.c, null.dist = null.dist, method = method, Nsim = Nsim) class(object) <- "kSamples" object } kSamples/R/listmake.R0000644000176200001440000000052413470601047014171 0ustar liggesuserslistmake <- function (y=1:10,g=as.factor(c(1,1,1,2,2,2,2,2,2,3)),b=as.factor(c(1,1,1,1,3,2,2,2,3,1))) { data.sets <- lapply(levels(b), function(blv){ gg <- as.factor(as.numeric(unlist(g[b==blv]))) lapply(levels(gg), function(glv){ return(y[b == blv & g == glv]) } ) } ) data.sets } kSamples/R/qjt.R0000644000176200001440000000063212606574616013171 0ustar liggesusersqjt <- function (p, nn) { dist <- Harding(nn) if (is.nan(dist[1])) stop("can't compute due to numerical instability\n") nn <- sort(as.integer(nn)) nvec <- rev(cumsum(rev(nn))) k <- length(nn) L <- sum(nn[1:(k - 1)] * nvec[2:k]) dist <- c(0, cumsum(dist)) x <- c(-Inf, 0:L) out <- findInterval(p, dist) out[p %in% dist] <- out[p %in% dist] - 1 x[out + 1] } kSamples/R/na.remove.R0000644000176200001440000000071312607276530014260 0ustar liggesusersna.remove <- function(x){ # # This function removes NAs from a list and counts the total # number of NAs in na.total. # Returned is a list with the cleaned list x.new and with # the count na.total of NAs. # na.status <- lapply(x,is.na) # changed sapply to lapply k <- length(x) x.new <- list() na.total <- 0 for( i in 1:k ){ x.new[[i]] <- x[[i]][!na.status[[i]]] na.total <- na.total + sum(na.status[[i]]) } list(x.new=x.new,na.total=na.total) } kSamples/R/conv.R0000644000176200001440000000103713120314673013323 0ustar liggesusersconv <- function(x1,p1,x2,p2){ #---------------------------------------------------- # R routine for calling conv in conv.c # created 02.05.2012 Fritz Scholz #---------------------------------------------------- n1 <- length(x1) n2 <-length(x2) n <- n1*n2 x <- numeric(n) p <- numeric(n) out <- .C("convC", x1=as.double(x1), p1=as.double(p1), n1=as.integer(n1), x2=as.double(x2), p2=as.double(p2), n2=as.integer(n2), x=as.double(x), p=as.double(p), n=as.integer(n), PACKAGE = "kSamples") n <- out$n cbind(out$x[1:n],out$p[1:n]) } kSamples/R/pjt.R0000644000176200001440000000065512606574664013200 0ustar liggesuserspjt <- function (x, nn) { dist <- Harding(nn) if (is.nan(dist[1])) stop("can't compute due to numerical instability\n") nn <- sort(as.integer(nn)) nvec <- rev(cumsum(rev(nn))) k <- length(nn) L <- sum(nn[1:(k - 1)] * nvec[2:k]) dist <- cumsum(dist) x <- floor(x) pos <- match(x, 0:L) d <- numeric(length(x)) d[x > L] <- 1 d[!is.na(pos)] <- dist[pos[!is.na(pos)]] d } kSamples/R/io2.R0000644000176200001440000000416712660725522013065 0ustar liggesusersio2 <- function(...,data=NULL) { # ... can be a sequence of lists of numeric sample vectors # with at least 2 sample vectors per list. # Each list corresponds to a block level, and the samples # within a block correspond to different (treatment) # group levels, although the same group levels (treatments) # may be used within different blocks. # Or ... can be a list of such lists. # Or ... can be a formula that specifies a # a response (e.g., y), grouped by a treatment factor, e.g., g, # with a blocking factor, e.g., b, via formula y ~ g | b. # The variables y, g, b may be names of columns in a supplied # data frame dat via data = dat (default NULL). # When data = NULL is used the variables y, g, b should exist # in the calling environment. # This breaks down y into blocks of responses via the block # levels and within each block the responses are broken down into # different samples corresponding to the different levels of g. # Or ... can be a data.frame with three column y, g, and b, # as described above. xlist <- list(...) cl <- match.call() # gets a copy of the current call if(is(xlist[[1L]], "formula")) { clstr <- unlist (strsplit(as.character(eval(cl[[2]])),NULL)) if ("~" %in% clstr & "|" %in% clstr) { Y <- eval(as.name(strsplit( as.character(eval(cl[[2]]))," ")[[2]]), envir = data) G <- eval(as.name(strsplit( as.character(eval(cl[[2]]))," ")[[3]][1]), envir = data) B <- eval(as.name(strsplit( as.character(eval(cl[[2]]))," ")[[3]][3]), envir = data) data.sets <- listmake( Y,as.factor(G),as.factor(B)) } } else { if(length(xlist) == 1 && is.list(...[[1]])){ data.sets <- list(...)[[1]] } else { if( length(list(...)) > 1 && is.list(list(...)[[1]])){ data.sets <- list(...) } } } b <- length(data.sets) if(b < 2) stop("less than 2 blocks\n") num <- NULL for( i in 1:b ){ num <- c(num,unlist(lapply(data.sets[[i]],FUN=is.numeric))) if(length(data.sets[[i]]) < 2) stop("at least one block with less than 2 samples\n") } if( all(num) == FALSE ) stop("improper data in ... \n") data.sets } kSamples/R/contingency2xt.R0000644000176200001440000001361613117074101015335 0ustar liggesusers# Project: 2 * t Contingency Table # adapted from Angie Zhu's getTable2xtNull.R # Filename: Contingency2xt.R # Last modified: 03.23.2012 # This function computes or simulates the null distribution of # the Kruskal-Wallis statistics in a 2 x t contingency table. # Treatment | 1 2 ... t | Total # --------------------------------------- # Response | | # a | A_1 A_2 ... A_t | m # b | B_1 B_2 ... B_t | n # --------------------------------------- # Total | d_1 d_2 ... d_t | N # Arguments: # Avec: integer vector of length tnum, containing the # "a" reponses # Bvec: integer vector of length tnum, containing the # "b" reponses # method: "asymptotic", "simulated", "exact", # indicating the method of p-value calculation, # dist: FALSE (default) means that the exact or # estimated null distribution is not requested, # otherwise it is. # tab0: TRUE (default), when method = "simulated", # the null distribution is given in tabular form, # when tab0 = FALSE and method = "simulated", the # null distribution is given as a single vector # of all simulated values. # Nsim: number of simulations to run, default 10^6 # # # Output: a list of class "kSamples" with components # test.name = "2 x t Contingency Table" # t = number of columns in the table # KW.cont = 2 (or 3 vector) containing the observed KW # value, its asymptotic (and the simulated # or exact) P-value # null.dist = a 2 x M matrix giving the M unique # ordered KW values of the null # distribution in the first column, and # the corresponding probabilities # (simulated or exact) in the second # column. When method = "simulated" and # tab0 = FALSE null.dist consists of a # single vector of all simulated values. # When dist = NULL or method = "asymptotic", only # NULL is returned. # method = the method used # Nsim = number of simulations employed contingency2xt <- function(Avec, Bvec, method=c("asymptotic","simulated","exact"), dist = FALSE, tab0 = TRUE, Nsim=1e6) { method <- match.arg(method) m <- sum(Avec) n <- sum(Bvec) tnum <- length(Avec) N <- m + n dvec <- Avec + Bvec KW.obs <- (N * (N - 1) / (m * n)) * (sum(Avec^2 / dvec) - m^2/N ) pValue.asy <- 1 - pchisq(KW.obs,tnum-1) if(dist == TRUE) Nsim <- min(Nsim,1e8) # limits the size of null.dist # whether method = "exact" or = "simulated" if(method == "asymptotic"){ KW.cont <- c(KW.obs,pValue.asy) null.dist <- NULL }else{ ncomb <- choose(m + tnum - 1, tnum - 1) if (ncomb <= Nsim && method == "exact") { if(dist){ ans <- numeric(2+2*ncomb) }else{ ans <- numeric(2) } out <- .C("contingency2xtExact0", Avec = as.integer(Avec), Bvec = as.integer(Bvec), tnum = as.integer(tnum), ncomb = as.integer(ncomb), getDist = as.integer(dist), ans = as.double(ans), PACKAGE = "kSamples") KW.obs <- out$ans[1] KW.obs <- (N * (N - 1) / (m * n)) * (KW.obs - m^2 / N) KW.obs <- round(KW.obs, 6) pValue <- out$ans[2] if(dist){ out <- out$ans[-(1:2)] out <- matrix(out, nrow=ncomb, ncol=2, byrow=FALSE) out <- out[out[, 2] > 0, ] prob <- out[ , 2] KW <- (N * (N - 1) / (m * n)) * ( out[ , 1] - m^2 / N) KW <- round(KW, 6) KW.u <- sort(unique(KW)) k.u <- length(KW.u) prob.u <- sapply(1:k.u, function(j) { sum( prob[KW == KW.u[j]] ) }) null.dist <- cbind(KW.u,prob.u) dimnames(null.dist) <- list(NULL,c("KW","prob")) }else{ null.dist <- NULL } KW.cont <- c(KW.obs,pValue.asy,pValue) }else{ method <- "simulated" } if( method == "simulated" ){ if( Nsim < 100) Nsim <- 100 if(dist){ ans <- numeric(2+Nsim) }else{ ans <- numeric(2) } out <- .C("contingency2xtSim0", Avec = as.integer(Avec), Bvec = as.integer(Bvec), tnum = as.integer(tnum), Nsim = as.integer(Nsim), getDist = as.integer(dist), ans = as.double(ans)) KW.obs <- out$ans[1] KW.obs <- (N * (N - 1) / (m * n)) * (KW.obs - m^2 / N) KW.obs <- round(KW.obs, 6) pValue <- out$ans[2] if(dist){ KW <- (N * (N - 1) / (m * n)) * (round(out$ans[3:(Nsim+2)],6) - m^2 / N) if(tab0==TRUE){ tab <- table(KW)/Nsim null.dist <- cbind(as.numeric(names(tab)), as.vector(tab)) dimnames(null.dist) <- list(NULL,c("KW","prob"))}else{ null.dist <- KW } }else{ null.dist <- NULL } KW.cont <- c(KW.obs,pValue.asy,pValue) } } if(method=="asymptotic"){ names(KW.cont) <- c("observed KW", " asympt. P-value") } if(method=="exact"){ names(KW.cont) <- c("observed KW", " asympt. P-value","exact P-Value") } if(method=="simulated"){ names(KW.cont) <- c("observed KW", " asympt. P-value","sim. P-Value") } object <- list(test.name =paste("2 x t Contingency Table"), t = tnum, KW.cont = KW.cont, null.dist = null.dist, method = method, Nsim = Nsim) class(object) <- "kSamples" object } kSamples/R/qn.test.R0000644000176200001440000002261613300402100013737 0ustar liggesusersqn.test <- function (..., data = NULL, test = c("KW","vdW","NS"), method=c("asymptotic","simulated","exact"), dist=FALSE,Nsim=10000) { ############################################################################# # This function "qn.test" tests whether k samples (k>1) come from a common # continuous distribution, using the QN rank test. See Lehmann (2006), # Nonparametrics, Statistical Methods Based on Ranks, Appendix Corollary 10. # Ties are handled by using average rank scores. # While the asymptotic P-value is always returned, there is the option # to get an estimate based on Nsim simulations or an exact value based # on the full enumeration distribution, provided method = "exact" is chosen # and the number of full enumerations is <= the Nsim specified. # If the latter is not the case, simulation is used with the indicated Nsim. # These simulated or exact P-values are appropriate under the continuity # assumption or, when ties are present, they are still appropriate # conditionally on the tied rank pattern, provided randomization took # place in allocating subjects to the respective samples, i.e., also # under random sampling from a common discrete parent population. # # # # Inputs: # ...: can either be a sequence of k (>1) sample vectors, # # or a list of k (>1) sample vectors, # # or y, g, where y contains the concatenated # samples and g is a factor which by its levels # identifies the samples in y, # # or a formula y ~ g with y and g as in previous case. # # test: specifies the ranks scores to be used, averaging the scores # of tied observations. # test = "KW" uses scores 1:N ( ==> Kruskal-Wallis test) # test = "vdW" uses the van der Waerden scores qnorm(1:N/(N+1)) # test = "NS" uses normal scores, expected standard normal order # statistics, uses function normOrder of package SuppDists. # Other scores could easily be added to this function. # # method: takes values "asymptotic", "simulated", or "exact". # The value "asymptotic" causes calculation of P-values # using the asymptotic chi-square approximation, always done. # # The value "simulated" causes estimation of P-values # by randomly splitting the the pooled data into # samples of sizes ns[1], ..., ns[k], where # ns[i] is the size of the i-th sample vector, # and n = ns[1] + ... + ns[k] is the pooled sample size. # For each such random split the QN statistic is # computed. This is repeated Nsim times and the proportions # of simulated values >= the actually observed QN value # is reported as P-value estimate. # # The value "exact" enumerates all n!/(ns[1]! * ... * ns[k]) # splits of the pooled sample and computes the QN statistic. # The proportion of all enumerated QN statistics # that are >= the actually observed QN value # is reported as exact (conditional) P-value. # # dist: = FALSE (default) or TRUE, TRUE causes the simulated # or fully enumerated vector of the QN statstic to be returned # as null.dist. # # Nsim: number of simulations to perform, # for method = "exact" to take hold, it needs to be at least # equal the number of all possible splits of the pooled # data into samples of sizes ns[1], ..., ns[k], where # ns[i] is the size of the i-th sample vector. # # When there are NA's among the sample values they are removed, # with a warning message indicating the number of NA's. # It is up to the user to judge whether such removals make sense. # # An example: # z1 <- c(0.824, 0.216, 0.538, 0.685) # z2 <- c(0.448, 0.348, 0.443, 0.722) # z3 <- c(0.403, 0.268, 0.440, 0.087) # qn.test(z1,z2,z3,method="exact",dist=T,Nsim=100000) # or # qn.test(list(z1,z2,z3),test="KW",method="exact",dist=T,Nsim=100000) # which produces the output below. ############################################################################# # # Kruskal-Wallis k-sample test. # # Number of samples: 3 # Sample sizes: 4 4 4 # Total number of values: 12 # Number of unique values: 12 # # Null Hypothesis: All samples come from a common population. # # QN asympt. P-value exact P-Value # 3.5769231 0.1672172 0.1729870 # # # Warning: At least one sample size is less than 5. # asymptotic p-values may not be very accurate. # ############################################################################# # In order to get the output list, call # qn.out <- qn.test(list(z1,z2,z3),test="KW",method="exact",dist=T,Nsim=100000) # then qn.out is of class ksamples and has components # > names(qn.out) # [1] "test.name" "k" "ns" "N" "n.ties" "qn" # [7] "warning" "null.dist" "method" "Nsim" # # where # test.name = "Kruskal-Wallis", "van der Waerden", or "normal scores" # k = number of samples being compared # ns = vector of the k sample sizes ns[1],...,ns[k] # N = ns[1] + ... + ns[k] total sample size # n.ties = number of ties in the combined set of all n observations # qn = 2 (or 3) vector containing the QN statistics, its asymptotic P-value, # (and its exact or simulated P-value). # warning = logical indicator, warning = TRUE indicates that at least # one of the sample sizes is < 5. # null.dist is a vector of simulated values of the QN statistic # or the full enumeration of such values. # This vector is given when dist = TRUE is specified, # otherwise null.dist = NULL is returned. # method = one of the following values: "asymptotic", "simulated", "exact" # as it was ultimately used. # Nsim = number of simulations used, when applicable. # # The class ksamples causes qn.out to be printed in a special output # format when invoked simply as: > qn.out # An example was shown above. # # Fritz Scholz, August 2012 # ################################################################################# ave.score <- function(z, scores){ # This function takes a data vector z and a vector scores # of same length and returns a vector av.scores of scores # using average scores for each group of tied # observations in z. av.scores and scores have same length. N <- length(z) rz <- rank(z) r.rz <- rank(z,ties.method="random") rz.u <- unique(rz) av.scores <- rep(0,N) for(rz.ui in rz.u){ av.scores[rz==rz.ui] <- mean(scores[r.rz[rz ==rz.ui]]) } av.scores } samples <- io(...,data = data) test <- match.arg(test) method <- match.arg(method) out <- na.remove(samples) na.t <- out$na.total if( na.t > 1) print(paste("\n",na.t," NAs were removed!\n\n")) if( na.t == 1) print(paste("\n",na.t," NA was removed!\n\n")) samples <- out$x.new k <- length(samples) if (k < 2) stop("Must have at least two samples.") ns <- sapply(samples, length) n <- sum(ns) if (any(ns == 0)) stop("One or more samples have no observations.") x <- unlist(samples) if(test == "KW"){ scores.vec <- 1:n } if (test == "NS") { if (!requireNamespace("SuppDists", quietly = TRUE)){ # if (!exists("normOrder")) library(SuppDists) stop("SuppDists (>= 1.1-9.4) needed for this function to work. Please install it.", call. = FALSE) } scores.vec <- normOrder(n) } if(test == "vdW") { scores.vec <- qnorm((1:n)/(n + 1)) } QNobs <- 0 pval <- 0 rx <- ave.score(x,scores.vec) svar <- var(rx) smean <- mean(rx) L <- length(unique(rx)) if(dist == TRUE) Nsim <- min(Nsim,1e8) ncomb <- 1 if( method == "exact"){ np <- n for(i in 1:(k-1)){ ncomb <- ncomb * choose(np,ns[i]) np <- np-ns[i] } # it is possible that ncomb overflows to Inf if(!(ncomb < Inf)) stop('ncomb = Inf, method = "exact" not possible\n') } if( method == "exact" & Nsim < ncomb) { method <- "simulated" } if( method == "exact" & dist == TRUE ) nrow <- ncomb if( method == "simulated" & dist == TRUE ) nrow <- Nsim if( method == "simulated" ) ncomb <- 1 # don't need ncomb anymore if(method == "asymptotic"){ Nsim <- 1 dist <- FALSE } useExact <- FALSE if(method == "exact") useExact <- TRUE if(dist==T){ QNvec <- numeric(nrow) }else{ QNvec <- 0 } out <- .C("QNtest", pval=as.double(pval), Nsim=as.integer(Nsim), k=as.integer(k), rx=as.double(rx), ns=as.integer(ns), useExact=as.integer(useExact), getQNdist=as.integer(dist), ncomb=as.double(ncomb),QNobs=as.double(QNobs), QNvec = as.double(QNvec), PACKAGE = "kSamples") QNobs <- (out$QNobs - n*smean^2)/svar pval <- out$pval if(dist){ QNvec <- round((out$QNvec- n*smean^2)/svar,8) } pval.asympt <- 1-pchisq(QNobs,k-1) if(method=="asymptotic"){ qn <- c(QNobs,pval.asympt) }else{ qn <- c(QNobs,pval.asympt,pval) } if(method=="asymptotic"){ names(qn) <- c("test statistic"," asympt. P-value") } if(method=="exact"){ names(qn) <- c("test statistic"," asympt. P-value","exact P-Value") } if(method=="simulated"){ names(qn) <- c("test statistic"," asympt. P-value","sim. P-Value") } warning <- FALSE if(min(ns) < 5) warning <- TRUE if(dist == FALSE | method == "asymptotic") QNvec <- NULL if(test == "vdW") test.name <- "van der Waerden scores" if(test == "NS") test.name <- "normal scores" if(test == "KW") test.name <- "Kruskal-Wallis" object <- list(test.name = test.name, k = k, ns = ns, N = n, n.ties = n - L, qn = qn, warning = warning, null.dist = QNvec, method=method, Nsim=Nsim) class(object) <- "kSamples" object } kSamples/R/Steelnormal0.R0000644000176200001440000000205612614525112014724 0ustar liggesusersSteelnormal0 <- function(sig0,sig,tau,U,ni, alternative=c("greater","two-sided")){ # this function computes the normal approximation of the p-value for the Steel test, # based on the sizes ni = c(n1,...,nk) of the k treatment samples # based on the observed Steel statistic U # sig0, sig and tau are parameters required for the power evaluation. alternative <- match.arg(alternative) k <- length(ni) if(alternative=="greater"){ funz <- function(z,k,sig0,sig,tau,S,ni){ fac <- 1 for(i in 1:k){ fac <- fac * pnorm((S*tau[i]-ni[i]*sig0*z)/sig[i]) } dnorm(z)*fac } N <- length(U) pval <- numeric(N) for(i in 1:N){ pval[i] <- 1-integrate(funz,-Inf,Inf,k,sig0,sig,tau,U[i],ni)$value } } if(alternative=="two-sided"){ funz <- function(z,k,sig0,sig,tau,S,ni){ fac <- 1 for(i in 1:k){ fac <- fac * (pnorm((S*tau[i]-ni[i]*sig0*z)/sig[i])- pnorm((-S*tau[i]-ni[i]*sig0*z)/sig[i])) } dnorm(z)*fac } N <- length(U) pval <- numeric(N) for(i in 1:N){ pval[i] <- 1-integrate(funz,-Inf,Inf,k,sig0,sig,tau,U[i],ni)$value } } pval } kSamples/R/ad.pval.R0000644000176200001440000002035511731456440013714 0ustar liggesusersad.pval<- function (tx,m,version=1) { # This function "adk.pval" evaluates the p-value of the observed value # tx of the T_m statistic in "K-Sample Anderson-Darling Tests" by F.W. Scholz # and M.A. Stephens (1987), Journal of the American Statistical Association, # Vol 82, No. 399, pp 918-924. Thus this p-value is P(T_m >= tx). # # Input: tx = observed value of T_m, tx > 0. # m = the index of T_m, m >= 1. # version = 1 (default) uses the first version of the AD statistic, # otherwise the second version is used. # # Output: a list with components # p0 = p-value of tx, i.e., p0 = P(T_m >= tx) # extrap = a logical indicator # extrap = TRUE means that linear extrapolation took place # extrap = FALSE means that quadratic interpolation was used. # # Computational Details: # # This function uses the upper T_m quantiles as obtained via simulation of # the Anderson-Darling test statistics (Nsim = 2*10^6) with sample sizes n=500 # for each sample, and after standardization, in order to emulate the Table 1 # values given in the above reference. However, here we estimate p-quantiles # for p = .00001,.00005,.0001,.0005,.001,.005,.01,.025,.05,.075, # .1,.2,.3,.4,.5,.6,.7,.8,.9,.925,.95,.975,.99,.9925,.995,.9975,.999, # .99925,.9995,.99975,.9999,.999925,.99995,.999975,.99999 # First the appropriate p-quantiles are determined from those simulated # for ms = 1,2,3,4,6,8,10, Inf, interpolating to the given value of m # by fitting a smoothing spline in 1/sqrt(ms) to the simulated quantiles. # Visual inspection of plots of fitted splines versus 1/sqrt(ms) shows # good agreement for the used smoothing parameter spar = .4. # # Next a smoothing spline is used to fit the log((1-p)/p) to # these interpolated quantiles and the value fitted to tx is # obtained (extrapolating linearly) beyond p = .00001 and .99999. # # The p-values from Table 1 were reproduced mainly with relative error # bounded by 1%, in 6 cases with respective relative errors of 1.25%, 1.2%, # 1.3%, 1.5%, 2.7% and 2.8% as can be seen from the relative error table below. # The columns correspond to Table 1 p-values of .25, .10, .05, .025, .01 and # the rows correspond to m = 1, 2 3, 4, 6, 8, 10, Inf. # # [,1] [,2] [,3] [,4] [,5] # [1,] -0.0012 0.0011 0.0038 0.0123 0.0277 # [2,] -0.0105 -0.0109 -0.0022 0.0129 0.0271 # [3,] -0.0077 -0.0052 0.0042 0.0106 0.0152 # [4,] -0.0050 -0.0031 0.0035 0.0067 0.0057 # [5,] -0.0041 -0.0033 0.0055 0.0014 -0.0050 # [6,] -0.0048 -0.0026 0.0002 -0.0021 -0.0082 # [7,] -0.0070 -0.0042 -0.0023 -0.0036 -0.0119 # [8,] 0.0010 0.0005 0.0013 0.0017 0.0023 # # # # # Fritz Scholz, March 2012 #========================================================================================= if(version==1){ table1.adk <- structure(c(1, 2, 3, 4, 6, 8, 10, Inf, -1.1954, -1.5806, -1.8172, -2.0032, -2.2526, -2.4204, -2.5283, -4.2649, -1.1786, -1.5394, -1.7728, -1.9426, -2.1685, -2.3288, -2.4374, -3.8906, -1.166, -1.5193, -1.7462, -1.9067, -2.126, -2.2818, -2.3926, -3.719, -1.1407, -1.4659, -1.671, -1.8105, -2.0048, -2.1356, -2.2348, -3.2905, -1.1253, -1.4371, -1.6314, -1.7619, -1.9396, -2.0637, -2.1521, -3.0902, -1.0777, -1.3503, -1.5102, -1.6177, -1.761, -1.8537, -1.9178, -2.5758, -1.0489, -1.2984, -1.4415, -1.5355, -1.6625, -1.738, -1.7936, -2.3263, -0.9978, -1.2098, -1.3251, -1.4007, -1.4977, -1.5555, -1.5941, -1.96, -0.9417, -1.1187, -1.209, -1.2671, -1.3382, -1.379, -1.405, -1.6449, -0.8981, -1.0491, -1.1235, -1.1692, -1.2249, -1.2552, -1.2755, -1.4395, -0.8598, -0.9904, -1.0513, -1.0879, -1.1317, -1.155, -1.1694, -1.2816, -0.7258, -0.7938, -0.8188, -0.8312, -0.8435, -0.8471, -0.8496, -0.8416, -0.5966, -0.617, -0.6177, -0.6139, -0.6073, -0.5987, -0.5941, -0.5244, -0.4572, -0.4383, -0.419, -0.4033, -0.3834, -0.3676, -0.3587, -0.2533, -0.2966, -0.2428, -0.2078, -0.1844, -0.1548, -0.1346, -0.1224, 0, -0.1009, -0.0169, 0.0304, 0.0596, 0.0933, 0.1156, 0.1294, 0.2533, 0.1571, 0.2635, 0.3169, 0.348, 0.3823, 0.4038, 0.4166, 0.5244, 0.5357, 0.6496, 0.6992, 0.7246, 0.7528, 0.7683, 0.7771, 0.8416, 1.2255, 1.2989, 1.3202, 1.3254, 1.3305, 1.3286, 1.3257, 1.2816, 1.5262, 1.5677, 1.5709, 1.5663, 1.5561, 1.5449, 1.5356, 1.4395, 1.9633, 1.943, 1.919, 1.8975, 1.8641, 1.8389, 1.8212, 1.6449, 2.7314, 2.5899, 2.5, 2.4451, 2.3664, 2.3155, 2.2823, 1.96, 3.7825, 3.4425, 3.2582, 3.1423, 3.0036, 2.9101, 2.8579, 2.3263, 4.1241, 3.716, 3.4984, 3.3651, 3.2003, 3.0928, 3.0311, 2.4324, 4.6044, 4.0847, 3.8348, 3.6714, 3.4721, 3.3453, 3.2777, 2.5758, 5.409, 4.7223, 4.4022, 4.1791, 3.9357, 3.7809, 3.6963, 2.807, 6.4954, 5.5823, 5.1456, 4.8657, 4.5506, 4.3275, 4.2228, 3.0902, 6.8279, 5.8282, 5.3658, 5.0749, 4.7318, 4.4923, 4.3642, 3.1747, 7.2755, 6.197, 5.6715, 5.3642, 4.9991, 4.7135, 4.5945, 3.2905, 8.1885, 6.8537, 6.2077, 5.8499, 5.4246, 5.1137, 4.9555, 3.4808, 9.3061, 7.6592, 6.85, 6.4806, 5.9919, 5.6122, 5.5136, 3.719, 9.6132, 7.9234, 7.1025, 6.6731, 6.1549, 5.8217, 5.7345, 3.7911, 10.0989, 8.2395, 7.4326, 6.9567, 6.3908, 6.011, 5.9566, 3.8906, 10.8825, 8.8994, 7.8934, 7.4501, 6.9009, 6.4538, 6.2705, 4.0556, 11.8537, 9.5482, 8.5568, 8.0283, 7.4418, 6.9524, 6.6195, 4.2649), .Dim = c(8L, 36L), .Dimnames = list( NULL, NULL))}else{ table1.adk <- structure(c(1, 2, 3, 4, 6, 8, 10, Inf, -1.1976, -1.5824, -1.8195, -2.005, -2.2546, -2.422, -2.5292, -4.2649, -1.1806, -1.5416, -1.7747, -1.9434, -2.1687, -2.3301, -2.438, -3.8906, -1.1681, -1.5212, -1.7479, -1.9078, -2.1268, -2.2827, -2.3937, -3.719, -1.1427, -1.4677, -1.6724, -1.8115, -2.0059, -2.1363, -2.2359, -3.2905, -1.1272, -1.4387, -1.6325, -1.7629, -1.9405, -2.0649, -2.1527, -3.0902, -1.0794, -1.3518, -1.5112, -1.6187, -1.7617, -1.8545, -1.9182, -2.5758, -1.0504, -1.2997, -1.4425, -1.5362, -1.6632, -1.7387, -1.7943, -2.3263, -0.999, -1.2109, -1.3259, -1.4014, -1.4981, -1.5561, -1.5945, -1.96, -0.9428, -1.1196, -1.2098, -1.2677, -1.3386, -1.3795, -1.4054, -1.6449, -0.8991, -1.05, -1.1241, -1.1697, -1.2253, -1.2557, -1.2758, -1.4395, -0.8607, -0.9911, -1.0518, -1.0883, -1.1321, -1.1555, -1.1698, -1.2816, -0.7264, -0.7944, -0.8192, -0.8315, -0.8437, -0.8473, -0.8498, -0.8416, -0.597, -0.6173, -0.6179, -0.6141, -0.6074, -0.5989, -0.5942, -0.5244, -0.4574, -0.4385, -0.4191, -0.4034, -0.3835, -0.3677, -0.3588, -0.2533, -0.2966, -0.2427, -0.2078, -0.1844, -0.1548, -0.1347, -0.1224, 0, -0.1007, -0.0168, 0.0305, 0.0596, 0.0934, 0.1157, 0.1295, 0.2533, 0.1573, 0.2638, 0.3171, 0.3482, 0.3825, 0.404, 0.4168, 0.5244, 0.5363, 0.6501, 0.6996, 0.7249, 0.753, 0.7685, 0.7773, 0.8416, 1.2263, 1.2997, 1.3209, 1.3258, 1.3309, 1.329, 1.326, 1.2816, 1.5274, 1.5686, 1.5716, 1.5667, 1.5565, 1.5453, 1.536, 1.4395, 1.9644, 1.944, 1.92, 1.8983, 1.8647, 1.8396, 1.8216, 1.6449, 2.7334, 2.5915, 2.5013, 2.4457, 2.3671, 2.3162, 2.2827, 1.96, 3.7851, 3.4443, 3.2595, 3.1436, 3.0046, 2.9111, 2.8585, 2.3263, 4.1255, 3.7175, 3.4997, 3.3661, 3.2011, 3.0939, 3.0318, 2.4324, 4.6067, 4.0869, 3.8363, 3.6724, 3.4729, 3.3463, 3.278, 2.5758, 5.4121, 4.7248, 4.4032, 4.1812, 3.9369, 3.7819, 3.6977, 2.807, 6.5, 5.5856, 5.1469, 4.8683, 4.552, 4.3284, 4.2229, 3.0902, 6.8324, 5.8302, 5.3678, 5.0769, 4.7332, 4.4933, 4.3654, 3.1747, 7.278, 6.1999, 5.674, 5.3661, 5.0001, 4.7147, 4.5956, 3.2905, 8.1926, 6.8586, 6.2082, 5.8524, 5.4265, 5.115, 4.9571, 3.4808, 9.3096, 7.6673, 6.8522, 6.4825, 5.9934, 5.6135, 5.5147, 3.719, 9.6207, 7.929, 7.1051, 6.6763, 6.1548, 5.8229, 5.7358, 3.7911, 10.1076, 8.2437, 7.4349, 6.9593, 6.3923, 6.0136, 5.9573, 3.8906, 10.8874, 8.9034, 7.8991, 7.4543, 6.9017, 6.4568, 6.2723, 4.0556, 11.8602, 9.5499, 8.5596, 8.0315, 7.4425, 6.9537, 6.6213, 4.2649), .Dim = c(8L, 36L), .Dimnames = list( NULL, NULL)) } extrap <- FALSE mt <- table1.adk[, 1] sqm1 <- 1/sqrt(mt) sqm2 <- sqm1^2 tm <- NULL p <- 1 - c(.00001,.00005,.0001,.0005,.001,.005,.01,.025,.05,.075, .1,.2,.3,.4,.5,.6,.7,.8,.9,.925,.95,.975,.99,.9925,.995,.9975,.999, .99925,.9995,.99975,.9999,.999925,.99995,.999975,.99999) np <- length(p) lp <- log(p/(1 - p)) sqm0 <- 1/sqrt(m) for (i in 1:np) { out <- smooth.spline(sqm1,table1.adk[,i+1],spar=.4) y <- predict(out,sqm0)$y tm <- c(tm, y) } out <- smooth.spline(tm,lp,spar=.25) lp0 <- predict(out,tx)$y p0 <- exp(lp0)/(1 + exp(lp0)) names(p0) <- NULL p0 } kSamples/R/SteelConfInt.R0000644000176200001440000003475513117074776014745 0ustar liggesusersSteelConfInt <-function(..., data = NULL, conf.level=.95, alternative=c("less","greater","two.sided"), method = c("asymptotic","exact","simulated"), Nsim = 10000){ pmaxWilcox <- function(x,ns){ #---------------------------------------------------------------------------------- # computes the CDF of the maximum of standarized Wilcoxon statistics for each # element in x, using the asymptotic approximation. # Standardized Wilcoxon statistics are considered for sample sizes ns[1] and # each of ns[i], i=2,..., length(ns) #---------------------------------------------------------------------------------- if(length(ns) < 2) return("ns has length < 2\n") n1 <- ns[1] ni <- ns[-1] s <- length(ni) f1 <- sqrt(1+ni/(n1+1)) f2 <- sqrt(ni/(n1+1)) fx <- function(z,U,f1,f2){ fxx <- 1 for(i in 1:s){ fxx <- fxx * pnorm(U*f1[i]-z*f2[i]) } fxx*dnorm(z) } N <- length(x) px <- numeric(N) for(j in 1:N){ px[j] <- integrate(f=fx,-Inf,Inf,U=x[j],f1=f1,f2=f2)$value } px } # end of pmaxWilcox qmaxWilcox <- function(p,ns){ #---------------------------------------------------------- # computes the p-quantile of the CDF provided by pmaxWilcox # not vectorized over p #---------------------------------------------------------- if(length(ns) < 2) return("ns has length < 2\n") fx <- function(z,ns,p){ pmaxWilcox(z,ns)-p } a <- qnorm(p)-2 b <- qnorm(p)+2 while(fx(a,ns,p) > 0) a <- a-1 while(fx(b,ns,p) < 0) b <- b+1 uniroot(fx,c(a,b),ns,p)$root } # end of qmaxWilcox ProbWilcox <- function(U,n1,ni){ #----------------------------------------------------------------------- # computes the asymptotic joint probability P(W.i <= U[i], i=1,\ldots,s) # where W.i is the Mann-Whitney statistic for sample sizes n1 and ni[i], # for i=1,...,s, and the same sample is involved for the first sample # of size n1 in all Mann-Whitney statistics. #----------------------------------------------------------------------- s <- length(ni) if(s < 1) return("ni has length < 1\n") if(length(U) != s) return(paste("U does not have length",s,"\n")) f1 <- sqrt(1+ni/(n1+1)) f2 <- sqrt(ni/(n1+1)) fx <- function(z,U,f1,f2){ fxx <- 1 for(i in 1:s){ fxx <- fxx * pnorm(U[i]*f1[i]-z*f2[i]) } fxx*dnorm(z) } px <- integrate(f=fx,-Inf,Inf,U=U,f1=f1,f2=f2)$value px } # end of ProbWilcox qdiscrete <- function (x, gam) { #------------------------------------------------------------ # x is assumed to be a sorted numeric vector and 0 < gam < 1 # This function finds cm and cp such that # cm is the largest value such that the proportion of # x values <= cm is <= gam. cm is set to -Inf, when the # highest achievable proportion <= gam is zero. # cp is the smallest value such that the proportion of # x values <= cp is >= gam #------------------------------------------------------------ if (gam <= 0 | gam >= 1) stop(paste("conf.level not in (0,1)","\n")) N <- length(x) gamN <- gam * N im <- floor(gamN) ip <- ceiling(gamN) cm <- -Inf cp <- x[ip] if(im > 0){ if(x[im] < x[ip]){ cm <- x[im]}else{ if(sum(x <= x[im]) <= gamN){cm <- x[im]}else{ if(x[im] > x[1]) cm <- max(x[x 1) cat(paste("\n",na.t," NAs were removed!\n\n")) if( na.t == 1) cat(paste("\n",na.t," NA was removed!\n\n")) samples <- out$x.new k <- length(samples) if (k < 2) stop("Must have at least two samples.") ns <- sapply(samples, length) nsamp <- sum(ns) if (any(ns == 0)) stop("One or more samples have no observations.") x <- numeric(nsamp) istart <- 0 for (i in 1:k){ x[istart+(1:ns[i])] <- samples[[i]] istart <- istart + ns[i] } if(method != "asymptotic"){ if(!is.loaded("SteelConf")) dyn.load("SteelConf.so") } if(alternative=="two.sided"){gam <- 1-(1-conf.level)/2 }else{gam <- conf.level} if(length(x) != sum(ns)) return("sample sizes don't agree with length of data vector\n") k <- length(ns) s <- k-1 # number of treatments rx <- rank(x) n1 <- ns[1] ni <- ns[-1] mu <- ni*n1/2 # vector of means for Mann-Whitney statistics tau <- sqrt(n1*ni*(n1+ni+1)/12) # vector of stand. deviations of Mann-Whitney statistics # in the case of continuous sampled populations (no ties). n.ties <- nsamp - length(unique(x)) # computing total number of combination splits ncomb <- choose(nsamp,ns[1]) np <- nsamp-ns[1] if(k>2){ for(i in 2:(k-1)){ ncomb <- ncomb * choose(np,ns[i]) np <- np-ns[i] } } if(method == "exact"){ useExact <- TRUE}else{ useExact <- FALSE } # calculation of coverage probabilities based on asymptotics cgam <- qmaxWilcox(gam,ns) if(alternative != "greater"){ ellpU <- ceiling(tau*cgam+mu+1) ellpUx <- ellpU ellpUx[ellpUx > n1*ni] <- Inf ellmU <- floor(tau*cgam+mu+1) ellmUx <- ellmU ellmUx[ellmUx > n1*ni] <- Inf ellcU <- round(tau*cgam+mu+1) ellcUx <- ellcU ellcUx[ellcUx > n1*ni] <- Inf probpU <- ProbWilcox((ellpUx-1-mu)/tau,n1,ni) probmU <- ProbWilcox((ellmUx-1-mu)/tau,n1,ni) probcU <- ProbWilcox((ellcUx-1-mu)/tau,n1,ni) } if(alternative != "less"){ ellmL <- ceiling(n1*ni-tau*cgam-mu) ellmLx <- ellmL ellmLx[ellmLx < 1] <- -Inf ellpL <- floor(n1*ni-tau*cgam-mu) ellpLx <- ellpL ellpLx[ellpLx < 1] <- -Inf ellcL <- round(n1*ni-tau*cgam-mu) ellcLx <- ellcL ellcLx[ellcLx < 1] <- -Inf probpL <- ProbWilcox((n1*ni-ellpLx-mu)/tau,n1,ni) probmL <- ProbWilcox((n1*ni-ellmLx-mu)/tau,n1,ni) probcL <- ProbWilcox((n1*ni-ellcLx-mu)/tau,n1,ni) } # initializing asymptotic bounds Lboundp <- rep(-Inf,s) Uboundp <- rep(Inf,s) Lboundc <- rep(-Inf,s) Uboundc <- rep(Inf,s) # initializing bounds based on exact or simulated null distribution LboundXp <- rep(-Inf,s) UboundXp <- rep(Inf,s) LboundXc <- rep(-Inf,s) UboundXc <- rep(Inf,s) if(alternative != "greater"){ # determine ellUp and probUp for which probUp comes closest # to gam but >= gam among the three possibilities ellpU, # ellcU, and ellmU, with corresponding probpU, probcU, probmU ellUp <- ellpU probUp <- probpU if(probcU >= gam & probcU < probUp){ ellUp <- ellcU probUp <- probcU } if(probmU >= gam & probmU < probUp){ ellUp <- ellmU probUp <- probmU } ellUc <- ellcU probUc <- probcU # determine ellUc and probUc for which probUc comes closest # to gam among the three possibilities ellpU, # ellcU, and ellmU, with corresponding probpU, probcU, probmU if(abs(probUc-gam)>abs(probpU-gam)){ ellUc <- ellpU probUc <- probpU } if(abs(probUc-gam)>abs(probmU-gam)){ ellUc <- ellmU probUc <- probmU } } if(alternative != "less"){ # determine ellLp and probLp for which probLp comes closest # to gam but >= gam among the three possibilities ellpL, # ellcL, and ellmL, with corresponding probpL, probcL, probmL ellLp <- ellpL probLp <- probpL if(probcL >= gam & probcL < probLp){ ellLp <- ellcL probLp <- probcL } if(probmL >= gam & probmL < probLp){ ellLp <- ellmL probLp <- probmL } # determine ellLc and probLc for which probLc comes closest # to gam among the three possibilities ellpL, # ellcL, and ellmL, with corresponding probpL, probcL, probmL ellLc <- ellcL probLc <- probcL if(abs(probLc-gam)>abs(probpL-gam)){ ellLc <- ellpL probLc <- probpL } if(abs(probLc-gam)>abs(probmL-gam)){ ellLc <- ellmL probLc <- probmL } } if(alternative == "less"){ achieved.confidence.p <- probUp achieved.confidence.c <- probUc } if(alternative == "greater"){ achieved.confidence.p <- probLp achieved.confidence.c <- probLc } if(alternative == "two.sided"){ achieved.confidence.p <- probUp+probLp-1 achieved.confidence.c <- probUc+probLc-1 } # end of asymptotic treatment # calculation based on exact enumeration or simulation if(method != "asymptotic"){ if((Nsim < ncomb & method == "exact") | method == "simulated"){ method <- "simulated" useExact <- FALSE nrow <- Nsim * s }else{ nrow <- ncomb * s } MannWhitneyStats <- numeric(nrow) out <- .C("SteelConf", Nsim=as.integer(Nsim), k=as.integer(k), rx=as.double(rx), ns=as.integer(ns), useExact=as.integer(useExact), MannWhitneyStats = as.double(MannWhitneyStats), PACKAGE = "kSamples") MannWhitneyStats <- matrix(out$MannWhitneyStats,ncol=s,byrow=T) maxstandMannWhitneyStats <- apply(matrix( (out$MannWhitneyStats-mu)/tau,ncol=s,byrow=T),1,max) if(method=="simulated"){ # taking advantage (only in the simulated case) of the equality of the full # distributions of # -minstandMannWhitneyStats and maxstandMannWhitneyStats. minstandMannWhitneyStats <- apply(matrix( (out$MannWhitneyStats-mu)/tau,ncol=s,byrow=T),1,min) maxstandMannWhitneyStats <- sort(c(maxstandMannWhitneyStats, -minstandMannWhitneyStats)) }else{ maxstandMannWhitneyStats <- sort(maxstandMannWhitneyStats) } cmp <- qdiscrete(maxstandMannWhitneyStats,gam) cm <- cmp$cm cp <- cmp$cp ellhat <- ceiling(tau*cp+mu) elltilde <- floor(tau*cm+mu) ellest <- round(tau*(cm+cp)/2+mu) selhat <- MannWhitneyStats[,1] <= ellhat[1] seltilde <- MannWhitneyStats[,1] <= elltilde[1] selest <- MannWhitneyStats[,1] <= ellest[1] if(s >1){ for(i in 2:s){ selhat <- selhat & (MannWhitneyStats[,i] <= ellhat[i]) seltilde <- seltilde & (MannWhitneyStats[,i] <= elltilde[i]) selest <- selest & (MannWhitneyStats[,i] <= ellest[i]) } } p.ellhat <- mean(selhat) p.elltilde <- mean(seltilde) p.ellest <- mean(selest) if(alternative == "less"){ ellhatU <- ellhat+1 elltildeU <- elltilde+1 ellestU <- ellest+1 } if(alternative == "greater"){ ellhatL <- n1*ni-ellhat elltildeL <- n1*ni-elltilde ellestL <- n1*ni-ellest } if(alternative == "two.sided"){ ellhatU <- ellhat+1 elltildeU <- elltilde+1 ellestU <- ellest+1 ellhatL <- n1*ni-ellhat elltildeL <- n1*ni-elltilde ellestL <- n1*ni-ellest p.ellhat <- 1-2*(1-p.ellhat) p.elltilde <- 1-2*(1-p.elltilde) p.ellest <- 1-2*(1-p.ellest) # reset gam to original confidence level for later # refinement comparisons. gam <- conf.level } if(alternative != "greater"){ # determine ellUXp and probXp for which probXp comes closest # to gam but >= gam among the three possibilities ellhatU, # elltildeU, and ellestU, with corresponding p.ellhat, p.elltilde, p.ellest ellUXp <- ellhatU probXp <- p.ellhat if(p.elltilde >= gam & p.elltilde < probXp){ ellUXp <- elltildeU probXp <- p.elltilde } if(p.ellest >= gam & p.ellest < probXp){ ellUXp <- ellestU probXp <- p.ellest } ellUXc <- ellhatU probXc <- p.ellhat # determine ellUXc and probXc for which probXc comes closest # to gam among the three possibilities ellhatU, # elltildeU, and ellestU, with corresponding p.ellhat, p.elltilde, p.ellest if(abs(probXc-gam)>abs(p.elltilde-gam)){ ellUXc <- elltildeU probXc <- p.elltilde } if(abs(probXc-gam)>abs(p.ellest-gam)){ ellUXc <- ellestU probXc <- p.ellest } } if(alternative != "less"){ # determine ellLXp and probXp for which probXp comes closest # to gam but >= gam among the three possibilities ellhatL, # elltildeL, and ellestL, with corresponding p.ellhat, p.elltilde, p.ellest ellLXp <- ellhatL probXp <- p.ellhat if(p.elltilde >= gam & p.elltilde < probXp){ ellLXp <- elltildeL probXp <- p.elltilde } if(p.ellest >= gam & p.ellest < probXp){ ellLXp <- ellestL probXp <- p.ellest } ellLXc <- ellhatL probXc <- p.ellhat # determine ellLXc and probXc for which probXc comes closest # to gam among the three possibilities ellhatL, # elltildeL, and ellestL, with corresponding p.ellhat, p.elltilde, p.ellest if(abs(probXc-gam)>abs(p.elltilde-gam)){ ellLXc <- elltildeL probXc <- p.elltilde } if(abs(probXc-gam)>abs(p.ellest-gam)){ ellLXc <- ellestL probXc <- p.ellest } } } if(alternative != "greater"){ istart <- n1 for( i in 1:s ){ D <- sort(outer(x[(istart+1):(istart+ni[i])],x[1:n1],"-")) if(ellUp[i] > n1*ni[i]){ Uboundp[i] <- Inf}else{ Uboundp[i] <- D[ellUp[i]] } if(ellUc[i] > n1*ni[i]){ Uboundc[i] <- Inf}else{ Uboundc[i] <- D[ellUc[i]] } if(method != "asymptotic"){ if(ellUXp[i] > n1*ni[i]){ UboundXp[i] <- Inf}else{ UboundXp[i] <- D[ellUXp[i]] } if(ellUXc[i] > n1*ni[i]){ UboundXc[i] <- Inf}else{ UboundXc[i] <- D[ellUXc[i]] } } istart <- istart + ni[i] } } if(alternative != "less"){ istart <- n1 for( i in 1:s ){ D <- sort(outer(x[(istart+1):(istart+ni[i])],x[1:n1],"-")) if(ellLp[i] < 1){ Lboundp[i] <- -Inf}else{ Lboundp[i] <- D[ellLp[i]] } if(ellLc[i] < 1){ Lboundc[i] <- -Inf}else{ Lboundc[i] <- D[ellLc[i]] } if(method != "asymptotic"){ if(ellLXp[i] < 1){ LboundXp[i] <- -Inf}else{ LboundXp[i] <- D[ellLXp[i]] } if(ellLXc[i] < 1){ LboundXc[i] <- -Inf}else{ LboundXc[i] <- D[ellLXc[i]] } } istart <- istart + ni[i] } } Delta <- paste("Delta_",1:s,sep="") LBound <- Lboundp UBound <- Uboundp LBoundc <- Lboundc UBoundc <- Uboundc levelA <- c(round(achieved.confidence.p,6),rep("",length(LBound)-1)) levelB <- c(round(achieved.confidence.c,6),rep("",length(LBound)-1)) outA<- data.frame(L=LBound,U=UBound,level=levelA,row.names=Delta) outB<- data.frame(L=LBoundc,U=UBoundc,level=levelB,row.names=Delta) if(alternative=="greater"){ ellUc <- NA ellUp <- NA ellUXc <- NA ellUXp <- NA } if(alternative=="less"){ ellLc <- NA ellLp <- NA ellLXc <- NA ellLXp <- NA } i.LU <- cbind(ellLp,ellUp,ellLc,ellUc) dimnames(i.LU) <- list(NULL,c("i.L","i.U","i.Lc","i.Uc")) i.LUX <- NA if(method=="asymptotic"){ out <- list(conservative.bounds.asymptotic = outA, closest.bounds.asymptotic=outB) }else{ i.LUX <- cbind(ellLXp,ellUXp,ellLXc,ellUXc) dimnames(i.LUX) <- list(NULL,c("i.L","i.U","i.Lc","i.Uc")) levelAX <- c(round(probXp,6),rep("",length(LboundXp)-1)) levelBX <- c(round(probXc,6),rep("",length(LboundXp)-1)) outAX <- data.frame(L=LboundXp,U=UboundXp,level=levelAX,row.names=Delta) outBX <- data.frame(L=LboundXc,U=UboundXc,level=levelBX,row.names=Delta) if(method == "simulated"){ out <- list(conservative.bounds.asymptotic = outA, closest.bounds.asymptotic=outB, conservative.bounds.simulated = outAX, closest.bounds.simulated=outBX) } if(method == "exact"){ out <- list(conservative.bounds.asymptotic = outA, closest.bounds.asymptotic=outB, conservative.bounds.exact = outAX, closest.bounds.exact=outBX) } } outx <- list(test.name = "Steel.bounds", n1 = n1, ns = ni, N = nsamp,n.ties=n.ties, bounds=out,method=method,Nsim=Nsim,i.LU=i.LU,i.LUX=i.LUX) class(outx) <- "kSamples" outx } kSamples/R/pp.kSamples.R0000644000176200001440000001316512651252010014552 0ustar liggesuserspp.kSamples <- function(x){ if(class(x) != "kSamples") stop('input not of class "kSamples"\n') if(x$test.name == "Anderson-Darling"){ if(is.null(x$null.dist1)) stop("the null.dist1 & null.dist2 components of input are NULL\n") }else{ if(is.null(x$null.dist)) stop("the null.dist component of input is NULL\n") } if(x$test.name == "Anderson-Darling"){ version <- "0" while(substr(version,1,1) != "1" & substr(version,1,1) != "2"){ version <- readline("choose version of test statistic: 1 or 2\n") } if(names(x)[[2]]=="k") { dist <- x[[8+as.numeric(version)]]}else{ dist <- x[[12+as.numeric(version)]] } if(!is.null(dist)){ xy <- table(dist) N <- length(dist) if(names(x)[[2]]=="k") { xx <- (as.numeric(names(xy))-(x$k-1))/x$sig px <- ad.pval(xx,x$k-1,as.numeric(version))}else{ xx <- (as.numeric(names(xy))-(x$mu.c))/x$sig.c px <- ad.pval(xx,x$mu.c,as.numeric(version)) } px.exact <- rev(cumsum(rev(as.numeric(xy)))/N) m <- min(px,px.exact) M <- max(px,px.exact) par(pty="s") if(names(x)[[2]]=="k") titlex <- paste(x$test.name,"k-sample test, version", version) if(names(x)[[2]]=="M") titlex <- paste("combined", x$test.name,"k-sample tests, version", version) plot(px.exact,px,log="xy",xlim=c(m,M), ylim=c(m,M), main= titlex, xlab=paste(x$method,"significance probability"),ylab="asymptotic significance probability",pch=16,cex=.5) text(.055,m,".05",adj=0) abline(0,1,lty=2) abline(h=.05,lty=2) abline(v=.05,lty=2) text(.012,.5,".01",adj=0) abline(h=.01,lty=2) abline(v=.01,lty=2) if(x$method=="simulated"){ text(m,M,paste("Nsim =",x$Nsim),adj=0)}else{ text(m,M,paste("Ncomb =",length(dist)),adj=0) } }else{ cat("no distribution data\n") } } if(x$test.name == "Kruskal-Wallis" | x$test.name == "van der Waerden scores" | x$test.name == "normal scores"){ dist <- x$null.dist if(!is.null(dist)){ xy <- table(dist) N <- length(dist) xx <- as.numeric(names(xy)) if(names(x)[[2]] == "k"){ px <- 1-pchisq(xx,x$k - 1)}else{ df <- sum(sapply(x$n.samples,length))-length(x$n.samples) px <- 1-pchisq(xx,df) } px.exact <- rev(cumsum(rev(as.numeric(xy)))/N) m <- min(px,px.exact) M <- max(px,px.exact) par(pty="s") if(names(x)[[2]]=="k") titlex <- paste(x$test.name,"k-sample test") if(names(x)[[2]]=="M") titlex <- paste("combined", x$test.name,"k-sample tests") plot(px.exact,px,log="xy",xlim=c(m,M), ylim=c(m,M), main= titlex, xlab=paste(x$method,"significance probability"), ylab="asymptotic significance probability",pch=16,cex=.5) text(.055,m,".05",adj=0) abline(0,1,lty=2) abline(h=.05,lty=2) abline(v=.05,lty=2) text(.012,.5,".01",adj=0) abline(h=.01,lty=2) abline(v=.01,lty=2) if(x$method=="simulated"){ text(m,M,paste("Nsim =",x$Nsim),adj=0)}else{ text(m,M,paste("Ncomb =",length(dist)),adj=0) } }else{ cat("no distribution data\n") } } if(x$test.name == "Jonckheere-Terpstra"){ dist <- x$null.dist if(!is.null(dist)){ xy <- table(dist) N <- length(dist) xx <- as.numeric(names(xy)) mu <- x$JT[2] sig <- x$JT[3] px <- 1-pnorm(xx,mu,sig) px.exact <- rev(cumsum(rev(as.numeric(xy)))/N) m <- min(px,px.exact) M <- max(px,px.exact) par(pty="s") titlex <- paste(x$test.name,"k-sample test") plot(px.exact,px,log="xy",xlim=c(m,M), ylim=c(m,M), main= titlex, xlab=paste(x$method,"significance probability"), ylab="asymptotic significance probability",pch=16,cex=.5) text(.055,m,".05",adj=0) abline(0,1,lty=2) abline(h=.05,lty=2) abline(v=.05,lty=2) text(.012,.5,".01",adj=0) abline(h=.01,lty=2) abline(v=.01,lty=2) if(x$method=="simulated"){ text(m,M,paste("Nsim =",x$Nsim),adj=0)}else{ text(m,M,paste("Ncomb =",length(dist)),adj=0) } }else{ cat("no distribution data\n") } } if(x$test.name == "2 x t Contingency Table" | x$test.name == "Combined 2 x t Contingency Tables"){ dist <- x$null.dist if(!is.null(dist)){ xx <- dist[,1] px.exact <- rev(cumsum(rev(dist[,2]))) N <- length(dist[,1]) if(names(x)[[3]] == "M"){ M <- x$M tvec <- x$t df <- sum(tvec)-M }else{ df <- x$t -1 } px <- pchisq(xx,df,lower.tail=FALSE) m <- min(px,px.exact) M <- max(px,px.exact) plot(px.exact,px,log="xy",xlim=c(m,M), ylim=c(m,M), main= x$test.name, xlab=paste(x$method,"significance probability"),ylab="asymptotic significance probability",pch=16,cex=.5) text(.055,m,".05",adj=0) abline(0,1,lty=2) abline(h=.05,lty=2) abline(v=.05,lty=2) text(.008,.5,".01",adj=1) abline(h=.01,lty=2) abline(v=.01,lty=2) if(x$method=="simulated"){ text(m,M,paste("Nsim =",x$Nsim),adj=0)}else{ text(m,M,paste("Nsupp =",length(dist[,1])),adj=0) } }else{ cat("no distribution data\n") } } if(x$test.name == "Steel"){ dist <- x$null.dist if(!is.null(dist)){ xy <- table(dist) N <- length(dist) xx <- as.numeric(names(xy)) tau <- x$tau sig <- x$sig sig0 <- x$sig0 ni <- x$ns[-1] px <- Steelnormal0(sig0,sig,tau,xx,ni, alternative= x$alternative) px.exact <- rev(cumsum(rev(as.numeric(xy)))/N) m <- min(px,px.exact) M <- max(px,px.exact) par(pty="s") titlex <- paste(x$test.name,"k-sample test") plot(px.exact,px,log="xy",xlim=c(m,M), ylim=c(m,M), main= titlex, xlab=paste(x$method,"significance probability"), ylab="asymptotic significance probability",pch=16,cex=.5) text(.055,m,".05",adj=0) abline(0,1,lty=2) abline(h=.05,lty=2) abline(v=.05,lty=2) text(.012,.5,".01",adj=0) abline(h=.01,lty=2) abline(v=.01,lty=2) if(x$method=="simulated"){ text(m,M,paste("Nsim =",x$Nsim),adj=0)}else{ text(m,M,paste("Ncomb =",length(dist)),adj=0) } }else{ cat("no distribution data\n") } } } kSamples/R/io.R0000644000176200001440000000321412660723556013000 0ustar liggesusersio <- function(...,data=NULL) { # This function can take a bunch of numeric sample vectors # in ..., or a list of such, or a formula that specifies a # a response (e.g., y), grouped by a factor, e.g., g, of same # length as y, via y ~ g. # This breaks down y into the desired samples, one sample # for each factor level. xlist <- list(...) if(is(xlist[[1L]], "formula")) { cl <- match.call() # gets a copy of the current call mf <- cl mf[[1L]] <- as.name("model.frame") mf <- eval(mf,parent.frame()) # mf is a data frame consisting of response # and the other model variable mt <- attr(mf, "terms") y <- model.response(mf, "numeric") # response vector fname <- attr(mt,"term.labels") # fname contains the names of explanatory variables if(length(fname) != 1) { stop("Please specify only one term in the formula") } fvec <- as.factor(mf[, fname]) # fvec contains the values of the single explanatory variable # as a factor samples <- lapply(levels(fvec), function(flvl) { return(y[which(fvec == flvl)])} ) } else { # tests whether ... is a list, when not a formula if (is.list(xlist[[1]])) { samples <- xlist[[1]] } else { if( all(unlist(lapply(xlist,FUN=is.numeric))) == FALSE) stop("improper input for ...\n") samples <- lapply(xlist,as.numeric) } } if(length(samples) < 2) stop("fewer than 2 samples\n") if(all(unlist(lapply(samples,FUN=is.numeric))) != TRUE) stop("improper input for ...\n") lapply(samples,as.numeric) } kSamples/R/jt.test.R0000644000176200001440000002146713117074407013766 0ustar liggesusersjt.test <- function (..., data = NULL, method=c("asymptotic","simulated","exact"), dist=FALSE,Nsim=10000) { ############################################################################# # This function "jt.test" tests whether k samples (k>1) come from a common # continuous distribution, using the Jonckheere-Terpstra rank test. See Lehmann (2006), # Nonparametrics, Statistical Methods Based on Ranks. # The test rejects the null hypothesis of no effect when JT is too large, # i.e., a positive trend in the samples (in the order given) seems indicated. # # Ties are handled by using midranks. # While the asymptotic P-value is always returned, there is the option # to get a P-value estimate based on Nsim simulations or an exact value based # on the full enumeration distribution, provided method = "exact" is chosen # and the number of full enumerations is <= the Nsim specified. # If the latter is not the case, simulation is used with the indicated Nsim. # These simulated or exact P-values are appropriate under the continuity # assumption or, when ties are present, they are still appropriate # conditionally on the tied rank pattern, provided randomization took # place in allocating subjects to the respective samples, i.e., also # under random sampling from a common discrete parent population. # However, under ties the results are only meaningful conditionally given # the observed tie pattern. # # # # Inputs: # ...: can either be a sequence of k (>1) sample vectors, # # or a list of k (>1) sample vectors, # # or y, g, where y contains the concatenated # samples and g is a factor which by its levels # identifies the samples in y, # # or a formula y ~ g with y and g as in previous case. # # # data: data frame with variables usable in formula input, default = NULL. # # # method: takes values "asymptotic", "simulated", or "exact". # The value "asymptotic" causes calculation of P-values # using the asymptotic normal approximation, always done. # # The value "simulated" causes estimation of P-values # by randomly splitting the pooled data into # samples of sizes ns[1], ..., ns[k], where # ns[i] is the size of the i-th sample vector, # and n = ns[1] + ... + ns[k] is the pooled sample size. # For each such random split the JT statistic is # computed. This is repeated Nsim times and the proportions # of simulated values >= the actually observed JT value # is reported as P-value estimate. # # The value "exact" enumerates all n!/(ns[1]! * ... * ns[k]) # splits of the pooled sample and computes the JT statistic # for each such split. # The proportion of all enumerated JT statistics # that are >= the actually observed JT value # is reported as exact (conditional) P-value. # # dist: = FALSE (default) or TRUE, TRUE causes the simulated # or fully enumerated vector of the QN statstic to be returned # as null.dist. The length of this vector should not exceed 1e8. # # Nsim: number of simulations to perform, # for method = "exact" to take hold, it needs to be >= # the number of all possible splits of the pooled # data into samples of sizes ns[1], ..., ns[k], where # ns[i] is the size of the i-th sample vector. # # When there are NA's among the sample values they are removed, # with a warning message indicating the number of NA's. # It is up to the user to judge whether such removals make sense. # # An example: # x1 <- c(1,2) # x2 <- c(1.5,2.1) # x3 <- c(1.9,3.1) # jt.test(x1,x2,x3,method="exact",Nsim=90) # or # jt.test(list(x1,x2,x3),method="exact",Nsim=90) # which produces the output below. ############################################################################# # Jonckheere-Terpstra k-sample test. # # Number of samples: 3 # Sample sizes: 2, 2, 2 # Number of ties: 0 # # Null Hypothesis: All samples come from a common population. # Alternative: Samples indicate a positive trend. # # test statistic mu sig asympt. P-value # 9.0000000 6.0000000 2.5166115 0.1166151 # exact P-Value # 0.1666667 # # # Warning: At least one sample size is less than 5, # asymptotic p-values may not be very accurate. ############################################################################# # In order to get the output list, call # JT.out <- jt.test(list(x1,x2,x3),method="exact",dist=T,Nsim=100000) # then JT.out is of class "kSamples" and has components # > names(JT.out) # [1] "test.name" "k" "ns" "N" "n.ties" "JT" # [7] "warning" "null.dist" "method" "Nsim" # # where # test.name = "Jonckheere-Terpstra" # k = number of samples being compared # ns = vector of the k sample sizes ns[1],...,ns[k] # N = ns[1] + ... + ns[k] total sample size # n.ties = number of ties in the combined set of all n observations # JT = 4 (or 5) vector containing the JT statistics, its mean and standard # deviation, its asymptotic P-value, (and its exact or simulated P-value). # warning = logical indicator, warning = TRUE indicates that at least # one of the sample sizes is < 5. # null.dist is a vector of simulated values of the JT statistic # or the full enumeration of such values. # This vector is given when dist = TRUE is specified, # otherwise null.dist = NULL is returned. # method = one of the following values: "asymptotic", "simulated", "exact" # as it was ultimately used. # Nsim = number of simulations used, when applicable. # # The class "kSamples" causes JT.out to be printed in a special output # format when invoked simply as: > JT.out # An example was shown above. # # Fritz Scholz, August 2015 # ################################################################################# JTmusig <- function (rx,ns) { # this function computes the mean and standard deviation of the # JT statistic when ties are present. N <- length(rx) dvec <- as.vector(table(rx)) X1 <- N*(N-1)*(2*N+5) X2 <- sum(ns*(ns-1)*(2*ns+5)) X3 <- sum(dvec*(dvec-1)*(2*dvec+5)) A1 <- (X1-X2-X3)/72 A2 <- sum(ns*(ns-1)*(ns-2))*sum(dvec*(dvec-1)*(dvec-2)) A2 <- A2/(36*N*(N-1)*(N-2)) A3 <- sum(ns*(ns-1))*sum(dvec*(dvec-1))/(8*N*(N-1)) sig <- sqrt(A1+A2+A3) nmat <- outer(ns,ns,"*") mu <- sum(nmat[upper.tri(nmat)])/2 list(mu=mu,sig=sig) } ############################################################## samples <- io(..., data = data) method <- match.arg(method) out <- na.remove(samples) na.t <- out$na.total if( na.t > 1) print(paste("\n",na.t," NAs were removed!\n\n")) if( na.t == 1) print(paste("\n",na.t," NA was removed!\n\n")) samples <- out$x.new k <- length(samples) if (k < 2) stop("Must have at least two samples.") ns <- sapply(samples, length) n <- sum(ns) if (any(ns == 0)) stop("One or more samples have no observations.") rx <- rank(unlist(samples)) JTobs <- 0 pval <- 0 L <- length(unique(rx)) # to count ties if(dist == TRUE) Nsim <- min(Nsim,1e8) # limits the size of null.dist ncomb <- 1 np <- n for(i in 1:(k-1)){ ncomb <- ncomb * choose(np,ns[i]) np <- np-ns[i] } # it is possible that ncomb overflows to Inf if( method == "exact" & Nsim < ncomb ) { method <- "simulated" } if( method == "exact" & dist == TRUE ) nrow <- ncomb if( method == "simulated" & dist == TRUE ) nrow <- Nsim if( method == "simulated" ) ncomb <- 1 # don't need ncomb anymore if(method == "asymptotic"){ Nsim <- 1 dist <- FALSE } useExact <- FALSE if(method == "exact") useExact <- TRUE if(dist == TRUE){ JTvec <- numeric(nrow) }else{ JTvec <- 0 } out <- .C("JTtest", pval=as.double(pval), Nsim=as.integer(Nsim), k=as.integer(k), rx=as.double(rx), ns=as.integer(ns), useExact=as.integer(useExact), getJTdist=as.integer(dist), ncomb=as.double(ncomb),JTobs=as.double(JTobs), JTvec = as.double(JTvec), PACKAGE = "kSamples") JTobs <- out$JTobs musig <- JTmusig(rx,ns) mu <- musig$mu sig <- musig$sig pval <- out$pval if(dist){ JTvec <- round(out$JTvec,8) }else{ JTvec <- NULL } pval.asympt <- 1-pnorm((JTobs - mu)/sig) if(method=="asymptotic"){ JT <- c(JTobs,mu,sig,pval.asympt) }else{ JT <- c(JTobs,mu,sig,pval.asympt,pval) } if(method=="asymptotic"){ names(JT) <- c("test statistic","mu","sig"," asympt. P-value") } if(method=="exact"){ names(JT) <- c("test statistic","mu","sig"," asympt. P-value","exact P-Value") } if(method=="simulated"){ names(JT) <- c("test statistic","mu","sig"," asympt. P-value","sim. P-Value") } warning <- FALSE if(min(ns) < 5) warning <- TRUE if(dist == FALSE) null.dist <- NULL test.name <- "Jonckheere-Terpstra" object <- list(test.name = test.name, k = k, ns = ns, N = n, n.ties = n - L, JT = JT, warning = warning, null.dist = JTvec, method=method, Nsim=Nsim) class(object) <- "kSamples" object } kSamples/R/ad.test.combined.R0000644000176200001440000003315013117073752015506 0ustar liggesusersad.test.combined <- function (..., data = NULL, method = c("asymptotic","simulated","exact"), dist = FALSE, Nsim = 10000) { ############################################################################# # This function ad.test.combined combines several Anderson-Darling # K-sample test statistics AD.m, m = 1,...,M, into one overall test # statistic AD.combined as suggested in Section 8 of Scholz F.W. and # Stephens M.A. (1987), K-sample Anderson-Darling Tests, # Journal of the American Statistical Association, Vol 82, No. 399, # pp. 918-924. # See also the documentation of ad.test for the comparison of a single # set of K samples. # Each application of the Anderson-Darling K-sample test can be # based on a different K > 1. This combined version tests the hypothesis # that all the hypotheses underlying the individual K-sample tests are # true simultaneously. # The individual K-sample test hypothesis is that all samples from # the m-th group come from a common population. However, that population # may be different from one individual K-sample situation to the next. # Such a combined test is useful in # # 1) examining intra-laboratory measurement equivalence, when samples from # the same material or batch are compared for M laboratories and such # comparisons are made for samples from several different materials or # batches and one assessment over all materials/batches is desired. # # 2) analyzing treatment effects in randomized complete or incomplete # block designs. # # When there are NA's among the sample values they are removed, # with a warning message indicating the number of NA's. # It is up to the user to judge whether such removals make sense. # # Input: ... # can take the form of several lists, # say L.1,...,L.M, where list L.i contains # K.i sample vectors of respective sizes n.i[1], ..., n.i[K.i] # (n.i[j] > 4 is recommended) # # or a single list of such lists # # or a data frame with 3 columns, the first column representing # the responses y, the second column a factor g the levels of # which are used to indicate the samples within each block, and # the third column a factor b indicating the block # # or a formula y ~ g | b with y, g, b as in the previous situation, # where y, g, b may be variable names in a provided data frame, say dat, # supplied via data = dat, # # or just the three vectors y, g, b in this order with same meaning. # # data # an optional data frame containing the variable names used in formula input, # default is NULL, in which case the used variables must exist in the calling # environment. # # method # can take one of three values "asymptotic", "simulated", # and "exact", which determines the mode of P-value calculation. # The asymptotic P-value is always returned. # The simulated P-value simulates splits of the pooled samples in # the i-th list L.i into K.i samples of sizes n.i[1], ..., n.i[K.i], # computing the corresponding AD statistic AD.i (both versions), # doing this independently for i = 1,...,M and adding the AD.i # to get the combined statistic AD.comb. This is repeated Nsim # times and the simulated P-value is the proportion of these # values that are >= the observed combined value. # The exact P-value should only be attempted for small M and small # sample sizes and requires that Nsim be set to >= the total number # of AD.comb enumerations. Otherwise Nsim simulations are run # to get a simulated P-value, as described above. # As example consider: M=2 with K.1 = 2, n.1[1] = 5, n.1[2] = 6, # K.2 = 2, n.2[1] = 5, n.2[2] = 7, then we would have # choose(11,5) = 462 splits of the first set of pooled samples # and choose(12,5) = 792 splits of the second set of pooled samples # and thus 462 * 792 = 365904 combinations of AD.1+AD.2 = AD.comb. # Thus we would need to set Nsim >= 365904 to enable exact # exact enumeration evaluation of the P-value. Since these enumerated # values of AD.comb need to be held inside R in a single vector, # we are limited by the object size in R. In simulations the length # of the simulated vector of AD.comb is only Nsim and is manageable. # # dist # takes values FALSE (default) or TRUE, where TRUE enables the # return of the simulated or exact vectors of generated values # for both versions of AD.comb. # Otherwise NULL is returned for both versions # # Nsim = 10000 (default), number of simulations as discussed above. # # # # An example: # x1 <- c(1, 3, 2, 5, 7), x2 <- c(2, 8, 1, 6, 9, 4), and # x3 <- c(12, 5, 7, 9, 11) # y1 <- c(51, 43, 31, 53, 21, 75) and y2 <- c(23, 45, 61, 17, 60) # then # set.seed(2627) # ad.test.combined(list(x1,x2,x3),list(y1,y2),method="simulated",Nsim=100000) # or equivalently ad.test.combined(list(list(x1,x2,x3),list(y1,y2)), # method="simulated",Nsim=100000) # produces the outcome below. ########################################################################## # # Combination of Anderson-Darling K-Sample Tests. # # Number of data sets = 2 # # Sample sizes within each data set: # Data set 1 : 5 6 5 # Data set 2 : 6 5 # Total sample size per data set: 16 11 # Number of unique values per data set: 11 11 # # AD.i = Anderson-Darling Criterion for i-th data set # Means: 2 1 # Standard deviations: 0.92837 0.64816 # # T.i = (AD.i - mean.i)/sigma.i # # Null Hypothesis: # All samples within a data set come from a common distribution. # The common distribution may change between data sets. # # Based on Nsim = 1e+05 simulations # for data set 1 we get # AD T.AD asympt. P-value sim. P-value # version 1: 3.316 1.4176 0.088063 0.09868 # version 2: 3.510 1.6286 0.070278 0.09115 # # for data set 2 we get # AD T.AD asympt. P-value sim. P-value # version 1: 0.37267 -0.96786 0.96305 0.94529 # version 2: 0.33300 -1.02930 0.98520 0.93668 # # # Combined Anderson-Darling Criterion: AD.comb = AD.1+AD.2 # Mean = 3 Standard deviation = 1.13225 # # T.comb = (AD.comb - mean)/sigma # # Based on Nsim = 1e+05 simulations # AD.comb T.comb asympt. P-value sim. P-value # version 1: 3.68867 0.6082333 0.2205062 0.23733 # version 2: 3.84300 0.7445375 0.1902867 0.21825 # # ############################################################################### # For out.ad.combined <- ad.test.combined(list(x1,x2,x3),list(y1,y2)) # or out.ad.combined <- ad.test.combined(list(list(x1,x2,x3),list(y1,y2))) # we get the object out.ad.combined of class ksamples with the following # components # > names(out.ad.combined) # > names(ad.c.out) # [1] "test.name" "M" "n.samples" "nt" "n.ties" # [6] "ad.list" "mu" "sig" "ad.c" "mu.c" # [11] "sig.c" "warning" "null.dist1" "null.dist2" "method" # [16] "Nsim" # where # test.name = "Anderson-Darling" # M = number of sets of samples being compared # n.samples = is a list of the vectors giving the sample sizes for each # set of samples being compared # nt = vector of total sample sizes involved in each of the M comparisons # n.ties = vector of lenth M giving the number of ties in each comparison group # ad.list = list of M data frames for the results for each of the test results # corresponding to the M block # # mu = vector of means of the M AD statistics # sig = vector of standard deviations of the M AD statistics # ad.c = 2 x 3 (or 2 x 4) matrix containing the AD statistics, # standardized AD statistics, its asymptotic P-value, # (and its exact or simulated P-value), for version 1 in the first row # and for version 2 in the second row. # mu.c = mean of the combined AD statistic # sig.c = standard deviation of the combined AD statistic # warning = logical indicator, warning = TRUE when at least one of # the sample sizes is < 5. # null.dist1 enumerated or simulated values of AD statistic, version 1 # null.dist2 enumerated or simulated values of AD statistic, version 2 # method the method that was used: "asymptotic", "simulated", "exact". # Nsim the number of simulations that were used. # # Fritz Scholz, August 2012 ##################################################################################### convvec <- function(x1,x2){ #---------------------------------------------------- # R routine for calling convvec in conv.c # created 02.05.2012 Fritz Scholz #---------------------------------------------------- n1 <- length(x1) n2 <-length(x2) n <- n1*n2 x <- numeric(n) out <- .C("convvec", x1=as.double(x1), n1=as.integer(n1), x2=as.double(x2),n2=as.integer(n2), x=as.double(x),n=as.integer(n), PACKAGE= "kSamples") out$x } # the following converts individual data sets into a list of such, # if not already in this form. data.sets <- io2(...,data=data) data.sets <- test.list(data.sets) # end of data.sets list conversion method <- match.arg(method) n.sizes <- NULL M <- length(data.sets) # number of data sets (blocks) n.data <- sapply(data.sets, length) # gets vector of number of samples in each component of data.sets n.samples <- list() # intended to contain vectors of sample sizes for # for each respective data set. na.t <- 0 # intended to tally total of NA cases ncomb <- 1 # intended to hold the total number of evaluations # of the full convolution distribution, to check # whether exact computation is reasonable. for(i in 1:M){ out <- na.remove(data.sets[i]) na.t<- na.t + out$na.total data.sets[i] <- out$x.new # replace data set i with the one that has # NA's removed n.sample <- sapply(data.sets[[i]], length) # contains sample sizes for i-th data set n.sizes <- c(n.sizes, n.sample) # accumulates all sample size, warning purpose only if(any(n.sample==0)) stop(paste("one or more samples in data set", i, "has no observations")) n.samples[[i]] <- n.sample N <- sum(n.sample) # total observations in i-th data set k <- length(n.sample) # number of samples in i-th data set # updating ncomb ncomb <- ncomb * choose(N,n.sample[1]) for(j in 2:k){ N <- N-n.sample[j-1] ncomb <- ncomb * choose(N,n.sample[j]) } # end of ncomb update for data set i } Nsim <- min(Nsim,1e7) if(ncomb > Nsim & method == "exact") method <- "simulated" if( na.t > 1) cat(paste("\n",na.t," NAs were removed!\n\n")) if( na.t == 1) cat(paste("\n",na.t," NA was removed!\n\n")) warning <- min(n.sizes) < 5 # set warning flag for caution on # trusting asymptotic p-values # Initializing output objects AD <- 0 sig <- NULL n.ties <- NULL nt <- NULL mu <- NULL ad.list <- list() mu.c <- 0 dist1 <- NULL dist2 <- NULL if(method == "asymptotic"){dist0 <- FALSE}else{dist0 <- TRUE} # the following loop aggregates the (estimated or exact) # convolution distribution of the combined AD statistic versions for(i in 1:M){ out <- ad.test(data.sets[[i]],method=method,dist=dist0,Nsim=Nsim) if(dist0==TRUE){ if(i == 1){ dist1 <- out$null.dist1 dist2 <- out$null.dist2 }else{ if(method=="simulated"){ dist1 <- dist1+out$null.dist1 dist2 <- dist2+out$null.dist2 }else{ dist1 <- convvec(dist1,out$null.dist1) dist2 <- convvec(dist2,out$null.dist2) } } } ad.list[[i]] <- out$ad sig.i <- out$sig mu <- c(mu, length(data.sets[[i]])-1) AD.i <- out$ad[,1] sig <- c(sig, sig.i) # accumulated stand. dev.'s of AD stats AD <- AD+AD.i # aggregates combined AD stats (version 1 and 2) mu.c <- mu.c + length(data.sets[[i]]) - 1 # aggregates mean of combined AD stats n.ties <- c(n.ties, out$n.ties) nt <- c(nt, sum(out$ns)) # accumulates total observations in data sets } AD <- as.numeric(AD) # get exact or simulated P-value if(dist0==T){ nrow <- length(dist1) ad.pval1.dist <- sum(dist1 >= AD[1])/nrow ad.pval2.dist <- sum(dist2 >= AD[2])/nrow } sig.c <- sqrt(sum(sig^2)) # standard deviation of combined AD stats if(sig.c >0){ tc.obs <- (AD - mu.c)/sig.c # standardized values of AD stats }else{ tc.obs <- NA } # get asymptotic P-value if(sig.c >0){ ad.pval1 <- ad.pval(tc.obs[1], mu.c,1) ad.pval2 <- ad.pval(tc.obs[2], mu.c,2) }else{ ad.pval1 <- 1 ad.pval2 <- 1 } if(method == "asymptotic"){ ad.c <- matrix(c(signif(AD[1],7),signif(tc.obs[1],7),round(ad.pval1,7), signif(AD[2],7),signif(tc.obs[2],7), round(ad.pval2,7)), byrow=TRUE, ncol=3) dimnames(ad.c) <- list(c("version 1:","version 2:"), c("AD.comb","T.comb"," asympt. P-value")) } if(method == "exact"){ ad.c <- matrix(c(signif(AD[1],7),signif(tc.obs[1],7),round(ad.pval1,7), round(ad.pval1.dist,7), signif(AD[2],7),signif(tc.obs[2],7), round(ad.pval2,7), round(ad.pval2.dist,7)),byrow=TRUE, ncol=4) dimnames(ad.c) <- list(c("version 1:","version 2:"), c("AD.comb","T.comb"," asympt. P-value"," exact P-value")) } if(method == "simulated"){ ad.c <- matrix(c(signif(AD[1],7),signif(tc.obs[1],7),round(ad.pval1,7), round(ad.pval1.dist,7), signif(AD[2],7),signif(tc.obs[2],7), round(ad.pval2,7), round(ad.pval2.dist,7)),byrow=TRUE, ncol=4) dimnames(ad.c) <- list(c("version 1:","version 2:"), c("AD.comb","T.comb"," asympt. P-value"," sim. P-value")) } if(dist==FALSE){ dist1 <- NULL dist2 <- NULL } object <- list(test.name ="Anderson-Darling", M=M, n.samples=n.samples, nt=nt, n.ties=n.ties, ad.list=ad.list, mu=mu, sig=sig, ad.c = ad.c, mu.c=mu.c, sig.c=round(sig.c,5), warning=warning,null.dist1=dist1, null.dist2=dist2,method=method,Nsim=Nsim) class(object) <- "kSamples" object } kSamples/R/Harding.R0000644000176200001440000000062413117074222013732 0ustar liggesusersHarding <- function (nn) { nn <- sort(as.integer(nn)) nvec <- rev(cumsum(rev(nn))) k <- length(nn) L1 <- sum(nn[1:(k-1)]*nvec[2:k])+1 freq <- double(L1) out <- .C("Harding0", k=as.integer(k), L1=as.integer(L1), nn=as.integer(nn), nvec=as.integer(nvec), freq=as.double(freq), PACKAGE = "kSamples") freq <- out$freq if(is.nan(sum(freq)) || abs(sum(freq)-1) > .0000001) freq <- NaN freq } kSamples/R/Steelnormal.R0000644000176200001440000000343612611026754014654 0ustar liggesusersSteelnormal <- function(mu,sig0,sig,tau,Wvec,ni, alternative=c("greater","less","two-sided"),continuity.corr=TRUE){ # this function computes the normal approximation of the p-value for the Steel test, # based on the sizes ni = c(n1,...,nk) of the k treatment samples # based on the vector of Mann-Whitney statistics comparing the treatment sample values (Y) # against the common control sample values (X), Wvec consists of k such comparison statistics # counting X_i < Y_j and 0.5 of X_i = Y_j. # mu , sig0, sig, and, tau are parameters required for the power evaluation. alternative <- match.arg(alternative) if(continuity.corr==TRUE){ cont.corr <- .5 }else{ cont.corr <- 0 } k <- length(ni) if(alternative=="greater"){ Sx <- max((Wvec-mu)/tau) i0 <- min((1:k)[Sx == (Wvec-mu)/tau]) S <- (Wvec[i0]-cont.corr-mu[i0])/tau[i0] funz <- function(z,k,sig0,sig,tau,S,ni){ fac <- 1 for(i in 1:k){ fac <- fac * pnorm((S*tau[i]-ni[i]*sig0*z)/sig[i]) } dnorm(z)*fac } pval <- 1-integrate(funz,-Inf,Inf,k,sig0,sig,tau,S,ni)$value } if(alternative=="less"){ Sx <- min((Wvec-mu)/tau) i0 <- min((1:k)[Sx == (Wvec-mu)/tau]) S <- (Wvec[i0]+cont.corr-mu[i0])/tau[i0] funz <- function(z,k,sig0,sig,tau,S,ni){ fac <- 1 for(i in 1:k){ fac <- fac * (1-pnorm((S*tau[i]-ni[i]*sig0*z)/sig[i])) } dnorm(z)*fac } pval <- 1-integrate(funz,-Inf,Inf,k,sig0,sig,tau,S,ni)$value } if(alternative=="two-sided"){ Sx <- max(abs(Wvec-mu)/tau) i0 <- min((1:k)[Sx == abs(Wvec-mu)/tau]) S <- (abs(Wvec[i0]-mu[i0])-cont.corr)/tau[i0] funz <- function(z,k,sig0,sig,tau,S,ni){ fac <- 1 for(i in 1:k){ fac <- fac * (pnorm((S*tau[i]-ni[i]*sig0*z)/sig[i])- pnorm((-S*tau[i]-ni[i]*sig0*z)/sig[i])) } dnorm(z)*fac } pval <- 1-integrate(funz,-Inf,Inf,k,sig0,sig,tau,S,ni)$value } pval } # end of Steelnormal kSamples/R/Steel.test.R0000644000176200001440000002570113117074704014420 0ustar liggesusersSteel.test <- function (..., data = NULL, method=c("asymptotic","simulated","exact"), alternative = c("greater","less","two-sided"), dist=FALSE,Nsim=10000) { ############################################################################# # This function "Steel.test" tests whether k-1 samples (k>1) come from the # same continuous distribution as the control sample, taking as test statistic # the maximum standardized Wilcoxon test statistic, or the # minimum standardized Wilcoxon test statistic, or the # maximum absolute standardized Wilcoxon test statistic, # (the Steel statistic) for all two sample # comparisons with the control sample. See Lehmann (2006), # Nonparametrics, Statistical Methods Based on Ranks, Chap. 5.5. # While the asymptotic P-value is always returned, there is the option # to get an estimated P-value based on Nsim simulations or an exact P-value # value based on the full enumeration distribution, provided method = "exact" # is chosen and the number of full enumerations is <= Nsim, as specified. # The latter makes sure that the user is aware of the computation effort involved. # If the number of full enumerations is > Nsim, simulation is used with the # indicated Nsim. # These asymptotic, simulated or exact P-values are appropriate under the # continuity assumption or, when ties are present, they are still appropriate # conditionally given the tied rank pattern, provided randomization took # place in allocating subjects to the respective samples, i.e., also # under random sampling from a common discrete parent population. # # # # Inputs: # ...: can either be a sequence of k (>1) sample vectors, # # or a list of k (>1) sample vectors, # # or y, g, where y contains the concatenated # samples and g is a factor which by its levels # identifies the samples in y, # # or a formula y ~ g with y and g as in previous case. # # method: takes values "asymptotic", "simulated", or "exact". # The value "asymptotic" causes calculation of P-values based # on the multivariate normal approximation for the correlated # rank sums. This is always done, but when "asymptotic" is # is specified, that is all that's done. Useful for large samples, # or when a fast P-value assessment is desired. # # The value "simulated" causes estimation of P-values # by randomly splitting the pooled data into # samples of sizes ns[1], ..., ns[k], where # ns[i] is the size of the i-th sample vector, # and n = ns[1] + ... + ns[k] is the pooled sample size. # For each such random split the Steel statistic is # computed. This is repeated Nsim times and the proportions # of simulated values exceeding the actually observed Steel value # in the appropriate direction is reported as P-value estimate. # # The value "exact" enumerates all ncomb = n!/(ns[1]! * ... * ns[k]!) # splits of the pooled sample and computes the Steel statistic. # The proportion of all enumerated Steel statistics exceeding # the actually observed Steel statistic value in the appropriate # direction is reported as exact (conditional) P-value. # This is only done when ncomb <= Nsim. # # alternative: takes values "greater", "less", and "two-sided". # # For the value "greater" the maximum standardized treatment # rank sum is used as test statistic, using conditional means and # standard deviations given the overall tie pattern among all # n observations for standardization. The test rejects for # large values of this maximum. # # For the value "less" the minimum standardized treatment # rank sum is used as test statistic. The test rejects # for low values of minimum. # # For the value "two-sided" the maximum absolute standardized # treatment rank sum is used as test statistic. The test rejects # for large values of this maximum. # # # dist: = FALSE (default) or TRUE, TRUE causes the simulated # or fully enumerated vector of the Steel statistic # to be returned as null.dist. TRUE should be used # judiciously, keeping in mind the size ncomb or Nsim # of the returned vector. # # Nsim: number of simulations to perform, # # # When there are NA's among the sample values they are removed, # with a warning message indicating the number of NA's. # It is up to the user to judge whether such removals make sense. # # An example: using the data from Steel's paper on the effect of # birth conditions on IQ # z1 <- c(103, 111, 136, 106, 122, 114) # z2 <- c(119, 100, 97, 89, 112, 86) # z3 <- c( 89, 132, 86, 114, 114, 125) # z4 <- c( 92, 114, 86, 119, 131, 94) #set.seed(27) #Steel.test(z1,z2,z3,z4,method="simulated", # alternative="less",Nsim=100000) # or #Steel.test(list(z1,z2,z3,z4),method="simulated", # alternative="less",Nsim=100000) #produces the output below. # # Steel Mutiple Wilcoxon Test: k treatments against a common control # # #Number of samples: 4 #Sample sizes: 6, 6, 6, 6 #Number of ties: 7 # # #Null Hypothesis: All samples come from a common population. # #Based on Nsim = 1e+05 simulations # # test statistic asympt. P-value sim. P-Value # -1.77126551 0.09459608 0.10474000 ############################################################################# # In order to get the output list, call # st.out <- Steel.test(list(z1,z2,z3,z4),method="simulated", # alternative="less",Nsim=100000) # then st.out is of class ksamples and has components # names(st.out) # > names(st.out) # [1] "test.name" "k" "ns" "N" "n.ties" "st" # [7] "warning" "null.dist" "method" "Nsim" "mu" "sig0" # [13] "sig" "tau" "W" # # where # test.name = "Steel" # k = number of samples being compared, including the control sample # ns = vector of the k sample sizes ns[1],...,ns[k] # N = ns[1] + ... + ns[k] total sample size # n.ties = number of ties in the combined set of all N observations # st = 2 (or 3) vector containing the Steel statistics, its asymptotic P-value, # (and its exact or simulated P-value). # warning = logical indicator, warning = TRUE indicates that at least # one of the sample sizes is < 5. # null.dist is a vector of simulated values of the Steel statistic # or the full enumeration of such values. # This vector is given when dist = TRUE is specified, # otherwise null.dist = NULL is returned. # method = one of the following values: "asymptotic", "simulated", "exact" # as it was ultimately used. # Nsim = number of simulations used, when applicable. # mu = the vector of means for the Mann-Whitney statistics W.XY # sig0 = standard deviation of V.0, when W.X1Xi are viewed as n.i^2 * V.0 + V.i # with V.0, V.1, ..., V.(k-1) independent with means zero. # sig = vector of standard deviations ofV.1, ..., V.(k-1) # tau = vector of standard deviations of W.X1X2, ..., W.X1Xk # all these means and standard deviations are conditional on the tie # pattern and are either used in the standardization of the W.X1Xi # or in the normal approximation. # W = vector of Mann-Whitney statistics W.X1X2, ..., W.X1Xk # # The class ksamples causes st.out to be printed in a special output # format when invoked simply as: > st.out # An example was shown above. # # Fritz Scholz, May 2012 # ################################################################################# samples <- io(..., data = data) method <- match.arg(method) alternative <- match.arg(alternative) if(alternative=="greater") alt <- 1 if(alternative=="less") alt <- -1 if(alternative=="two-sided") alt <- 0 out <- na.remove(samples) na.t <- out$na.total if( na.t > 1) print(paste("\n",na.t," NAs were removed!\n\n")) if( na.t == 1) print(paste("\n",na.t," NA was removed!\n\n")) samples <- out$x.new k <- length(samples) if (k < 2) stop("Must have at least two samples.") ns <- sapply(samples, length) n <- sum(ns) if (any(ns == 0)) stop("One or more samples have no observations.") x <- unlist(samples) # reverse the sign of x to change testing alt = -1 to alt = 1 if(alt==-1){ x <- -x alt <- 1 } Wvec <- numeric(k-1) xst <- 0 for(i in 2:k){ xst <- xst + ns[i-1] Wvec[i-1] <- sum( rank( c(x[1:ns[1]],x[xst+1:ns[i]]) )[ns[1]+1:ns[i]])-ns[i]*(ns[i]+1)/2 } Steelobs <- 0 pval <- 0 rx <- rank(x) dvec <- as.vector(table(rx)) d2 <- sum(dvec*(dvec-1)) d3 <- sum(dvec*(dvec-1)*(dvec-2)) n2 <- n*(n-1) n3 <- n2*(n-2) n0 <- ns[1] ni <- ns[-1] sig02 <- (n0/12)*(1-d3/n3) sig0 <- sqrt(sig02) sig2 <- (n0*ni/12)*(n0+1-3*d2/n2-(n0-2)*d3/n3) sig <- sqrt(sig2) tau <- sqrt(ni^2 * sig02 + sig2) mu <- n0*ni/2 L <- length(unique(rx)) # computing total number of combination splits if(dist == TRUE) Nsim <- min(Nsim,1e8) # limits the size of null.dist # whether method = "exact" or = "simulated" ncomb <- 1 np <- n for(i in 1:(k-1)){ ncomb <- ncomb * choose(np,ns[i]) np <- np-ns[i] } # it is possible that ncomb overflows to Inf if( method == "exact" & Nsim < ncomb) { method <- "simulated" } if( method == "exact" & dist == TRUE ) nrow <- ncomb if( method == "simulated" & dist == TRUE ) nrow <- Nsim if( method == "simulated" ) ncomb <- 1 # don't need ncomb anymore if(method == "asymptotic"){ Nsim <- 1 dist <- FALSE } useExact <- FALSE if( method == "exact") useExact <- TRUE if(useExact){nrow <- ncomb} if(dist==TRUE){ Steelvec <- numeric(nrow) }else{ Steelvec <- 0 } out <- .C("Steeltest", pval=as.double(pval), Nsim=as.integer(Nsim), k=as.integer(k), rx=as.double(rx), ns=as.integer(ns), useExact=as.integer(useExact), getSteeldist=as.integer(dist), ncomb=as.double(ncomb), alt=as.integer(alt), mu=as.double(mu), tau=as.double(tau), Steelobs=as.double(Steelobs), Steelvec = as.double(Steelvec), PACKAGE = "kSamples") Steelobs <- out$Steelobs # changes Steelobs back to make up for earlier sign change if(alternative == "less") Steelobs <- -Steelobs pval <- out$pval if(dist){ Steelvec <- out$Steelvec if(alternative == "less") Steelvec <- - Steelvec } if(dist == F | method == "asymptotic") Steelvec <- NULL if(alternative != "less"){ pval.asympt <- Steelnormal(mu,sig0,sig,tau,Wvec,ni,alternative)}else{ pval.asympt <- Steelnormal(mu,sig0,sig,tau,Wvec,ni,"greater") } if(method=="asymptotic"){ st <- c(Steelobs,pval.asympt) }else{ st <- c(Steelobs,pval.asympt,pval) } if(method=="asymptotic"){ names(st) <- c("test statistic"," asympt. P-value") } if(method=="exact"){ names(st) <- c("test statistic"," asympt. P-value","exact P-Value") } if(method=="simulated"){ names(st) <- c("test statistic"," asympt. P-value","sim. P-Value") } warning <- FALSE if(min(ns) < 5) warning <- TRUE if(dist == FALSE) null.dist <- NULL object <- list(test.name = "Steel", k = k, alternative = alternative, ns = ns, N = n, n.ties = n - L, st = st, warning = warning, null.dist = Steelvec, method=method, Nsim=Nsim, W=Wvec, mu=mu, tau=tau, sig0=sig0, sig=sig) class(object) <- "kSamples" object } kSamples/R/qn.test.combined.R0000644000176200001440000003002513117074562015536 0ustar liggesusersqn.test.combined <- function (...,data = NULL, test = c("KW","vdW","NS"), method=c("asymptotic","simulated","exact"), dist=FALSE,Nsim=10000) { ############################################################################# # This function qn.test.combined combines several QN K-sample test statistics # QN.m, m = 1,...,M, into one overall test # statistic QN.combined as suggested for the Kruskal-Wallis test in # Lehmann, E.L. (2006), Nonparametrics, Statistical Methods Based on Ranks, # Ch. 6, Sec. 5D. # See also the documentation of qn.test for the comparison of a single # set of K samples. # Each application of the QN K-sample test can be based on a different K > 1. # This combined version tests the hypothesis that all the hypotheses # underlying the individual K-sample tests are # true simultaneously. # The individual K-sample test hypothesis is that all samples from # the m-th group come from a common population. However, that population # may be different from one individual K-sample situation to the next. # Such a combined test is useful in # # 1) examining intra-laboratory measurement equivalence, when samples from # the same material or batch are compared for M laboratories and such # comparisons are made for samples from several different materials or # batches and one assessment over all materials/batches is desired. # # 2) analyzing treatment effects in randomized complete or incomplete # block designs. # # When there are NA's among the sample values they are removed, # with a warning message indicating the number of NA's. # It is up to the user to judge whether such removals make sense. # # Input: ... # can take the form of several lists, # say L.1,...,L.M, where list L.i contains # K.i sample vectors of respective sizes n.i[1], ..., n.i[K.i] # (n.i[j] > 4 is recommended) # # or a single list of such lists # # or a data frame with first column representing the responses y, # the second column a factor g the levels of which are used to # indicate the samples within each block, and the third column # a factor b indicating the block # # or a formula y ~ g | b with y, g, b as in the previous situation # # or just the three vectors y, g, b in this order with same meaning. # # test: specifies the ranks scores to be used, averaging the scores # of tied observations. # test = "KW" uses scores 1:N ( ==> Kruskal-Wallis test) # test = "vdW" uses the van der Waerden scores qnorm(1:N/(N+1)) # test = "NS" uses normal scores, expected standard normal order # statistics, uses function normOrder of package SuppDists. # Other scores could easily be added to this function and # to qn.test. # # method # can take one of three values "asymptotic", "simulated", # and "exact", which determines the mode of P-value calculation. # The asymptotic P-value based on the chi-square approximation # is always returned. # The simulated P-value simulates splits of the pooled samples in # the i-th list L.i into K.i samples of sizes n.i[1], ..., n.i[K.i], # computing the corresponding QN statistic QN.i, # doing this independently for i = 1,...,M and adding the QN.i # to get the combined statistic QN.comb. This is repeated Nsim # times and the simulated P-value is the proportion of these # values that are >= the observed combined value. # The exact P-value should only be attempted for small M and small # sample sizes and requires that Nsim be set to >= the total number # of QN.comb enumerations. Otherwise Nsim simulations are run # to get a simulated P-value, as described above. # As example consider: M=2 with K.1 = 2, n.1[1] = 5, n.1[2] = 6, # K.2 = 2, n.2[1] = 5, n.2[2] = 7, then we would have # choose(11,5) = 462 splits of the first set of pooled samples # and choose(12,5) = 792 splits of the second set of pooled samples # and thus 462 * 792 = 365904 combinations of QN.1+QN.2 = QN.comb. # Thus we would need to set Nsim >= 365904 to enable exact # exact enumeration evaluation of the P-value. Since these enumerated # values of QN.comb need to be held inside R in a single vector, # we are limited by the object size in R. In simulations the length # of the simulated vector of QN.comb is only Nsim and is manageable. # # dist # takes values FALSE (default) or TRUE, where TRUE enables the # return of the simulated or exact vectors of generated values # of QN.comb. # Otherwise NULL is returned for both versions # # Nsim # = 10000 (default), number of simulations as discussed above. # # # # An example: # x1 <- c(1, 3, 2, 5, 7) # x2 <- c(2, 8, 1, 6, 9, 4) # x3 <- c(12, 5, 7, 9, 11) # y1 <- c(51, 43, 31, 53, 21, 75) # y2 <- c(23, 45, 61, 17, 60) # then # set.seed(2627) # qn.test.combined(list(x1,x2,x3),list(y1,y2),test="KW", method="simulated",Nsim=100000) # or equivalently qn.test.combined(list(list(x1,x2,x3),list(y1,y2)), # test="KW",method="simulated",Nsim=100000) # produces the outcome below. ########################################################################## # # Combination of Kruskal-Wallis K-Sample Tests. # # Number of data sets = 2 # # Sample sizes within each data set: # Data set 1 : 5 6 5 # Data set 2 : 6 5 # Total sample size per data set: 16 11 # Number of unique values per data set: 11 11 # # Null Hypothesis: # All samples within a data set come from a common distribution. # The common distribution may change between data sets. # # Based on Nsim = 1e+05 simulations # for data set 1 we get # QN asympt. P-value sim. P-Value # 5.64851852 0.05935261 0.05156000 # # for data set 2 we get # QN asympt. P-value sim. P-Value # 0.03333333 0.85513214 0.93001000 # # Combined Kruskal-Wallis Criterion: QN.combined = QN.1+QN.2 # # Based on Nsim = 1e+05 simulations # QN.comb asympt. P-value sim. P-Value # 5.6818519 0.1281575 0.1216000 # ############################################################################### # For out.qn.combined <- qn.test.combined(list(x1,x2,x3),list(y1,y2)) # or out.qn.combined <- qn.test.combined(list(list(x1,x2,x3),list(y1,y2))) # we get the object out.qn.combined of class ksamples with the following # components # > names(qn.combined.out) # [1] "test.name" "M" "n.samples" "nt" "n.ties" "qn.list" # [7] "qn.c" "warning" "null.dist" "method" "Nsim" # where # test.name "Kruskal-Wallis", "van der Waerden scores", or "normal scores" # M number of sets of samples being compared # n.samples = is a list of the vectors giving the sample sizes for each # set of samples being compared # nt vector of total sample sizes involved in each of the M comparisons # n.ties vector of lenth M giving the number of ties in each comparison group # qn.list list of M data frames for the results for each of the test results # corresponding to the M block # qn.c 2 (or 3) vector containing QN.obs, asymptotic P-value, # (and simulated or exact P-value) for the combined test. # Here qn.obs is the observed value of QN.comb and # P-value = P(QN.comb >= qn.obs). # warning logical indicator, warning = TRUE when at least one of # the sample sizes is < 5. # null.dist vector of simulated or fully enumerated QN statistics, if requested, # otherwise it is NULL # method one of the following values: "asymptotic", "simulated", "exact" # as it was ultimately used. # Nsim number of simulations used, when applicable. # Nsim # # Fritz Scholz, April 2012 ##################################################################################### convvec <- function(x1,x2){ #---------------------------------------------------- # R routine for calling convvec in conv.c # created 02.05.2012 Fritz Scholz #---------------------------------------------------- n1 <- length(x1) n2 <-length(x2) n <- n1*n2 x <- numeric(n) out <- .C("convvec", x1=as.double(x1), n1=as.integer(n1), x2=as.double(x2),n2=as.integer(n2), x=as.double(x),n=as.integer(n), PACKAGE = "kSamples") out$x } # the following converts individual data sets into a list of such, # if not already in this form. It drops blocks (sublist) with at # most one sample in it. data.sets <- io2(...,data = data) data.sets <- test.list(data.sets) # end of data.sets list conversion test <- match.arg(test) method <- match.arg(method) n.sizes <- NULL M <- length(data.sets) # number of data sets n.data <- sapply(data.sets, length) # gets vector of number of samples in each component of data.sets n.samples <- list() # intended to contain vectors of sample sizes for # for each respective data set. na.t <- 0 # intended to tally total of NA cases ncomb <- 1 # intended to hold the total number of evaluations # of the full convolution distribution, to check # whether exact computation is reasonable. for(i in 1:M){ out <- na.remove(data.sets[i]) na.t<- na.t + out$na.total data.sets[i] <- out$x.new # replace data set i with the one that has # NA's removed n.sample <- sapply(data.sets[[i]], length) # contains sample sizes for i-th data set n.sizes <- c(n.sizes, n.sample) # accumulates all sample size, warning purpose only if(any(n.sample==0)) stop(paste("one or more samples in data set", i, "has no observations")) n.samples[[i]] <- n.sample N <- sum(n.sample) # total observations in i-th data set k <- length(n.sample) # number of samples in i-th data set # updating ncomb ncomb <- ncomb * choose(N,n.sample[1]) for(j in 2:k){ N <- N-n.sample[j-1] ncomb <- ncomb * choose(N,n.sample[j]) } # end of ncomb update for data set i } if(ncomb > Nsim & method == "exact") method <- "simulated" if( na.t > 1) cat(paste("\n",na.t," NAs were removed!\n\n")) if( na.t == 1) cat(paste("\n",na.t," NA was removed!\n\n")) warning <- min(n.sizes) < 5 # set warning flag for caution on # trusting asymptotic p-values # Initializing output objects QNobs <- 0 sig <- NULL n.ties <- NULL nt <- NULL mu <- NULL qn.list <- list() mu.c <- 0 null.dist <- NULL if(method == "asymptotic"){dist0 <- FALSE}else{dist0 <- TRUE} # the following loop aggregates the (estimated or exact) # convolution distribution of the combined QN statistic versions for(i in 1:M){ out <- qn.test(data.sets[[i]],test=test,method=method,dist=dist0,Nsim=Nsim) if(dist0==T){ if(i == 1){ null.dist <- out$null.dist }else{ if(method=="simulated"){ null.dist <- null.dist+out$null.dist }else{ null.dist <- convvec(null.dist,out$null.dist) } } } qn.list[[i]] <- out$qn mu <- c(mu, length(data.sets[[i]])-1) QNobs <- QNobs + out$qn[1] # aggregates combined QN stats n.ties <- c(n.ties, out$n.ties) nt <- c(nt, sum(out$ns)) # accumulates total observations in data sets } # get exact or simulated P-value if(dist0==T){ nrow <- length(null.dist) pval <- sum(null.dist >= QNobs)/nrow } # get asymptotic P-value pval.asympt <- 1-pchisq(QNobs, sum(mu)) if(method=="asymptotic"){ qn.c <- c(QNobs,pval.asympt) }else{ qn.c <- c(QNobs,pval.asympt,pval) } if(method=="asymptotic"){ names(qn.c) <- c("comb.statistic"," asympt. P-value") } if(method=="exact"){ names(qn.c) <- c("comb.statistic"," asympt. P-value","exact P-Value") } if(method=="simulated"){ names(qn.c) <- c("comb.statistic"," asympt. P-value","sim. P-Value") } if(dist==FALSE){ null.dist <- NULL } if(test == "vdW") test.name <- "van der Waerden scores" if(test == "NS") test.name <- "normal scores" if(test == "KW") test.name <- "Kruskal-Wallis" object <- list(test.name =test.name, M = M, n.samples=n.samples, nt=nt, n.ties=n.ties, qn.list=qn.list, qn.c = qn.c, warning=warning,null.dist=null.dist, method=method,Nsim=Nsim) class(object) <- "kSamples" object } kSamples/R/ad.test.R0000644000176200001440000002476013117073657013742 0ustar liggesusersad.test <- function (..., data = NULL, method=c("asymptotic","simulated","exact"),dist=FALSE,Nsim=10000) { ############################################################################# # This function "ad.test" tests whether k samples (k>1) come from a common # continuous distribution, using the nonparametric (rank) test described in # Scholz F.W. and Stephens M.A. (1987), K-sample Anderson-Darling Tests, # Journal of the American Statistical Association, Vol 82, No. 399, # pp. 918-924. # This test is consistent against all alternatives. # Ties are handled by using midranks, and according to the above # reference two versions of the test statistic are returned. # They are labeled version 1 and version 2, in the order introduced # in the above reference. # While the asymptotic P-value is always returned, there is the option # to get an estimate based on Nsim simulations or an exact value based # on the full enumeration distribution, provided method = "exact" is chosen # and the number of full enumerations is <= the Nsim specified. # If the latter is not the case, simulation is used with the indicated Nsim. # These simulated or exact P-values are appropriate under the continuity # assumption or, when ties are present, they are still appropriate # conditionally on the tied rank pattern, provided randomization took # place in allocating subjects to the respective samples, i.e., also # under random sampling from a common discrete parent population. # # # # Inputs: # ...: can either be a sequence of k (>1) sample vectors, # # or a list of k (>1) sample vectors, # # or y, g, where y contains the concatenated # samples and g is a factor which by its levels # identifies the samples in y, # # or a formula y ~ g with y and g as in previous case. # # # data: data frame with variables usable in formula input, default = NULL. # # method: takes values "asymptotic", "simulated", or "exact". # The value "asymptotic" causes calculation of P-values # using the asymptotic approximation, always done. # # The value "simulated" causes estimation of P-values # by randomly splitting the the pooled data into # samples of sizes ns[1], ..., ns[k], where # ns[i] is the size of the i-th sample vector, # and n = ns[1] + ... + ns[k] is the pooled sample size. # For each such random split the AD statistics are # computed. This is repeated Nsim times and the proportions # of simulated values >= the respective actually # observed AD values are reported as P-value estimates. # # The value "exact" enumerates all n!/(ns[1]! * ... * ns[k]) # splits of the pooled sample and computes the respective # AD statistics. The proportion of all enumerated AD statistics # which are >= the respective actually observed AD values # are reported as exact P-values. # # dist: = FALSE (default) or TRUE, TRUE causes the simulated # or fully enumerated vectors of both AD statstics to be returned # as null.dist1 and null.dist2. # # Nsim: number of simulations to perform, # for method = "exact" to take hold, it needs to be at least # equal the number of all possible splits of the pooled # data into samples of sizes ns[1], ..., ns[k], where # ns[i] is the size of the i-th sample vector. # # When there are NA's among the sample values they are removed, # with a warning message indicating the number of NA's. # It is up to the user to judge whether such removals make sense. # # An example: # z1 <- c(0.824, 0.216, 0.538, 0.685) # z2 <- c(0.448, 0.348, 0.443, 0.722) # z3 <- c(0.403, 0.268, 0.440, 0.087) # ad.test(z1,z2,z3,method="exact",dist=T,Nsim=100000) # or # ad.test(list(z1,z2,z3),method="exact",dist=T,Nsim=100000) # which produces the output below. ############################################################################# # Anderson-Darling k-sample test. # # Number of samples: 3 # Sample sizes: 4, 4, 4 # Number of ties: 0 # # Mean of Anderson-Darling Criterion: 2 # Standard deviation of Anderson-Darling Criterion: 0.88133 # # T.AD = ( Anderson-Darling Criterion - mean)/sigma # # Null Hypothesis: All samples come from a common population. # # AD T.AD asympt. P-value exact P-value # version 1: 2.6367 0.72238 0.18525 0.20924 # version 2: 2.6200 0.70807 0.18819 0.21703 # # # Warning: At least one sample size is less than 5. # asymptotic p-values may not be very accurate. # ############################################################################# # In order to get the output list, call # ad.out <- ad.test(z1,z2,z3,method="exact",dist=T,Nsim=100000) # then ad.out is of class ksamples and has components # > names(ad.out) # [1] "test.name" "k" "ns" "N" "n.ties" # [6] "sig" "ad" "warning" "null.dist1" "null.dist2" # [11] "method" "Nsim" # # where # test.name = "Anderson-Darling" # k = number of samples being compared # ns = vector of the k sample sizes ns[1],...,ns[k] # N = ns[1] + ... + ns[k] total sample size # n.ties = number of ties in the combined set of all n observations # sig = standard deviation of the AD statistic (for continuous population case) # ad = 2 x 3 (or 2 x 4) matrix containing the AD statistics, # standardized AD statistics, its asymptotic P-value, # (and its exact or simulated P-value), for version 1 in the first row # and for version 2 in the second row. # warning = logical indicator, warning = TRUE indicates that at least # one of the sample sizes is < 5. # null.dist1 is a vector of simulated values of the AD statistic (version 1) # or the full enumeration of such values. # This vector is given when dist = TRUE is specified, # otherwise null.dist1 = NULL is returned. # null.dist2 is the corresponding vector for the 2nd AD statistic version. # method = one of the following values: "asymptotic", "simulated", "exact" # as it was ultimately used. # Nsim = number of simulations used, when applicable. # # The class ksamples causes ad.out to be printed in a special output # format when invoked simply as: > ad.out # An example was shown above. # # Fritz Scholz, August 2012 # ################################################################################# samples <- io(...,data = data) method <- match.arg(method) out <- na.remove(samples) na.t <- out$na.total if( na.t > 1) print(paste("\n",na.t," NAs were removed!\n\n")) if( na.t == 1) print(paste("\n",na.t," NA was removed!\n\n")) samples <- out$x.new k <- length(samples) if (k < 2) stop("Must have at least two samples.") ns <- sapply(samples, length) if (any(ns == 0)) stop("One or more samples have no observations.") x <- unlist(samples) n <- length(x) Z.star <- sort(unique(x)) L <- length(Z.star) if(dist == TRUE) Nsim <- min(Nsim,1e8) # limits the size of null.dist1 and null.dist2 # whether method = "exact" or = "simulated" ncomb <- 1 np <- n for(i in 1:(k-1)){ ncomb <- ncomb * choose(np,ns[i]) np <- np-ns[i] } # it is possible that ncomb overflows to Inf if( method == "exact" & Nsim < ncomb) { method <- "simulated" } if( method == "exact" & dist == TRUE ) nrow <- ncomb if( method == "simulated" & dist == TRUE ) nrow <- Nsim if( method == "simulated" ) ncomb <- 1 # don't need ncomb anymore if(method == "asymptotic"){ Nsim <- 1 dist <- FALSE } dist1 <- NULL dist2 <- NULL pv <- c(NA,NA) getA2mat <- dist useExact <- FALSE if(method == "exact") useExact <- TRUE if(getA2mat){ a2mat <- matrix(0,nrow=nrow,ncol=2)}else{ a2mat <- 0 } ans <- numeric(2) pval <- numeric(2) out0 <- .C("adkTestStat0",ans=as.double(ans),k=as.integer(k),x=as.double(x), ns=as.integer(ns),Z.star=as.double(Z.star),L=as.integer(L), PACKAGE = "kSamples") if(method != "asymptotic"){ out1 <- .C("adkPVal0",pval=as.double(pval), Nsim=as.integer(Nsim),k=as.integer(k), x=as.double(x),ns=as.integer(ns), zstar=as.double(Z.star),L=as.integer(L), useExact=as.integer(useExact),getA2mat=as.integer(getA2mat), ncomb=as.double(ncomb),a2mat=as.double(a2mat), PACKAGE= "kSamples") pv <- out1$pval if(getA2mat){ a2mat <- matrix(out1$a2mat, nrow=nrow, ncol=2, byrow=FALSE, dimnames=list(NULL, c("AkN2", "AakNk2"))) dist1 <- round(a2mat[,1],8) dist2 <- round(a2mat[,2],8) } } AkN2 <- out0[[1]][1] AakN2 <- out0[[1]][2] if(n > 3){ coef.d <- 0 coef.c <- 0 coef.b <- 0 coef.a <- 0 H <- sum(1/ns) h <- sum(1/(1:(n - 1))) g <- 0 for (i in 1:(n - 2)) { g <- g + (1/(n - i)) * sum(1/((i + 1):(n - 1))) } coef.a <- (4 * g - 6) * (k - 1) + (10 - 6 * g) * H coef.b <- (2 * g - 4) * k^2 + 8 * h * k + (2 * g - 14 * h - 4) * H - 8 * h + 4 * g - 6 coef.c <- (6 * h + 2 * g - 2) * k^2 + (4 * h - 4 * g + 6) * k + (2 * h - 6) * H + 4 * h coef.d <- (2 * h + 6) * k^2 - 4 * h * k sig2 <- (coef.a * n^3 + coef.b * n^2 + coef.c * n + coef.d)/((n - 1) * (n - 2) * (n - 3)) sig <- sqrt(sig2) TkN <- (AkN2 - (k - 1))/sig TakN <- (AakN2 - (k - 1))/sig pvalTkN <- ad.pval(TkN, k - 1,1) pvalTakN <- ad.pval(TakN, k - 1,2) } if(n == 3 && k == 3 | n == 2){ sig <- 0 TkN <- NA TakN <- NA pvalTkN <- 1 pvalTakN <- 1 } if(n == 3 && k == 2){ sig <- .3535534 TkN <- (AkN2 - (k - 1))/sig TakN <- (AakN2 - (k - 1))/sig pvalTkN <- ad.pval(TkN, k - 1,1) pvalTakN <- ad.pval(TakN, k - 1,2) } warning <- min(ns) < 5 if(method=="asymptotic"){ ad.mat <- matrix(c(signif(AkN2,5), signif(TkN, 5), signif(pvalTkN, 5), signif(AakN2,3) , signif(TakN, 5), signif(pvalTakN, 5)), byrow = TRUE, ncol = 3) }else{ ad.mat <- matrix(c(signif(AkN2,5), signif(TkN, 5), signif(pvalTkN, 5), signif(pv[1],5),signif(AakN2,3) , signif(TakN, 5), signif(pvalTakN, 5), signif(pv[2],5)), byrow = TRUE, ncol = 4) } if(method=="asymptotic"){ dimnames(ad.mat) <- list(c("version 1:","version 2:"), c("AD","T.AD"," asympt. P-value")) } if(method=="exact"){ dimnames(ad.mat) <- list(c("version 1:","version 2:"), c("AD","T.AD"," asympt. P-value"," exact P-value")) } if(method=="simulated"){ dimnames(ad.mat) <- list(c("version 1:","version 2:"), c("AD","T.AD"," asympt. P-value"," sim. P-value")) } object <- list(test.name ="Anderson-Darling", k = k, ns = ns, N = n, n.ties = n - L, sig = round(sig, 5), ad = ad.mat, warning = warning, null.dist1 = dist1, null.dist2 = dist2, method=method, Nsim=Nsim) class(object) <- "kSamples" object } kSamples/R/print.kSamples.R0000644000176200001440000001555712614006342015302 0ustar liggesusersprint.kSamples <- function (x, ...) { ###################################################### # # This is a print function for objects of class kSamples, # as they are produced by ad.test, kw.test, # ad.combined.test, kw.combined.test, contingency2xt, # contingency2xt.comb, Steel.test, SteelConfInt, # and JT.test. # # Fritz Scholz, August 2015 # ####################################################### if(names(x)[2]=="k"){# checking whether the object x #came from ad.test or qn.test of JT.test if(x$test.name=="Steel"){ cat("\nSteel Multiple Comparison Wilcoxon Test:\nk treatments against a common control (1st sample)\n\n") }else{ cat(paste("\n\n",x$test.name,"k-sample test.\n")) } cat(paste("\nNumber of samples: ", x$k)) cat("\nSample sizes: ",paste(x$ns,collapse=", ")) cat(paste("\nNumber of ties:", x$n.ties)) if(x$test.name == "Anderson-Darling"){ cat(paste("\n\nMean of ",x$test.name," Criterion:", x$k-1)) cat(paste("\nStandard deviation of ",x$test.name," Criterion:", x$sig)) cat(paste("\n\nT.AD = (",x$test.name," Criterion - mean)/sigma")) } if(x$test.name != "Jonckheere-Terpstra" ){ cat("\n\nNull Hypothesis: All samples come from a common population.\n\n") }else{ cat("\n\nNull Hypothesis: All samples come from a common population.\n") cat("Alternative: Samples indicate a positive trend.\n\n") } if(x$method=="simulated") cat(paste("Based on Nsim =",x$Nsim,"simulations\n\n")) if(x$test.name == "Anderson-Darling" ){print(signif(x$ad,4))} if(x$test.name == "van der Waerden scores" ) print(signif(x$qn,4)) if(x$test.name == "Kruskal-Wallis" ) print(signif(x$qn,4)) if(x$test.name == "normal scores" ) print(signif(x$qn,4)) if(x$test.name == "Jonckheere-Terpstra" ){ print(signif(x$JT,4)) } if (x$warning) { cat("\n\nWarning: At least one sample size is less than 5,\n") cat(" asymptotic p-values may not be very accurate.\n") } invisible(x) } if(names(x)[2]=="M"){# checking whether the object x came from ad.combined.test cat(paste("Combination of",x$test.name,"K-Sample Tests.\n")) cat(paste("\nNumber of data sets =", x$M,"\n")) cat("\nSample sizes within each data set:\n") ns <- NULL k <- length(x$n.samples) d.sets <- paste("Data set",1:k) for(i in 1:k){ cat(d.sets[i],": ",x$n.samples[[i]]) cat("\n") } if(k>3) AD.name=paste("AD.1","...",paste("AD.",k,sep=""),sep="+") if(k==2)AD.name=paste("AD.1+AD.2") if(k==3)AD.name=paste("AD.1+AD.2+AD.3") if(k>3) QN.name=paste("QN.1","...",paste("QN.",k,sep=""),sep="+") if(k==2)QN.name=paste("QN.1+QN.2") if(k==3)QN.name=paste("QN.1+QN.2+QN.3") cat("Total sample size per data set: ") cat(x$nt,"\n") cat("Number of unique values per data set: ") cat(x$nt-x$n.ties,"\n") if(x$test.name=="Anderson-Darling"){ cat(paste("\nAD.i =",x$test.name,"Criterion for i-th data set\n")) cat("Means:",x$mu,"\n") cat("Standard deviations:", x$sig,"\n") cat("\nT.i = (AD.i - mean.i)/sigma.i\n") } cat("\nNull Hypothesis:\nAll samples within a data set come from a common distribution.\n") cat("The common distribution may change between data sets.\n\n") if(x$test.name=="Anderson-Darling"){ nx <- length(x$ad.list) if(x$method=="simulated") cat(paste("Based on Nsim =",x$Nsim,"simulations\n\n")) for(i in 1:nx){ cat(paste("for data set",i,"we get\n")) print(signif(x$ad.list[[i]],4)) cat("\n") } cat("Combined Anderson-Darling Criterion: AD.comb =",AD.name,"\n") cat("Mean =",x$mu.c," Standard deviation =",round(x$sig.c,5),"\n") cat("\nT.comb = (AD.comb - mean)/sigma\n") cat("\n") if(x$method=="simulated") cat(paste("Based on Nsim =",x$Nsim,"simulations\n\n")) ad.c <- x$ad.c print(signif(ad.c,4)) } if(x$test.name == "van der Waerden scores" | x$test.name == "normal scores" | x$test.name == "Kruskal-Wallis" ){ nx <- length(x$qn.list) if(x$method=="simulated") cat(paste("Based on Nsim =",x$Nsim,"simulations\n\n")) for(i in 1:nx){ cat(paste("for data set",i,"we get\n")) print(signif(x$qn.list[[i]],4)) cat("\n") } cat("Combined Criterion: QN.combined =",QN.name,"\n") cat("\n") if(x$method=="simulated") cat(paste("Based on Nsim =",x$Nsim,"simulations\n\n")) print(signif(x$qn.c,4)) } if (x$warning) { cat("\n\nWarning: At least one sample size is less than 5,\n") cat(" asymptotic p-values may not be very accurate.\n") } cat("\n") invisible(x) } if(x$test.name == "2 x t Contingency Table"){ cat(paste("\n Kruskal-Wallis Test for 2 x",x$t,"Contingency Table\n\n")) if(x$method=="simulated") cat(paste(" Based on Nsim =", x$Nsim,"simulations\n\n")) print(signif(x$KW.cont,4)) cat("\n") invisible(x) } if(x$test.name == "Combined 2 x t Contingency Tables"){ cat("\n Combined Kruskal-Wallis Tests for 2 x t Contingency Tables\n\n") if(x$method=="simulated") cat(paste(" Based on Nsim =", x$Nsim,"simulations\n\n")) nx <- length(x$kw.list) for( i in 1:nx){ cat(paste("for data set",i,"we get\n")) print(signif(x$kw.list[[i]],4)) cat("\n") } if(nx>3) KW.name=paste("KW.1","...",paste("KW.",k,sep=""),sep="+") if(nx==2)KW.name=paste("KW.1+KW.2") if(nx==3)KW.name=paste("KW.1+KW.2+KW.3") cat("Combined Criterion: KW.combined =",KW.name,"\n") cat("\n") if(x$method=="simulated") cat(paste("Based on Nsim =",x$Nsim,"simulations\n\n")) print(signif(x$kw.c,4)) cat("\n") invisible(x) } if(x$test.name == "Steel"){ print(signif(x$st,4)) invisible(x) } if(x$test.name == "Steel.bounds"){ cat("\nSteel Multiple Comparison Confidence Bounds for Shift Parameters\nBased on Wilcoxon Tests, k Treatments against a Common Control\n\n") cat("size of control sample: ",x$n0,"\n") if(length(x$ns) > 1){ cat("sizes of treatment samples: ",paste(x$ns,collapse=", "),"\n") }else{ cat("size of treatment sample: ",x$ns,"\n") } if(x$n.ties > 0){ cat("number of ties: ",x$n.ties,"\n") cat("intervals should be widened on each end by the rounding epsilon\n") cat("to conservatively maintain the stated joint confidence level\n") } cat("\nconservative bounds based on asymptotics\n\n") print(x$bounds[[1]]) cat("\nbounds based on asymptotics,\n") cat("with level closest to nominal\n\n") print(x$bounds[[2]]) if(x$method=="simulated"){ cat("\nconservative bounds based on simulation\n\n") print(x$bounds[[3]]) cat("\nbounds based on simulation,\n") cat("with level closest to nominal\n\n") print(x$bounds[[4]]) } invisible(x) } } kSamples/MD50000644000176200001440000000430213470620413012340 0ustar liggesusers84bb880d9ac4a92c1861ed5876fe48f7 *DESCRIPTION 9b158965f51570da03c9d654b76db944 *NAMESPACE 4c64c183f47e3f41ffe25dc0d88f4e07 *R/Harding.R e313845ee478bcaa2382697d214e619e *R/Steel.test.R 51714e3fb71ea3cd0f6b2262419e7ccd *R/SteelConfInt.R 12c3ff797671dff5384337902e1c57f8 *R/Steelnormal.R 5c7b051d5a45292f700851ceb55be388 *R/Steelnormal0.R afe2c0d61dd08abc93589e3b097a47b3 *R/ad.pval.R 1076a25be3c41fcab93736ffd462cc3f *R/ad.test.R 76013ba5dee4dc2710f3e58b5b495479 *R/ad.test.combined.R 2ec0db9c12bed805ff910af7368bfd4e *R/contingency2xt.R 3c005b348c61668f49c956b24edff597 *R/contingency2xt.comb.R 5c4e224801cc2d5147084c526f7566b7 *R/conv.R b89e3159e9f25e596605de179a5b7e1d *R/djt.R 3421ad7dfdcbd2dbf9ce8ae0104d8fb7 *R/io.R 0bf8da5b7f33dcf9de38808de5f15870 *R/io2.R 47c9c22b86b2085491326bf0e88150d9 *R/jt.test.R 963947a43928ddb0cc3950540834bc79 *R/listmake.R d04037e9a5453fb491cae38db3da5751 *R/na.remove.R 9113dfc48a023decadbba16033220eda *R/pjt.R 618ba2a7f6ab3b5c037865fc5ccbc50a *R/pp.kSamples.R 948ba6c5987dbbe9c6d555fb31676649 *R/print.kSamples.R 011a519ff05381c1524a7f311286f9c9 *R/qjt.R adbaf82af612fc0cf705c8563da21dc3 *R/qn.test.R 2f865dcb0654adaa84c6c2c71d40edb7 *R/qn.test.combined.R e7c6e7d2aad1c6f7ffa87743f9c5aab5 *R/test.list.R ec378c8638881be52ca3784d16101578 *data/ShorelineFireEMS.rda c6aa8922b464e40ef79cc0612ee1c913 *inst/NEWS.Rd 496d2c8396d0b721bb7457917957760a *man/ShorelineFireEMS.Rd 8f19e3fa51d295d13752759965afdb13 *man/Steel.test.Rd df3bedb012cc07148434c9ebdf3b085d *man/SteelConfInt.Rd 2b9475a4a5902b9c1f1af953e11ff8ba *man/ad.pval.Rd ea3edf5c9b778c3a2ebccf9292eacad5 *man/ad.test.Rd 4bb81b480faa191e6466b81d68700206 *man/ad.test.combined.Rd 9acc49ee2948d595154b0ba4769ca811 *man/contingency2xt.Rd 2787f09874aa07a3af67f46ae8cf6028 *man/contingency2xt.comb.Rd 4aace97e560edc4f6326cfe51b80fd5b *man/conv.Rd df2c992645b7cd778312a2b3a8f34445 *man/jt.dist.Rd 2ec14b8dc13b4c61fe17483882e2bfaa *man/jt.test.Rd 0b25cd1686e279fbce5d2b84aaa82302 *man/kSamples-package.Rd 8a6a281da26180cffbbf51102c25ce06 *man/pp.kSamples.Rd 68faa176d8f8e9193add70b2c29e4425 *man/qn.test.Rd d71a387a6f492885d9369cf14102eaca *man/qn.test.combined.Rd a19c755fd44a3129fb06841b69978359 *src/kSamples.c f15b6fd29a344f0f5824e876ac5129dd *src/myfuns.h kSamples/DESCRIPTION0000644000176200001440000000274713470620413013551 0ustar liggesusersPackage: kSamples Type: Package Title: K-Sample Rank Tests and their Combinations Version: 1.2-9 Date: 2019-05-20 Authors@R: c(person("Fritz", "Scholz",role=c("aut","cre"),email = "fscholz@u.washington.edu"), person("Angie", "Zhu", role=c("aut"), email= "a.zhu.stat@gmail.com")) Author: Fritz Scholz [aut, cre], Angie Zhu [aut] Maintainer: Fritz Scholz Depends: SuppDists Imports: methods, graphics, stats Description: Compares k samples using the Anderson-Darling test, Kruskal-Wallis type tests with different rank score criteria, Steel's multiple comparison test, and the Jonckheere-Terpstra (JT) test. It computes asymptotic, simulated or (limited) exact P-values, all valid under randomization, with or without ties, or conditionally under random sampling from populations, given the observed tie pattern. Except for Steel's test and the JT test it also combines these tests across several blocks of samples. Also analyzed are 2 x t contingency tables and their blocked combinations using the Kruskal-Wallis criterion. Steel's test is inverted to provide simultaneous confidence bounds for shift parameters. A plotting function compares tail probabilities obtained under asymptotic approximation with those obtained via simulation or exact calculations. License: GPL (>= 2) LazyLoad: yes NeedsCompilation: yes Packaged: 2019-05-20 21:03:07 UTC; fritz Repository: CRAN Date/Publication: 2019-05-20 21:50:03 UTC kSamples/man/0000755000176200001440000000000013470613013012602 5ustar liggesuserskSamples/man/ad.test.combined.Rd0000644000176200001440000002046112657712416016232 0ustar liggesusers\name{ad.test.combined} \alias{ad.test.combined} \title{ Combined Anderson-Darling k-Sample Tests } \description{ This function combines several independent Anderson-Darling \eqn{k}-sample tests into one overall test of the hypothesis that the independent samples within each block come from a common unspecified distribution, while the common distributions may vary from block to block. Both versions of the Anderson-Darling test statistic are provided. } \usage{ ad.test.combined(\dots, data = NULL, method = c("asymptotic", "simulated", "exact"), dist = FALSE, Nsim = 10000) } \arguments{ \item{\dots}{ Either a sequence of several lists, say \eqn{L_1, \ldots, L_M} (\eqn{M > 1}) where list \eqn{L_i} contains \eqn{k_i > 1} sample vectors of respective sizes \eqn{n_{i1}, \ldots, n_{ik_i}}, where \eqn{n_{ij} > 4} is recommended for reasonable asymptotic \eqn{P}-value calculation. \eqn{N_i=n_{i1}+\ldots+n_{ik_i}} is the pooled sample size for block \eqn{i}, or a list of such lists, or a formula, like y ~ g | b, where y is a numeric response vector, g is a factor with levels indicating different treatments and b is a factor indicating different blocks; y, g, b are or equal length. y is split separately for each block level into separate samples according to the g levels. The same g level may occur in different blocks. The variable names may correspond to variables in an optionally supplied data frame via the data = argument, } \item{data}{= an optional data frame providing the variables in formula input } \item{method}{= \code{c("asymptotic","simulated","exact")}, where \code{"asymptotic"} uses only an asymptotic \eqn{P}-value approximation, reasonable for P in [0.00001, .99999], linearly extrapolated via \eqn{\log(P/(1-P))}{log(P/(1-P))} outside that range. See \code{\link{ad.pval}} for details. The adequacy of the asymptotic \eqn{P}-value calculation may be checked using \code{\link{pp.kSamples}}. \code{"simulated"} uses simulation to get \code{Nsim} simulated \eqn{AD} statistics for each block of samples, adding them across blocks component wise to get \code{Nsim} combined values. These are compared with the observed combined value to obtain the estimated \eqn{P}-value. \code{"exact"} uses full enumeration of the test statistic values for all sample splits of the pooled samples within each block. The test statistic vectors for the first 2 blocks are added (each component against each component, as in the R \code{outer(x,y,} \code{"+")} command) to get the convolution enumeration for the combined test statistic. The resulting vector is convoluted against the next block vector in the same fashion, and so on. It is possible only for small problems, and is attempted only when \code{Nsim} is at least the (conservatively maximal) length \deqn{\frac{N_1!}{n_{11}!\ldots n_{1k_1}!}\times\ldots\times \frac{N_M!}{n_{M1}!\ldots n_{Mk_M}!}} of the final distribution vector. Otherwise, it reverts to the simulation method using the provided \code{Nsim}. } \item{dist}{\code{FALSE} (default) or \code{TRUE}. If \code{TRUE}, the simulated or fully enumerated convolution vectors \code{null.dist1} and \code{null.dist2} are returned for the respective test statistic versions. Otherwise, \code{NULL} is returned for each. } \item{Nsim}{\code{= 10000} (default), number of simulation splits to use within each block of samples. It is only used when \code{method = "simulated"} or when \code{method =} \code{"exact"} reverts to \code{method = "simulated"}, as previously explained. Simulations are independent across blocks, using \code{Nsim} for each block. \code{Nsim} is limited by \code{1e7}. } } \details{ If \eqn{AD_i} is the Anderson-Darling criterion for the i-th block of \eqn{k_i} samples, its standardized test statistic is \eqn{T_i = (AD_i - \mu_i)/\sigma_i}, with \eqn{\mu_i} and \eqn{\sigma_i} representing mean and standard deviation of \eqn{AD_i}. This statistic is used to test the hypothesis that the samples in the i-th block all come from the same but unspecified continuous distribution function \eqn{F_i(x)}. The combined Anderson-Darling criterion is \eqn{AD_{comb}=AD_1 + \ldots + AD_M} and \eqn{T_{comb} = } \eqn{(AD_{comb} - \mu_c)/\sigma_c} is the standardized form, where \eqn{\mu_c=\mu_1+\ldots+\mu_M} and \eqn{\sigma_c = \sqrt{\sigma_1^2 +\ldots+\sigma_M^2}} represent the mean and standard deviation of \eqn{AD_{comb}}. The statistic \eqn{T_{comb}} is used to simultaneously test whether the samples in each block come from the same continuous distribution function \eqn{F_i(x), i=1,\ldots,M}. The unspecified common distribution function \eqn{F_i(x)} may change from block to block. According to the reference article, two versions of the test statistic and its corresponding combinations are provided. The \eqn{k_i} for each block of \eqn{k_i} independent samples may change from block to block. NA values are removed and the user is alerted with the total NA count. It is up to the user to judge whether the removal of NA's is appropriate. The continuity assumption can be dispensed with if we deal with independent random samples, or if randomization was used in allocating subjects to samples or treatments, independently from block to block, and if we view the simulated or exact \eqn{P}-values conditionally, given the tie patterns within each block. Of course, under such randomization any conclusions are valid only with respect to the blocks of subjects that were randomly allocated. The asymptotic \eqn{P}-value calculation assumes distribution continuity. No adjustment for lack thereof is known at this point. The same comment holds for the means and standard deviations of respective statistics. } \value{ A list of class \code{kSamples} with components \item{test.name}{\eqn{=} \code{"Anderson-Darling"}} \item{M}{number of blocks of samples being compared} \item{n.samples}{list of \code{M} vectors, each vector giving the sample sizes for each block of samples being compared} \item{nt}{\eqn{= (N_1,\ldots,N_M)}} \item{n.ties}{vector giving the number of ties in each the \code{M} comparison blocks} \item{ad.list}{list of \code{M} matrices giving the \code{ad} results for \code{ad.test} applied to the samples in each of the \code{M} blocks} \item{mu}{vector of means of the \eqn{AD} statistic for the \code{M} blocks} \item{sig}{vector of standard deviations of the \eqn{AD} statistic for the \code{M} blocks} \item{ad.c}{2 x 3 (2 x 4) matrix containing \eqn{AD_{comb}, T_{comb}}, asymptotic \eqn{P}-value, (simulated or exact \eqn{P}-value), for each version of the combined test statistic, version 1 in row 1 and version 2 in row 2} \item{mu.c}{mean of \eqn{AD_{comb}}} \item{sig.c}{standard deviation of \eqn{AD_{comb}}} \item{warning}{logical indicator, \code{warning = TRUE} when at least one \eqn{n_{ij} < 5}} \item{null.dist1}{simulated or enumerated null distribution of version 1 of \eqn{AD_{comb}}} \item{null.dist2}{simulated or enumerated null distribution of version 2 of \eqn{AD_{comb}}} \item{method}{the \code{method} used.} \item{Nsim}{the number of simulations used for each block of samples.} } \references{ Scholz, F. W. and Stephens, M. A. (1987), K-sample Anderson-Darling Tests, \emph{Journal of the American Statistical Association}, \bold{Vol 82, No. 399}, 918--924. } \note{ This test is useful in analyzing treatment effects in randomized (incomplete) block experiments and in examining performance equivalence of several laboratories when presented with different test materials for comparison. } \seealso{ \code{\link{ad.test}}, \code{\link{ad.pval}} } \examples{ ## Create two lists of sample vectors. x1 <- list( c(1, 3, 2, 5, 7), c(2, 8, 1, 6, 9, 4), c(12, 5, 7, 9, 11) ) x2 <- list( c(51, 43, 31, 53, 21, 75), c(23, 45, 61, 17, 60) ) # and a corresponding data frame datx1x2 x1x2 <- c(unlist(x1),unlist(x2)) gx1x2 <- as.factor(c(rep(1,5),rep(2,6),rep(3,5),rep(1,6),rep(2,5))) bx1x2 <- as.factor(c(rep(1,16),rep(2,11))) datx1x2 <- data.frame(A = x1x2, G = gx1x2, B = bx1x2) ## Run ad.test.combined. set.seed(2627) ad.test.combined(x1, x2, method = "simulated", Nsim = 1000) # or with same seed # ad.test.combined(list(x1, x2), method = "simulated", Nsim = 1000) # ad.test.combined(A~G|B,data=datx1x2,method="simulated",Nsim=1000) } \keyword{nonparametric} \keyword{htest} \keyword{design} kSamples/man/contingency2xt.comb.Rd0000644000176200001440000001036712652016052016776 0ustar liggesusers\name{contingency2xt.comb} \alias{contingency2xt.comb} \title{ Combined Kruskal-Wallis Tests for the 2 x t Contingency Tables } \description{ This function uses the Kruskal-Wallis criterion to test the hypothesis of no association between the counts for two responses "A" and "B" across t categories and across \eqn{M} blocks. } \usage{ contingency2xt.comb(\dots, method = c("asymptotic", "simulated", "exact"), dist = FALSE, Nsim = 10000) } \arguments{ \item{\dots}{ Either several lists \eqn{L_1,\ldots,L_M}, each of two equal length vectors \eqn{A_i} and \eqn{B_i}, \eqn{i=1,\ldots,M}, of counts \eqn{\ge 0}, where the common length \eqn{t_i} of \eqn{A_i} and \eqn{B_i} may vary from list to list or a list of \code{M} such lists } \item{method}{= \code{c("asymptotic","simulated","exact")}, where \code{"asymptotic"} uses only an asymptotic chi-square approximation with \eqn{(t_1-1)+\ldots+(t_M-1)} degrees of freedom to approximate the \eqn{P}-value, This calculation is always done. \code{"simulated"} uses \code{Nsim} simulated counts for the two vectors \eqn{A_i} and \eqn{B_i} in list \eqn{L_i}, with the observed marginal totals, \eqn{m_i=\sum A_i}, \eqn{n_i = \sum B_i}, \eqn{d_i = A_i+B_i}. It does this independently from list to list using the same \code{Nsim} each time, adding the resulting Kruskal-Wallis criteria across lists to get \code{Nsim} such summed values to estimate the \eqn{P}-value. \code{"exact"} enumerates all counts for \eqn{A_i} and \eqn{B_i} with the respective observed marginal totals to get an exact distribution for each list. These distributions are then convolved to obtain the \eqn{P}-value. It is used only when \code{Nsim} is at least as large as the product across blocks of the number \code{choose(m+t-1,t-1)} of full enumerations per block, where \eqn{t = t_1,\ldots, t_M}. Otherwise, \code{method} reverts to \code{"simulated"} using the given \code{Nsim}. } \item{dist}{\code{FALSE} (default) or \code{TRUE}. If \code{TRUE}, the simulated or fully enumerated null distribution \code{null.dist} is returned for the Kruskal-Wallis test statistic. Otherwise \code{null.dist = NULL} is returned. } \item{Nsim}{\code{=10000} (default), number of simulated \eqn{A_i} splits to use per block. It is only used when \code{method = "simulated"}, or when \code{method = "exact"} reverts to \code{method = "simulated"}, as previously explained. } } \details{ For details on the calculation of the Kruskal-Wallis criterion and its exact or simulated distribution for each block see \code{\link{contingency2xt}}. } \value{ A list of class \code{kSamples} with components \item{test.name}{\code{"Combined 2 x t Contingency Tables"}} \item{t}{vector giving the number of classification categories per block} \item{M}{number of blocked tables} \item{kw.list}{a list of the \code{KW.cont} output componenents from \code{\link{contingency2xt}} for each of the blocks} \item{null.dist}{simulated or enumerated null distribution of the combined test statistic. It is given as an \code{L} by 2 matrix, where the first column (named \code{KW}) gives the \code{L} unique ordered values of the combined Kruskal-Wallis statistic and the second column (named \code{prob}) gives the corresponding (simulated or exact) probabilities. \code{null.dist = NULL} is returned when \code{dist = FALSE} or when \code{method =} \code{"asymptotic"}.} \item{method}{the \code{method} used.} \item{Nsim}{the number of simulations.} } \note{ The required level for \code{Nsim} in order for \code{method = "exact"} to be carried out, is conservative, but there is no transparent way to get a better estimate. The actual dimension \code{L} of the realized \code{null.dist} will typically be much smaller, since the distribution is compacted to its unique support values. } \section{warning}{\code{method = "exact"} should only be used with caution. Computation time is proportional to the number of enumerations. In most cases \code{dist = TRUE} should not be used, i.e., when the returned distribution objects become too large for R's work space.} \examples{ out <- contingency2xt.comb(list(c(25,15,20),c(16,6,18)), list(c(12,4,5),c(13,8,9)),method = "simulated", dist=FALSE, Nsim=1e3) } \keyword{nonparametric} \keyword{htest} kSamples/man/Steel.test.Rd0000644000176200001440000002016412660700160015126 0ustar liggesusers\name{Steel.test} \alias{Steel.test} \title{ Steel's Multiple Comparison Wilcoxon Tests } \description{ This function uses pairwise Wilcoxon tests, comparing a common control sample with each of several treatment samples, in a multiple comparison fashion. The experiment wise significance probabity is calculated, estimated, or approximated, when testing the hypothesis that all independent samples arise from a common unspecified distribution, or that treatments have no effect when assigned randomly to the given subjects. } \usage{ Steel.test(\dots, data = NULL, method = c("asymptotic", "simulated", "exact"), alternative = c("greater","less","two-sided"), dist = FALSE, Nsim = 10000) } \arguments{ \item{\dots}{ Either several sample vectors, say \eqn{x_1, \ldots, x_k}, with \eqn{x_i} containing \eqn{n_i} sample values. \eqn{n_i > 4} is recommended for reasonable asymptotic \eqn{P}-value calculation. The pooled sample size is denoted by \eqn{N=n_1+\ldots+n_k}. The first vector serves as control sample, the others as treatment samples. or a list of such sample vectors. or a formula y ~ g, where y contains the pooled sample values and g (same length as y) is a factor with levels identifying the samples to which the elements of y belong. The lowest factor level corresponds to the control sample, the other levels to treatment samples. } \item{data}{= an optional data frame providing the variables in formula y ~ g or y, g input } \item{method}{= \code{c("asymptotic","simulated","exact")}, where \code{"asymptotic"} uses only an asymptotic normal approximation to approximate the \eqn{P}-value, This calculation is always done. \code{"simulated"} uses \code{Nsim} simulated standardized Steel statistics based on random splits of the pooled samples into samples of sizes \eqn{n_1, \ldots, n_k}, to estimate the \eqn{P}-value. \code{"exact"} uses full enumeration of all sample splits with resulting standardized Steel statistics to obtain the exact \eqn{P}-value. It is used only when \code{Nsim} is at least as large as the number \deqn{ncomb = \frac{N!}{n_1!\ldots n_k!}}{N!/(n_1!\ldots n_k!)} of full enumerations. Otherwise, \code{method} reverts to \code{"simulated"} using the given \code{Nsim}. It also reverts to \code{"simulated"} when \eqn{ncomb > 1e8} and \code{dist = TRUE}. } \item{alternative}{= \code{c("greater","less","two-sided")}, where for \code{"greater"} the maximum of the pairwise standardized Wilcoxon test statistics is used and a large maximum value is judged significant. For \code{"less"} the minimum of the pairwise standardized Wilcoxon test statistics is used and a low minimum value is judged significant. For \code{"two-sided"} the maximum of the absolute pairwise standardized Wilcoxon test statistics is used and a large maximum value is judged significant. } \item{dist}{\code{= FALSE} (default) or \code{TRUE}. If \code{TRUE}, the simulated or fully enumerated null distribution vector \code{null.dist} is returned for the Steel test statistic, as chosen via \code{alternative}. Otherwise, \code{NULL} is returned. When \code{dist = TRUE} then \code{Nsim <- min(Nsim, 1e8)}, to limit object size. } \item{Nsim}{\code{= 10000} (default), number of simulation sample splits to use. It is only used when \code{method = "simulated"}, or when \code{method = "exact"} reverts to \code{method =} \code{ "simulated"}, as previously explained. } } \details{ The Steel criterion uses the Wilcoxon test statistic in the pairwise comparisons of the common control sample with each of the treatment samples. These statistics are used in standardized form, using the means and standard deviations as they apply conditionally given the tie pattern in the pooled data, see Scholz (2016). This conditional treatment allows for correct usage in the presence of ties and is appropriate either when the samples are independent and come from the same distribution (continuous or not) or when treatments are assigned randomly among the total of \code{N} subjects. However, in the case of ties the significance probability has to be viewed conditionally given the tie pattern. The Steel statistic is used to test the hypothesis that the samples all come from the same but unspecified distribution function \eqn{F(x)}, or, under random treatment assigment, that the treatments have no effect. The significance probability is the probability of obtaining test results as extreme or more extreme than the observed test statistic, when testing for the possibility of a treatment effect under any of the treatments. For small sample sizes exact (conditional) null distribution calculations are possible (with or without ties), based on a recursively extended version of Algorithm C (Chase's sequence) in Knuth (2011), which allows the enumeration of all possible splits of the pooled data into samples of sizes of \eqn{n_1, \ldots, n_k}, as appropriate under treatment randomization. This is done in C, as is the simulation of such splits. NA values are removed and the user is alerted with the total NA count. It is up to the user to judge whether the removal of NA's is appropriate. } \value{ A list of class \code{kSamples} with components \item{test.name}{\code{"Steel"}} \item{alternative}{ "greater", "less", or "two-sided"} \item{k}{number of samples being compared, including the control sample as the first one} \item{ns}{vector \eqn{(n_1,\ldots,n_k)} of the \eqn{k} sample sizes} \item{N}{size of the pooled sample \eqn{= n_1+\ldots+n_k}} \item{n.ties}{number of ties in the pooled sample} \item{st}{2 (or 3) vector containing the observed standardized Steel statistic, its asymptotic \eqn{P}-value, (its simulated or exact \eqn{P}-value)} \item{warning}{logical indicator, \code{warning = TRUE} when at least one \eqn{n_i < 5} } \item{null.dist}{simulated or enumerated null distribution vector of the test statistic. It is \code{NULL} when \code{dist = FALSE} or when \code{method = "asymptotic"}. } \item{method}{the \code{method} used.} \item{Nsim}{the number of simulations used.} \item{W}{vector \eqn{(W_{12},\ldots, W_{1k})} of Mann-Whitney statistics comparing each observation under treatment \eqn{i (> 1)} against each observation of the control sample. } \item{mu}{mean vector \eqn{(n_1n_2/2,\ldots,n_1n_k/2)} of \code{W}.} \item{tau}{vector of standard deviations of \code{W}.} \item{sig0}{standard deviation used in calculating the significance probability of the maximum (minimum) of (absolute) standardized Mann-Whitney statistics, see Scholz (2016).} \item{sig}{vector \eqn{(\sigma_1,\ldots, \sigma_k)} of standard deviations used in calculating the significance probability of the maximum (minimum) of (absolute) standardized Mann-Whitney statistics, see Scholz (2016).} } \references{ Knuth, D.E. (2011), \emph{The Art of Computer Programming, Volume 4A Combinatorial Algorithms Part 1}, Addison-Wesley Lehmann, E.L. (2006), \emph{Nonparametrics, Statistical Methods Based on Ranks, Revised First Edition}, Springer Verlag. Scholz, F.W. (2016), "On Steel's Test with Ties", submitted to \emph{Journal of Nonparametric Statistics}. } \section{warning}{\code{method = "exact"} should only be used with caution. Computation time is proportional to the number of enumerations. Experiment with \code{\link{system.time}} and trial values for \code{Nsim} to get a sense of the required computing time. In most cases \code{dist = TRUE} should not be used, i.e., when the returned distribution objects become too large for R's work space.} \examples{ z1 <- c(103, 111, 136, 106, 122, 114) z2 <- c(119, 100, 97, 89, 112, 86) z3 <- c( 89, 132, 86, 114, 114, 125) z4 <- c( 92, 114, 86, 119, 131, 94) y <- c(z1, z2, z3, z4) g <- as.factor(c(rep(1, 6), rep(2, 6), rep(3, 6), rep(4, 6))) set.seed(2627) Steel.test(list(z1, z2, z3, z4), method = "simulated", alternative = "less", Nsim = 1000) # or with same seed # Steel.test(z1, z2, z3, z4,method = "simulated", # alternative = "less", Nsim = 1000) # or with same seed # Steel.test(y ~ g, method = "simulated", # alternative = "less", Nsim=1000) } \keyword{nonparametric} \keyword{htest} \keyword{design} kSamples/man/qn.test.Rd0000644000176200001440000001624713300401105014464 0ustar liggesusers\name{qn.test} \alias{qn.test} \title{ Rank Score k-Sample Tests } \description{ This function uses the \eqn{QN} criterion (Kruskal-Wallis, van der Waerden scores, normal scores) to test the hypothesis that \eqn{k} independent samples arise from a common unspecified distribution. } \usage{ qn.test(\dots, data = NULL, test = c("KW", "vdW", "NS"), method = c("asymptotic", "simulated", "exact"), dist = FALSE, Nsim = 10000) } \arguments{ \item{\dots}{ Either several sample vectors, say \eqn{x_1, \ldots, x_k}, with \eqn{x_i} containing \eqn{n_i} sample values. \eqn{n_i > 4} is recommended for reasonable asymptotic \eqn{P}-value calculation. The pooled sample size is denoted by \eqn{N=n_1+\ldots+n_k}, or a list of such sample vectors, or a formula y ~ g, where y contains the pooled sample values and g (same length as y) is a factor with levels identifying the samples to which the elements of y belong. } \item{data}{= an optional data frame providing the variables in formula y ~ g. } \item{test}{= \code{c("KW", "vdW", "NS")}, where \code{"KW"} uses scores \code{1:N} (Kruskal-Wallis test) \code{"vdW"} uses van der Waerden scores, \code{qnorm( (1:N) / (N+1) )} \code{"NS"} uses normal scores, i.e., expected standard normal order statistics, invoking function \code{normOrder} of \code{package SuppDists (>=1.1-9.4)} } \item{method}{= \code{c("asymptotic","simulated","exact")}, where \code{"asymptotic"} uses only an asymptotic chi-square approximation with \code{k-1} degrees of freedom to approximate the \eqn{P}-value. This calculation is always done. \code{"simulated"} uses \code{Nsim} simulated \eqn{QN} statistics based on random splits of the pooled samples into samples of sizes \eqn{n_1, \ldots, n_k}, to estimate the \eqn{P}-value. \code{"exact"} uses full enumeration of all sample splits with resulting \eqn{QN} statistics to obtain the exact \eqn{P}-value. It is used only when \code{Nsim} is at least as large as the number \deqn{ncomb = \frac{N!}{n_1!\ldots n_k!}}{N!/(n_1!\ldots n_k!)} of full enumerations. Otherwise, \code{method} reverts to \code{"simulated"} using the given \code{Nsim}. It also reverts to \code{"simulated"} when \eqn{ncomb > 1e8} and \code{dist = TRUE}. } \item{dist}{\code{FALSE} (default) or \code{TRUE}. If \code{TRUE}, the simulated or fully enumerated null distribution vector \code{null.dist} is returned for the \eqn{QN} test statistic. Otherwise, \code{NULL} is returned. When \code{dist = TRUE} then \code{Nsim <- min(Nsim, 1e8)}, to limit object size. } \item{Nsim}{\code{= 10000} (default), number of simulation sample splits to use. It is only used when \code{method = "simulated"}, or when \code{method = "exact"} reverts to \code{method =} \code{ "simulated"}, as previously explained. } } \details{ The \eqn{QN} criterion based on rank scores \eqn{v_1,\ldots,v_N} is \deqn{QN=\frac{1}{s_v^2}\left(\sum_{i=1}^k \frac{(S_{iN}-n_i \bar{v}_{N})^2}{n_i}\right)} where \eqn{S_{iN}} is the sum of rank scores for the \eqn{i}-th sample and \eqn{\bar{v}_N} and \eqn{s_v^2} are sample mean and sample variance (denominator \eqn{N-1}) of all scores. The statistic \eqn{QN} is used to test the hypothesis that the samples all come from the same but unspecified continuous distribution function \eqn{F(x)}. \eqn{QN} is always adjusted for ties by averaging the scores of tied observations. Conditions for the asymptotic approximation (chi-square with \eqn{k-1} degrees of freedom) can be found in Lehmann, E.L. (2006), Appendix Corollary 10, or in Hajek, Sidak, and Sen (1999), Ch. 6, problems 13 and 14. For small sample sizes exact null distribution calculations are possible (with or without ties), based on a recursively extended version of Algorithm C (Chase's sequence) in Knuth (2011), which allows the enumeration of all possible splits of the pooled data into samples of sizes of \eqn{n_1, \ldots, n_k}, as appropriate under treatment randomization. This is done in C, as is the simulation. NA values are removed and the user is alerted with the total NA count. It is up to the user to judge whether the removal of NA's is appropriate. The continuity assumption can be dispensed with, if we deal with independent random samples from any common distribution, or if randomization was used in allocating subjects to samples or treatments, and if the asymptotic, simulated or exact \eqn{P}-values are viewed conditionally, given the tie pattern in the pooled sample. Under such randomization any conclusions are valid only with respect to the subjects that were randomly allocated to their respective treatment samples. } \value{ A list of class \code{kSamples} with components \item{test.name}{\code{"Kruskal-Wallis"}, \code{"van der Waerden scores"}, or \code{"normal scores"}} \item{k}{number of samples being compared} \item{ns}{vector \eqn{(n_1,\ldots,n_k)} of the \eqn{k} sample sizes} \item{N}{size of the pooled samples \eqn{= n_1+\ldots+n_k}} \item{n.ties}{number of ties in the pooled sample} \item{qn}{2 (or 3) vector containing the observed \eqn{QN}, its asymptotic \eqn{P}-value, (its simulated or exact \eqn{P}-value)} \item{warning}{logical indicator, \code{warning = TRUE} when at least one \eqn{n_i < 5}} \item{null.dist}{simulated or enumerated null distribution of the test statistic. It is \code{NULL} when \code{dist = FALSE} or when \code{method = "asymptotic"}.} \item{method}{the \code{method} used.} \item{Nsim}{the number of simulations used.} } \section{warning}{\code{method = "exact"} should only be used with caution. Computation time is proportional to the number of enumerations. Experiment with \code{\link{system.time}} and trial values for \code{Nsim} to get a sense of the required computing time. In most cases \code{dist = TRUE} should not be used, i.e., when the returned distribution objects become too large for R's work space.} \references{ Hajek, J., Sidak, Z., and Sen, P.K. (1999), \emph{Theory of Rank Tests (Second Edition)}, Academic Press. Knuth, D.E. (2011), \emph{The Art of Computer Programming, Volume 4A Combinatorial Algorithms Part 1}, Addison-Wesley Kruskal, W.H. (1952), A Nonparametric Test for the Several Sample Problem, \emph{The Annals of Mathematical Statistics}, \bold{Vol 23, No. 4}, 525-540 Kruskal, W.H. and Wallis, W.A. (1952), Use of Ranks in One-Criterion Variance Analysis, \emph{Journal of the American Statistical Association}, \bold{Vol 47, No. 260}, 583--621. Lehmann, E.L. (2006), \emph{Nonparametrics, Statistical Methods Based on Ranks, Revised First Edition}, Springer Verlag. } \seealso{ \code{\link{qn.test.combined}} } \examples{ u1 <- c(1.0066, -0.9587, 0.3462, -0.2653, -1.3872) u2 <- c(0.1005, 0.2252, 0.4810, 0.6992, 1.9289) u3 <- c(-0.7019, -0.4083, -0.9936, -0.5439, -0.3921) yy <- c(u1, u2, u3) gy <- as.factor(c(rep(1,5), rep(2,5), rep(3,5))) set.seed(2627) qn.test(u1, u2, u3, test="KW", method = "simulated", dist = FALSE, Nsim = 1000) # or with same seed # qn.test(list(u1, u2, u3),test = "KW", method = "simulated", # dist = FALSE, Nsim = 1000) # or with same seed # qn.test(yy ~ gy, test = "KW", method = "simulated", # dist = FALSE, Nsim = 1000) } \keyword{nonparametric} \keyword{htest} \keyword{design} kSamples/man/ShorelineFireEMS.Rd0000644000176200001440000000257112006313704016177 0ustar liggesusers\name{ShorelineFireEMS} \alias{ShorelineFireEMS} \docType{data} \title{ Shoreline Fire and EMS Turnout Times } \description{ This data set gives turnout response times for Fire and EMS (Emergency Medical Services) dispatch calls to the Shoreline, WA, Fire Department in 2006. The turnout time refers to time elapsed between the emergency call dispatch and the crew leaving the fire station, or signaling that they are on their way while being on route already. The latter scenario may explain the bimodal distribution character. } \usage{ data(ShorelineFireEMS) } \format{ A list of two sublists \code{$EMSTOT} and \code{$FireTOT}, each with 4 vector components \code{$ST57}, \code{$ST63}, \code{$ST64}, and \code{$ST65} respectively, giving the turnout times (in seconds) (for EMS and Fire) at fire stations ST57, ST63, ST64, and ST65. } \note{These data sets are provided to illustrate usage of \code{ad.test} and \code{qn.test} and their combined versions in testing for performance equivalence across fire stations. } \source{ Thanks to Michael Henderson and the Fire Fighters and Paramedics of the Shoreline Fire Department in Washington State. } \examples{ data(ShorelineFireEMS) boxplot(ShorelineFireEMS$EMSTOT,xlab="Station", ylab="seconds", main="EMS Turnout Time") boxplot(ShorelineFireEMS$FireTOT,xlab="Station", ylab="seconds", main="Fire Turnout Time") } \keyword{datasets} kSamples/man/pp.kSamples.Rd0000644000176200001440000000336312614531172015277 0ustar liggesusers\name{pp.kSamples} \alias{pp.kSamples} \title{ Upper Tail Probability Plots for Objects of Class kSamples } \description{ This function plots upper tail probabilities of the limiting distribution against the corresponding exact or simulated probabilities, both on a log-scale. } \usage{ pp.kSamples(x) } \arguments{ \item{x}{an object of class \code{kSamples}} } \details{Objects of class \code{kSamples} are produced by any of the following functions \code{\link{ad.test}} Anderson-Darling k-sample test. \code{\link{ad.test.combined}} Combined Anderson-Darling k-sample tests. \code{\link{qn.test}} \eqn{QN} rank scores test. \code{\link{qn.test.combined}} Combined \eqn{QN} rank scores tests. \code{\link{contingency2xt}} test for \eqn{2 * t} contingency table. \code{\link{contingency2xt.comb}} test for the combination of \eqn{2 * t} contingency tables. \code{\link{jt.test}} Jonckheere-Terpstra test. \code{\link{Steel.test}} Steel test. This will work only for alternative = "greater" or "two-sided". The approximation quality for "less" is the same as for "greater". The command \code{pp.kSamples(x)} for an object of class \code{kSamples} will only produce a plot when the object \code{x} contains non-NULL entries for the null distribution. The purpose of this function is to give the user a sense of the asymptotic distribution accuracy. } \seealso{ \code{\link{ad.test}}, \code{\link{ad.test.combined}}, \code{\link{qn.test}}, \code{\link{qn.test.combined}}, \code{\link{contingency2xt}}, \code{\link{contingency2xt.comb}} \code{\link{jt.test}} \code{\link{Steel.test}} } \examples{ qn.out <- qn.test(c(1,3,7,2,9),c(1,4,6,11,2),test="KW", method="simulated",dist=TRUE,Nsim=1000) pp.kSamples(qn.out) } \keyword{nonparametric} \keyword{htest} kSamples/man/contingency2xt.Rd0000644000176200001440000001306712613772740016071 0ustar liggesusers\name{contingency2xt} \alias{contingency2xt} \title{ Kruskal-Wallis Test for the 2 x t Contingency Table } \description{ This function uses the Kruskal-Wallis criterion to test the hypothesis of no association between the counts for two responses "A" and "B" across t categories. } \usage{ contingency2xt(Avec, Bvec, method = c("asymptotic", "simulated", "exact"), dist = FALSE, tab0 = TRUE, Nsim = 1e+06) } \arguments{ \item{Avec}{ vector of length \eqn{t} giving the counts \eqn{A_1,\ldots, A_t} for response "A" according to \eqn{t} categories. \eqn{m = A_1 + \ldots + A_t}. } \item{Bvec}{ vector of length \eqn{t} giving the counts \eqn{B_1,\ldots, B_t} for response "B" according to \eqn{t} categories. \eqn{n = B_1 + \ldots + B_t = N-m}. } \item{method}{= \code{c("asymptotic","simulated","exact")}, where \code{"asymptotic"} uses only an asymptotic chi-square approximation with \eqn{t-1} degrees of freedom to approximate the \eqn{P}-value. This calculation is always done. \code{"simulated"} uses \code{Nsim} simulated counts for \code{Avec} and \code{Bvec} with the observed marginal totals, \code{m, n, d = Avec+Bvec}, to estimate the \eqn{P}-value. \code{"exact"} enumerates all counts for \code{Avec} and \code{Bvec} with the observed marginal totals to get an exact \eqn{P}-value. It is used only when \code{Nsim} is at least as large as the number \code{choose(m+t-1,t-1)} of full enumerations. Otherwise, \code{method} reverts to \code{"simulated"} using the given \code{Nsim}. } \item{dist}{\code{FALSE} (default) or \code{TRUE}. If \code{dist = TRUE}, the distribution of the simulated or fully enumerated Kruskal-Wallis test statistics is returned as \code{null.dist}, if \code{dist = FALSE} the value of \code{null.dist} is \code{NULL}. The coice \code{dist = TRUE} also limits \code{Nsim <- min(Nsim,1e8)}. } \item{tab0}{\code{TRUE} (default) or \code{FALSE}. If \code{tab0 = TRUE}, the null distribution is returned in 2 column matrix form when \code{method = "simulated"}. When \code{tab0 = FALSE} the simulated null distribution is returned as a vector of all simulated values of the test statistic. } \item{Nsim}{\code{=10000} (default), number of simulated \code{Avec} splits to use. It is only used when \code{method = "simulated"}, or when \code{method = "exact"} reverts to \code{method =} \code{"simulated"}, as previously explained. } } \details{ For this data scenario the Kruskal-Wallis criterion is \deqn{K.star = \frac{N(N-1)}{mn}(\sum\frac{A_i^2}{d_i}-\frac{m^2}{N})}{K.star = N(N-1)/(mn) (\sum A_i^2/d_i-m^2/N)} with \eqn{d_i=A_i+B_i}, treating "A" responses as 1 and "B" responses as 2, and using midranks as explained in Lehmann (2006), Chapter 5.3. For small sample sizes exact null distribution calculations are possible, based on Algorithm C (Chase's sequence) in Knuth (2011), which allows the enumeration of all possible splits of \eqn{m} into counts \eqn{A_1,\ldots, A_t} such that \eqn{m = A_1 + \ldots + A_t}, followed by the calculation of the statistic \eqn{K.star} for each such split. Simulation of \eqn{A_1,\ldots, A_t} uses the probability model (5.35) in Lehmann (2006) to successively generate hypergeometric counts \eqn{A_1,\ldots, A_t}. Both these processes, enumeration and simulation, are done in C. } \value{ A list of class \code{kSamples} with components \item{test.name}{\code{"2 x t Contingency Table"}} \item{t}{number of classification categories} \item{KW.cont}{2 (3) vector giving the observed KW statistic, its asymptotic \eqn{P}-value (and simulated or exact \eqn{P}-value) } \item{null.dist}{simulated or enumerated null distribution of the test statistic. It is given as an \code{M} by 2 matrix, where the first column (named \code{KW}) gives the \code{M} unique ordered values of the Kruskal-Wallis statistic and the second column (named \code{prob}) gives the corresponding (simulated or exact) probabilities. This format of \code{null.dist} is returned when \code{method = "exact"} and \code{dist} \code{= TRUE} or when \code{method =}\code{ "simulated"} and \code{dist = TRUE} and \code{tab0} \code{= TRUE} are specified. For \code{method =} \code{"simulated"}, \code{dist = TRUE}, and \code{tab0 = FALSE} the null distribution \code{null.dist} is returned as the vector of all simulated test statistic values. This is used in \code{\link{contingency2xt.comb}} in the simulation mode. \code{null.dist = NULL} is returned when \code{dist = FALSE} or when \code{method =} \code{"asymptotic"}. } \item{method}{the \code{method} used.} \item{Nsim}{the number of simulations.} } \section{warning}{\code{method = "exact"} should only be used with caution. Computation time is proportional to the number of enumerations. In most cases \code{dist = TRUE} should not be used, i.e., when the returned distribution objects become too large for R's work space.} \references{ Knuth, D.E. (2011), \emph{The Art of Computer Programming, Volume 4A Combinatorial Algorithms Part 1}, Addison-Wesley Kruskal, W.H. (1952), A Nonparametric Test for the Several Sample Problem, \emph{The Annals of Mathematical Statistics}, \bold{Vol 23, No. 4}, 525-540 Kruskal, W.H. and Wallis, W.A. (1952), Use of Ranks in One-Criterion Variance Analysis, \emph{Journal of the American Statistical Association}, \bold{Vol 47, No. 260}, 583--621. Lehmann, E.L. (2006), \emph{Nonparametrics, Statistical Methods Based on Ranks}, Revised First Edition, Springer, New York. } \examples{ contingency2xt(c(25,15,20),c(16,6,18),method="exact",dist=FALSE, tab0=TRUE,Nsim=1e3) } \keyword{nonparametric} \keyword{htest} kSamples/man/SteelConfInt.Rd0000644000176200001440000002502612657713600015444 0ustar liggesusers\name{SteelConfInt} \alias{SteelConfInt} \title{ Simultaneous Confidence Bounds Based on Steel's Multiple Comparison Wilcoxon Tests } \description{ This function inverts pairwise Wilcoxon tests, comparing a common control sample with each of several treatment samples to provide simultaneous confidence bounds for the respective shift parameters by which the sampled treatment populations may differ from the control population. It is assumed that all samples are independent and that the sampled distributions are continuous to avoid ties. The joint coverage probability for all bounds/intervals is calculated, estimated, or approximated, see Details. For treatment of ties also see Details. } \usage{ SteelConfInt(\dots, data = NULL, conf.level = 0.95, alternative = c("less", "greater", "two.sided"), method = c("asymptotic", "exact", "simulated"), Nsim = 10000) } \arguments{ \item{\dots}{ Either several sample vectors, say \eqn{x_1, \ldots, x_k}, with \eqn{x_i} containing \eqn{n_i} sample values. \eqn{n_i > 4} is recommended for reasonable asymptotic \eqn{P}-value calculation. The pooled sample size is denoted by \eqn{N=n_1+\ldots+n_k}. The first vector serves as control sample, the others as treatment samples. or a list of such sample vectors. or a formula y ~ g, where y contains the pooled sample values and g (same length as y) is a factor with levels identifying the samples to which the elements of y belong. The lowest factor level corresponds to the control sample, the other levels to treatment samples. } \item{data}{= an optional data frame providing the variables in formula y ~ g. } \item{conf.level}{\code{= 0.95} (default) the target joint confidence level for all bounds/intervals. \code{0 < conf.level < 1}. } \item{alternative}{= \code{c("less", "greater", "two.sided")}, where \code{"less"} results in simultaneous upper confidence bounds for all shift parameters and any negative upper bound should lead to the rejection of the null hypothesis of all shift parameters being zero or positive in favor of at least one being less than zero. \code{"greater"} results in simultaneous lower confidence bounds for all shift parameters and any positive lower bound should lead to the rejection of the null hypothesis of all shift parameters being zero or negative in favor of at least one being greater than zero. \code{"two.sided"} results in simultaneous confidence intervals for all shift parameters and any interval not straddling zero should lead to the rejection of the null hypothesis of all shift parameters being zero in favor of at least one being different from zero. } \item{method}{= \code{c("asymptotic", "exact", "simulated")}, where \code{"asymptotic"} uses only an asymptotic normal approximation to approximate the achieved joint coverage probability. This calculation is always done. \code{"exact"} uses full enumeration of all sample splits to obtain the exact achieved joint coverage probability (see Details). It is used only when \code{Nsim} is at least as large as the number of full enumerations. Otherwise, \code{method} reverts to \code{"simulated"} using the given \code{Nsim}. \code{"simulated"} uses \code{Nsim} simulated random splits of the pooled samples into samples of sizes \eqn{n_1, \ldots, n_k}, to estimate the achieved joint coverage probability. } \item{Nsim}{\code{= 10000} (default), number of simulated sample splits to use. It is only used when \code{method = "simulated"}, or when \code{method = "exact"} reverts to \code{method =} \code{"simulated"}, as previously explained. } } \details{The first sample is treated as control sample with sample size \eqn{n_1}. The remaining \eqn{s=k-1} samples are treatment samples. Let \eqn{W_{1i}, i=2,\ldots,k} denote the respective Wilcoxon statistics comparing the common control sample (index 1) with each of the \eqn{s} treatment samples (indexed by \eqn{i}). For each comparison of control and treatment \eqn{i} sample only the observations of the two samples involved are ranked. By \eqn{W_i=W_{1i}-n_i(n_i+1)/2} we denote the corresponding Mann-Whitney test statistic. Furthermore, let \eqn{D_{i(j)}} denote the \eqn{j}-th ordered value (ascending order) of the \eqn{n_1n_i} paired differences between the observations in treatment sample \eqn{i} and those of the control sample. By simple extension of results in Lehmann (2006), pages 87 and 92, the following equations hold, relating the null distribution of the Mann-Whitney statistics and the joint coverage probabilities of the \eqn{D_{i(j_i)}} for any set of \eqn{j_1,\ldots,j_s} with \eqn{1\le j_i \le n_1 n_i}. \deqn{P_\Delta(\Delta_i \le D_{i(j_i)}, i=2,\ldots,k)=P_0(W_i\le j_i -1, i=2,\ldots,k)} and \deqn{P_\Delta(\Delta_i \ge D_{i(j_i)}, i=2,\ldots,s)=P_0(W_{i}\le n_1 n_i -j_i, i=2,\ldots,k)} where \eqn{P_\Delta} refers to the distribution under \eqn{\Delta=(\Delta_2,\ldots,\Delta_k)} and \eqn{P_0} refers to the joint null distribution of the \eqn{W_i} when all sampled distributions are the same and continuous. There are \eqn{k-1} indices \eqn{j_i} that can be manipulated to affect the achieved confidence level. To limit the computational complexity standardized versions of the \eqn{W_i}, i.e., \eqn{(W_i-\mu_i)/\tau_i} with \eqn{\mu_i} and \eqn{\tau_i} representing mean and standard deviation of \eqn{W_i}, are used to choose a common value for \eqn{(j_i -1-\mu_i)/\tau_i} (satisfying the \eqn{\gamma} level) from the multivariate normal approximation for the \eqn{W_i} (see Miller (1981) and Scholz (2016)), and reduce that to integer values for \eqn{j_i}, rounding up, rounding down, and rounding to the nearest integer. These integers \eqn{j_i} are then used in approximating the actual joint probabilities \eqn{P_0(W_i\le j_i -1, i=2,\ldots,k)}, and from these three coverage probabilities the one that is closest to the nominal confidence level \eqn{\gamma} and \eqn{\ge \gamma} and also also the one that is closest without the restriction \eqn{\ge \gamma} are chosen. When \code{method = "exact"} or \code{= "simulated"} is specified, the same process is used, using either the fully enumerated exact distribution of \eqn{W_i, i=2,\ldots,k} (based on a recursive version of Chase's sequence as presented in Knuth (2011)) for all sample splits, or the simulated distribution of \eqn{W_i, i=2,\ldots,k}. However, since these distributions are discrete the starting point before rounding up is the smallest quantile such that the proportion of distribution values less or equal to it is at least \eqn{\gamma}. The starting point before rounding down is the highest quantile such that the proportion of distribution values less or equal to it is at most \eqn{\gamma}. The third option of rounding to the closest integer is performed using the average of the first two. Confidence intervals are constructed by using upper and lower confidence bounds, each with same confidence level of \eqn{(1+\gamma)/2}. When the original sample data appear to be rounded, and especially when there are ties, one should widen the computed intervals or bounds by the rounding \eqn{\epsilon}, as illustrated in Lehmann (2006), pages 85 and 94. For example, when all sample values appear to end in one of \eqn{.0, .2, .4, .6, .8}, the rounding \eqn{\epsilon} would be \eqn{.2}. Ultimately, this is a judgment call for the user. Such widening of intervals will make the actually achieved confidence level \eqn{\ge} the stated achieved level. } \value{ A list of class \code{kSamples} with components \item{test.name}{\code{"Steel.bounds"}} \item{n1}{the control sample size \eqn{= n_1}} \item{ns}{vector \eqn{(n_2,\ldots,n_k)} of the \eqn{s=k-1} treatment sample sizes} \item{N}{size of the pooled sample \eqn{= n_1+\ldots+n_k}} \item{n.ties}{number of ties in the pooled sample} \item{bounds}{a list of data frames. When \code{method = "asymptotic"} is specified, the list consists of two data frames named \code{conservative.bounds.asymptotic} and \code{closest.bounds.asymptotic}. Each data frame consists of \eqn{s} rows corresponding to the \eqn{s} shift parameters \eqn{\Delta_i} and three columns, the first column giving the lower bound, the second column the upper bound, while the first row of the third column states the computed confidence level by asymptotic approximation, applying jointly to all \eqn{s} sets of bounds. For one-sided bounds the corresponding other bound is set to \code{Inf} or \code{-Inf}, respectively. In case of \code{conservative.bounds.asymptotic} the achieved asymptotic confidence level is targeted to be \eqn{\ge} \code{conf.level}, but closest to it among three possible choices (see Details). In the case of \code{closest.bounds.asymptotic} the achieved level is targeted to be closest to \code{conf.level}. When \code{method = "exact"} or \code{method = "simulated"} is specified the list consists in addition of two further data frames named either \code{conservative.bounds.exact} and \code{closest.bounds.exact} or \code{conservative.bounds.simulated} and \code{closest.bounds.simulated}. In either case the structure and meaning of these data frames parallels that of the \code{"asymptotic"} case. } \item{method}{the \code{method} used.} \item{Nsim}{the number of simulations used.} \item{j.LU}{an \eqn{s} by 4 matrix giving the indices \eqn{j_i} used for computing the bounds \eqn{D_{i(j_i)}} for \eqn{\Delta_i, i=1,\ldots, s}. } } \references{ Knuth, D.E. (2011), \emph{The Art of Computer Programming, Volume 4A Combinatorial Algorithms Part 1}, Addison-Wesley Lehmann, E.L. (2006), \emph{Nonparametrics, Statistical Methods Based on Ranks, Revised First Edition}, Springer Verlag. Miller, Rupert G., Jr. (1981), \emph{Simultaneous Statistical Inference, Second Edition}, Springer Verlag, New York. Scholz, F.W. (2016), "On Steel's Test with Ties", submitted to \emph{Journal of Nonparametric Statistics}. } \section{warning}{\code{method = "exact"} should only be used with caution. Computation time is proportional to the number of enumerations. Experiment with \code{\link{system.time}} and trial values for \code{Nsim} to get a sense of the required computing time.} \examples{ z1 <- c(103, 111, 136, 106, 122, 114) z2 <- c(119, 100, 97, 89, 112, 86) z3 <- c( 89, 132, 86, 114, 114, 125) z4 <- c( 92, 114, 86, 119, 131, 94) set.seed(2627) SteelConfInt(list(z1,z2,z3,z4),conf.level=0.95,alternative="two.sided", method="simulated",Nsim=10000) # or with same seed # SteelConfInt(z1,z2,z3,z4,conf.level=0.95,alternative="two.sided", # method="simulated",Nsim=10000) } \keyword{nonparametric} \keyword{htest} \keyword{design} kSamples/man/ad.pval.Rd0000644000176200001440000000641412612761204014426 0ustar liggesusers\name{ad.pval} \alias{ad.pval} \title{ \eqn{P}-Value for the Asymptotic Anderson-Darling Test Distribution } \description{ This function computes upper tail probabilities for the limiting distribution of the standardized Anderson-Darling test statistic. } \usage{ ad.pval(tx,m,version=1) } \arguments{ \item{tx}{ a vector of desired thresholds \eqn{\ge 0} } \item{m}{ The degrees of freedom for the asymptotic standardized Anderson-Darling test statistic } \item{version}{\code{= 1} (default) if \eqn{P}-value for version 1 of the test statistic is desired, otherwise the version 2 \eqn{P}-value is calculated. } } \details{ Extensive simulations (sampling from a common continuous distribution) were used to extend the range of the asymptotic \eqn{P}-value calculation from the original \eqn{[.01,.25]} in Table 1 of the reference paper to 36 quantiles corresponding to \eqn{P} = .00001, .00005, .0001, .0005, .001, .005, .01, .025, .05, .075, .1, .2, .3, .4, .5, .6, .7, .8, .9, .925, .95, .975, .99, .9925, .995, .9975, .999, .99925, .9995, .99975, .9999, .999925, .99995, .999975, .99999. Note that the entries of the original Table 1 were obtained by using the first 4 moments of the asymptotic distribution and a Pearson curve approximation. Using \code{ad.test}, 1 million replications of the standardized \eqn{AD} statistics with sample sizes \eqn{n_i=500}{n.i=500}, \eqn{i=1,\ldots,k} were run for \eqn{k=2,3,4,5,7} (\eqn{k=2} was done twice). These values of \eqn{k} correspond to degrees of freedom \eqn{m=k-1=1,2,3,4,6} in the asymptotic distribution. The random variable described by this distribution is denoted by \eqn{T_m}. The actual variances (for \eqn{n_i=500}) agreed fairly well with the asymptotic variances. Using the convolution nature of the asymptotic distribution, the performed simulations were exploited to result in an effective simulation of 2 million cases, except for \eqn{k=11}, i.e., \eqn{m=k-1=10}, for which the asymptotic distribution of \eqn{T_{10}} was approximated by the sum of the \eqn{AD} statistics for \eqn{k=7} and \eqn{k=5}, for just the 1 million cases run for each \eqn{k}. The interpolation of tail probabilities \eqn{P} for any desired \eqn{k} is done in two stages. First, a spline in \eqn{1/\sqrt{m}}{1/sqrt(m)} is fitted to each of the 36 quantiles obtained for \eqn{m=1,2,3,4,6,8,10,\infty} to obtain the corresponding interpolated quantiles for the \eqn{m} in question. Then a spline is fitted to the \eqn{\log((1-P)/P)}{log((1-P)/P)} as a function of these 36 interpolated quantiles. This latter spline is used to determine the tail probabilities \eqn{P} for the specified threshold \code{tx}, corresponding to either \eqn{AD} statistic version. The above procedure is based on simulations for either version of the test statistic, appealing to the same limiting distribution. } \value{ a vector of upper tail probabilities corresponding to \code{tx} } \references{ Scholz, F. W. and Stephens, M. A. (1987), K-sample Anderson-Darling Tests, \emph{Journal of the American Statistical Association}, \bold{Vol 82, No. 399}, 918--924. } \seealso{ \code{\link{ad.test}}, \code{\link{ad.test.combined}} } \examples{ ad.pval(tx=c(3.124,5.65),m=2,version=1) ad.pval(tx=c(3.124,5.65),m=2,version=2) } \keyword{nonparametric} \keyword{htest} kSamples/man/kSamples-package.Rd0000644000176200001440000000660413304624634016256 0ustar liggesusers\name{kSamples-package} \alias{kSamples-package} \alias{kSamples} \docType{package} \title{ The Package kSamples Contains Several Nonparametric K-Sample Tests and their Combinations over Blocks } \description{ The k-sample Anderson-Darling, Kruskal-Wallis, normal score and van der Waerden score tests are used to test the hypothesis that k samples of sizes \eqn{n_1, \ldots, n_k} come from a common continuous distribution \eqn{F} that is otherwise unspecified. They are rank tests. Average rank scores are used in case of ties. While \code{\link{ad.test}} is consistent against all alternatives, \code{\link{qn.test}} tends to be sensitive mainly to shifts between samples. The combined versions of these tests, \code{\link{ad.test.combined}} and \code{\link{qn.test.combined}}, are used to simultaneously test such hypotheses across several blocks of samples. The hypothesized common distributions and the number k of samples for each block of samples may vary from block to block. The Jonckheere-Terpstra test addresses the same hypothesis as above but is sensitive to increasing alternatives (stochastic ordering). Also treated is the analysis of 2 x t contingency tables using the Kruskal-Wallis criterion and its extension to blocks. Steel's simultaneous comparison test of a common control sample with \eqn{s=k-1} treatment samples using pairwise Wilcoxon tests for each control/treatment pair is provided, and also the simultaneous confidence bounds of treatment shift effects resulting from the inversion of these tests when sampling from continuous populations. Distributional aspects are handled asymptotically in all cases, and by choice also via simulation or exact enumeration. While simulation is always an option, exact calculations are only possible for small sample sizes and only when few samples are involved. These exact calculations can be done with or without ties in the pooled samples, based on a recursively extended version of Algorithm C (Chase's sequence) in Knuth (2011), which allows the enumeration of all possible splits of the pooled data into samples of sizes of \eqn{n_1, \ldots, n_k}, as appropriate under treatment randomization or random sampling, when viewing tests conditionally given the observed tie pattern. } \details{ \tabular{ll}{ Package: \tab kSamples\cr Type: \tab Package\cr Version: \tab 1.2-8\cr Date: \tab 2018-06-02\cr License: \tab GPL (>=2)\cr LazyLoad: \tab yes\cr } } \author{ Fritz Scholz and Angie Zhu Maintainer: Fritz Scholz } \references{ Hajek, J., Sidak, Z., and Sen, P.K. (1999), \emph{Theory of Rank Tests (Second Edition)}, Academic Press. Knuth, D.E. (2011), \emph{The Art of Computer Programming, Volume 4A Combinatorial Algorithms Part 1}, Addison-Wesley Kruskal, W.H. (1952), A Nonparametric Test for the Several Sample Problem, \emph{The Annals of Mathematical Statistics}, \bold{Vol 23, No. 4}, 525-540 Kruskal, W.H. and Wallis, W.A. (1952), Use of Ranks in One-Criterion Variance Analysis, \emph{Journal of the American Statistical Association}, \bold{Vol 47, No. 260}, 583--621. Lehmann, E.L. (2006), \emph{Nonparametrics, Statistical Methods Based on Ranks}, Revised First Edition, Springer, New York. Scholz, F. W. and Stephens, M. A. (1987), K-sample Anderson-Darling Tests, \emph{Journal of the American Statistical Association}, \bold{Vol 82, No. 399}, 918--924. } \keyword{nonparametric} \keyword{htest} \keyword{design} kSamples/man/conv.Rd0000644000176200001440000000176612015512402014042 0ustar liggesusers\name{conv} \alias{conv} \title{ Convolution of Two Discrete Distributions } \description{ This function convolutes two discrete distribution, each given by strictly increasing support vectors and corresponding probability vectors. } \usage{ conv(x1,p1,x2,p2) } \arguments{ \item{x1}{ support vector of the first distribution, with strictly increasing elements. } \item{p1}{vector of probabilities corresponding to \code{x1}. } \item{x2}{ support vector of the second distribution, with strictly increasing elements. } \item{p2}{vector of probabilities corresponding to \code{x2}. } } \details{ The convolution is performed in C, looping through all paired sums, augmenting existing values or inserting them with an update of the corresponding probabilities. } \value{ A matrix with first column the new support vector and the second column the corresponding probability vector. } \examples{ x1 <- c(1,2,3.5) p1 <- c(.2,.3,.5) x2 <- c(0,2.3,3,4) p2 <- c(.1,.3,.3,.3) conv(x1,p1,x2,p2) } kSamples/man/jt.dist.Rd0000644000176200001440000000472712606575532014477 0ustar liggesusers\name{JT.dist} \alias{djt} \alias{pjt} \alias{qjt} \title{ Null Distribution of the Jonckheere-Terpstra k-Sample Test Statistic } \description{ The Jonckheere-Terpstra k-sample test statistic JT is defined as \eqn{JT = \sum_{i= p_i}. } \references{ Harding, E.F. (1984), An Efficient, Minimal-storage Procedure for Calculating the Mann-Whitney U, Generalized U and Similar Distributions, \emph{Appl. Statist.} \bold{33} No. 1, 1-6. Jonckheere, A.R. (1954), A Distribution Free \emph{k}-sample Test against Ordered Alternatives, \emph{Biometrika}, \bold{41}, 133-145. Lehmann, E.L. (2006), \emph{Nonparametrics, Statistical Methods Based on Ranks, Revised First Edition}, Springer Verlag. Terpstra, T.J. (1952), The Asymptotic Normality and Consistency of Kendall's Test against Trend, when Ties are Present in One Ranking, \emph{Indagationes Math.} \bold{14}, 327-333. } \examples{ djt(c(-1.5,1.2,3), 2:4) pjt(c(2,3.4,7), 3:5) qjt(c(0,.2,.5), 2:4) } \keyword{nonparametric} \keyword{htest} \keyword{design} kSamples/man/jt.test.Rd0000644000176200001440000001247212657707036014511 0ustar liggesusers\name{jt.test} \alias{jt.test} \title{ Jonckheere-Terpstra k-Sample Test for Increasing Alternatives } \description{ The Jonckheere-Terpstra k-sample test statistic JT is defined as \eqn{JT = \sum_{i 4} is recommended for reasonable asymptotic \eqn{P}-value calculation. The pooled sample size is denoted by \eqn{N=n_1+\ldots+n_k}. The order of samples should be as stipulated under the alternative or a list of such sample vectors, or a formula y ~ g, where y contains the pooled sample values and g (same length as y) is a factor with levels identifying the samples to which the elements of y belong, the factor levels reflecting the order under the stipulated alternative, } \item{data}{= an optional data frame providing the variables in formula y ~ g. } \item{method}{= \code{c("asymptotic","simulated","exact")}, where \code{"asymptotic"} uses only an asymptotic normal \eqn{P}-value approximation. \code{"simulated"} uses \code{Nsim} simulated \eqn{JT} statistics based on random splits of the pooled samples into samples of sizes \eqn{n_1, \ldots, n_k}, to estimate the \eqn{P}-value. \code{"exact"} uses full enumeration of all sample splits with resulting \eqn{JT} statistics to obtain the exact \eqn{P}-value. It is used only when \code{Nsim} is at least as large as the number \deqn{ncomb = \frac{N!}{n_1!\ldots n_k!}}{N!/(n_1!\ldots n_k!)} of full enumerations. Otherwise, \code{method} reverts to \code{"simulated"} using the given \code{Nsim}. It also reverts to \code{"simulated"} when \eqn{ncomb > 1e8} and \code{dist = TRUE}. } \item{dist}{\code{= FALSE} (default) or \code{TRUE}. If \code{TRUE}, the simulated or fully enumerated distribution vector \code{null.dist} is returned for the JT test statistic. Otherwise, \code{NULL} is returned. When \code{dist = TRUE} then \code{Nsim <- min(Nsim, 1e8)}, to limit object size. } \item{Nsim}{\code{= 10000} (default), number of simulation sample splits to use. It is only used when \code{method = "simulated"}, or when \code{method = "exact"} reverts to \code{method =} \code{ "simulated"}, as previously explained. } } \details{ The JT statistic is used to test the hypothesis that the samples all come from the same but unspecified continuous distribution function \eqn{F(x)}. It is specifically aimed at alternatives where the sampled distributions are stochastically increasing. NA values are removed and the user is alerted with the total NA count. It is up to the user to judge whether the removal of NA's is appropriate. The continuity assumption can be dispensed with, if we deal with independent random samples, or if randomization was used in allocating subjects to samples or treatments, and if we view the simulated or exact \eqn{P}-values conditionally, given the tie pattern in the pooled samples. Of course, under such randomization any conclusions are valid only with respect to the group of subjects that were randomly allocated to their respective samples. The asymptotic \eqn{P}-value calculation is valid provided all sample sizes become large. } \value{ A list of class \code{kSamples} with components \item{test.name}{\code{"Jonckheere-Terpstra"}} \item{k}{number of samples being compared} \item{ns}{vector \eqn{(n_1,\ldots,n_k)} of the \eqn{k} sample sizes} \item{N}{size of the pooled sample \eqn{= n_1+\ldots+n_k}} \item{n.ties}{number of ties in the pooled sample} \item{qn}{4 (or 5) vector containing the observed \eqn{JT}, its mean and standard deviation and its asymptotic \eqn{P}-value, (and its simulated or exact \eqn{P}-value)} \item{warning}{logical indicator, \code{warning = TRUE} when at least one \eqn{n_i < 5}{n.i < 5}} \item{null.dist}{simulated or enumerated null distribution of the test statistic. It is \code{NULL} when \code{dist = FALSE} or when \code{method = "asymptotic"}.} \item{method}{the \code{method} used.} \item{Nsim}{the number of simulations used.} } \references{ Harding, E.F. (1984), An Efficient, Minimal-storage Procedure for Calculating the Mann-Whitney U, Generalized U and Similar Distributions, \emph{Appl. Statist.} \bold{33} No. 1, 1-6. Jonckheere, A.R. (1954), A Distribution Free \emph{k}-sample Test against Ordered Alternatives, \emph{Biometrika}, \bold{41}, 133-145. Lehmann, E.L. (2006), \emph{Nonparametrics, Statistical Methods Based on Ranks, Revised First Edition}, Springer Verlag. Terpstra, T.J. (1952), The Asymptotic Normality and Consistency of Kendall's Test against Trend, when Ties are Present in One Ranking, \emph{Indagationes Math.} \bold{14}, 327-333. } \examples{ x1 <- c(1,2) x2 <- c(1.5,2.1) x3 <- c(1.9,3.1) yy <- c(x1,x2,x3) gg <- as.factor(c(1,1,2,2,3,3)) jt.test(x1, x2, x3,method="exact",Nsim=90) # or # jt.test(list(x1, x2, x3), method = "exact", Nsim = 90) # or # jt.test(yy ~ gg, method = "exact", Nsim = 90) } \keyword{nonparametric} \keyword{htest} \keyword{design} kSamples/man/ad.test.Rd0000644000176200001440000002027213470613006014440 0ustar liggesusers\name{ad.test} \alias{ad.test} \title{ Anderson-Darling k-Sample Test } \description{ This function uses the Anderson-Darling criterion to test the hypothesis that \eqn{k} independent samples with sample sizes \eqn{n_1,\ldots, n_k} arose from a common unspecified distribution function \eqn{F(x)} and testing is done conditionally given the observed tie pattern. Thus this is a permutation test. Both versions of the \eqn{AD} statistic are computed. } \usage{ ad.test(\dots, data = NULL, method = c("asymptotic", "simulated", "exact"), dist = FALSE, Nsim = 10000) } \arguments{ \item{\dots}{ Either several sample vectors, say \eqn{x_1, \ldots, x_k}, with \eqn{x_i} containing \eqn{n_i} sample values. \eqn{n_i > 4} is recommended for reasonable asymptotic \eqn{P}-value calculation. The pooled sample size is denoted by \eqn{N=n_1+\ldots+n_k}, or a list of such sample vectors, or a formula y ~ g, where y contains the pooled sample values and g is a factor (of same length as y) with levels identifying the samples to which the elements of y belong. } \item{data}{= an optional data frame providing the variables in formula y ~ g. } \item{method}{= \code{c("asymptotic","simulated","exact")}, where \code{"asymptotic"} uses only an asymptotic \eqn{P}-value approximation, reasonable for P in [.00001, .99999] when all \eqn{n_i > 4}. Linear extrapolation via \eqn{\log(P/(1-P))}{log(P/(1-P))} is used outside [.00001, .99999]. This calculation is always done. See \code{\link{ad.pval}} for details. The adequacy of the asymptotic \eqn{P}-value calculation may be checked using \code{\link{pp.kSamples}}. \code{"simulated"} uses \code{Nsim} simulated \eqn{AD} statistics, based on random splits of the pooled samples into samples of sizes \eqn{n_1, \ldots, n_k}, to estimate the exact conditional \eqn{P}-value. \code{"exact"} uses full enumeration of all sample splits with resulting \eqn{AD} statistics to obtain the exact conditional \eqn{P}-values. It is used only when \code{Nsim} is at least as large as the number \deqn{ncomb = \frac{N!}{n_1!\ldots n_k!}}{N!/(n_1!\ldots n_k!)} of full enumerations. Otherwise, \code{method} reverts to \code{"simulated"} using the given \code{Nsim}. It also reverts to \code{"simulated"} when \eqn{ncomb > 1e8} and \code{dist = TRUE}. } \item{dist}{\code{= FALSE} (default) or \code{TRUE}. If \code{TRUE}, the simulated or fully enumerated distribution vectors \code{null.dist1} and \code{null.dist2} are returned for the respective test statistic versions. Otherwise, \code{NULL} is returned. When \code{dist = TRUE} then \code{Nsim <- min(Nsim, 1e8)}, to limit object size. } \item{Nsim}{\code{= 10000} (default), number of simulation sample splits to use. It is only used when \code{method = "simulated"}, or when \code{method = "exact"} reverts to \code{method =} \code{ "simulated"}, as previously explained. } } \details{ If \eqn{AD} is the Anderson-Darling criterion for the \eqn{k} samples, its standardized test statistic is \eqn{T.AD = (AD - \mu)/\sigma}, with \eqn{\mu = k-1} and \eqn{\sigma} representing mean and standard deviation of \eqn{AD}. This statistic is used to test the hypothesis that the samples all come from the same but unspecified continuous distribution function \eqn{F(x)}. According to the reference article, two versions of the \eqn{AD} test statistic are provided. The above mean and standard deviation are strictly valid only for version 1 in the continuous distribution case. NA values are removed and the user is alerted with the total NA count. It is up to the user to judge whether the removal of NA's is appropriate. The continuity assumption can be dispensed with, if we deal with independent random samples, or if randomization was used in allocating subjects to samples or treatments, and if we view the simulated or exact \eqn{P}-values conditionally, given the tie pattern in the pooled samples. Of course, under such randomization any conclusions are valid only with respect to the group of subjects that were randomly allocated to their respective samples. The asymptotic \eqn{P}-value calculation assumes distribution continuity. No adjustment for lack thereof is known at this point. For details on the asymptotic \eqn{P}-value calculation see \code{\link{ad.pval}}. } \value{ A list of class \code{kSamples} with components \item{test.name}{\code{"Anderson-Darling"}} \item{k}{number of samples being compared} \item{ns}{vector of the \eqn{k} sample sizes \eqn{(n_1,\ldots,n_k)}} \item{N}{size of the pooled sample \eqn{= n_1+\ldots+n_k}} \item{n.ties}{number of ties in the pooled samples} \item{sig}{standard deviations \eqn{\sigma} of version 1 of \eqn{AD} under the continuity assumption} \item{ad}{2 x 3 (2 x 4) matrix containing \eqn{AD, T.AD}, asymptotic \eqn{P}-value, (simulated or exact \eqn{P}-value), for each version of the standardized test statistic \eqn{T}, version 1 in row 1, version 2 in row 2.} \item{warning}{logical indicator, warning = TRUE when at least one \eqn{n_i < 5}{n.i < 5}} \item{null.dist1}{simulated or enumerated null distribution of version 1 of the test statistic, given as vector of all generated \eqn{AD} statistics.} \item{null.dist2}{simulated or enumerated null distribution of version 2 of the test statistic, given as vector of all generated \eqn{AD} statistics.} \item{method}{The \code{method} used.} \item{Nsim}{The number of simulations.} } \section{warning }{\code{method = "exact"} should only be used with caution. Computation time is proportional to the number of enumerations. In most cases \code{dist = TRUE} should not be used, i.e., when the returned distribution vectors \code{null.dist1} and \code{null.dist2} become too large for the R work space. These vectors are limited in length by 1e8. } \note{ For small sample sizes and small \eqn{k} exact null distribution calculations are possible (with or without ties), based on a recursively extended version of Algorithm C (Chase's sequence) in Knuth (2011), Ch. 7.2.1.3, which allows the enumeration of all possible splits of the pooled data into samples of sizes of \eqn{n_1, \ldots, n_k}, as appropriate under treatment randomization. The enumeration and simulation are both done in C. } \note{ It has recently come to our attention that the Anderson-Darling test, originally proposed by Pettitt (1976) in the 2-sample case and generalized to k samples by Scholz and Stephens, has a close relative created by Baumgartner et al (1998) in the 2 sample case and populatized by Neuhaeuser (2012) with at least 6 papers among his cited references and generalized by Murakami (2006) to k samples. } \references{ Baumgartner, W., Weiss, P. and Schindler, H. (1998), A nonparametric test for the general two-sample problem, \emph{Bionetrics}, \bold{54}, 1129-1135. Knuth, D.E. (2011), \emph{The Art of Computer Programming, Volume 4A Combinatorial Algorithms Part 1}, Addison-Wesley Neuhaeuser, M. (2012), \emph{Nonparametric Statistical Tests, A Computational Approach}, CRC Press. Murakami, H. (2006), A k-sample rank test based on modified Baumgartner statistic and it power comparison, \emph{Jpn. Soc. Comp. Statist.}, \bold{19}, 1-13. Murakami, H. (2012), Modified Baumgartner statistic for the two-sample and multisample problems: a numerical comparison. \emph{J. of Statistical Comput. and Simul.}, \bold{82:5}, 711-728. Pettitt, A.N. (1976), A two-sample Anderson_Darling rank statistic, \emph{Biometrika}, \bold{63}, 161-168. Scholz, F. W. and Stephens, M. A. (1987), K-sample Anderson-Darling Tests, \emph{Journal of the American Statistical Association}, \bold{Vol 82, No. 399}, 918--924. } \seealso{ \code{\link{ad.test.combined}}, \code{\link{ad.pval}} } \examples{ u1 <- c(1.0066, -0.9587, 0.3462, -0.2653, -1.3872) u2 <- c(0.1005, 0.2252, 0.4810, 0.6992, 1.9289) u3 <- c(-0.7019, -0.4083, -0.9936, -0.5439, -0.3921) y <- c(u1, u2, u3) g <- as.factor(c(rep(1, 5), rep(2, 5), rep(3, 5))) set.seed(2627) ad.test(u1, u2, u3, method = "exact", dist = FALSE, Nsim = 1000) # or with same seed # ad.test(list(u1, u2, u3), method = "exact", dist = FALSE, Nsim = 1000) # or with same seed # ad.test(y ~ g, method = "exact", dist = FALSE, Nsim = 1000) } \keyword{nonparametric} \keyword{htest} \keyword{design} kSamples/man/qn.test.combined.Rd0000644000176200001440000001760613300401002016237 0ustar liggesusers\name{qn.test.combined} \alias{qn.test.combined} \title{ Combined Rank Score k-Sample Tests } \description{ This function combines several independent rank score \eqn{k}-sample tests into one overall test of the hypothesis that the independent samples within each block come from a common unspecified distribution, while the common distributions may vary from block to block. } \usage{ qn.test.combined(\dots, data = NULL, test = c("KW", "vdW", "NS"), method = c("asymptotic", "simulated", "exact"), dist = FALSE, Nsim = 10000) } \arguments{ \item{\dots}{ Either a sequence of several lists, say \eqn{L_1, \ldots, L_M} (\eqn{M > 1}) where list \eqn{L_i} contains \eqn{k_i > 1} sample vectors of respective sizes \eqn{n_{i1}, \ldots, n_{ik_i}}, where \eqn{n_{ij} > 4} is recommended for reasonable asymptotic \eqn{P}-value calculation. \eqn{N_i=n_{i1}+\ldots+n_{ik_i}} is the pooled sample size for block \eqn{i}, or a list of such lists, or a formula, like y ~ g | b, where y is a numeric response vector, g is a factor with levels indicating different treatments and b is a factor indicating different blocks; y, g, b have same length. y is split separately for each block level into separate samples according to the g levels. The same g level may occur in different blocks. The variable names may correspond to variables in an optionally supplied data frame via the data = argument. } \item{data}{= an optional data frame providing the variables in formula input } \item{test}{= \code{c("KW", "vdW", "NS")}, where \code{"KW"} uses scores \code{1:N} (Kruskal-Wallis test) \code{"vdW"} uses van der Waerden scores, \code{qnorm( (1:N) / (N+1) )} \code{"NS"} uses normal scores, i.e., expected values of standard normal order statistics, invoking function \code{normOrder} of \code{package SuppDists (>=1.1-9.4)} For the above scores \eqn{N} changes from block to block and represents the total pooled sample size \eqn{N_i}{N.i} for block \eqn{i}. } \item{method}{=\code{c("asymptotic","simulated","exact")}, where \code{"asymptotic"} uses only an asymptotic chi-square approximation for the \eqn{P}-value. The adequacy of asymptotic \eqn{P}-values for use with moderate sample sizes may be checked with \code{method = "simulated"}. \code{"simulated"} uses simulation to get \code{Nsim} simulated \eqn{QN} statistics for each block of samples, adding them component wise across blocks to get \code{Nsim} combined values, and compares these with the observed combined value to get the estimated \eqn{P}-value. \code{"exact"} uses full enumeration of the test statistic value for all sample splits of the pooled samples within each block. The test statistic vectors for each block are added (each component against each component, as in the R \code{outer(x,y,} \code{"+")} command) to get the convolution enumeration for the combined test statistic. This "addition" is done one block at a time. It is possible only for small problems, and is attempted only when \code{Nsim} is at least the (conservatively maximal) length \deqn{\frac{N_1!}{n_{11}!\ldots n_{1k_1}!}\times\ldots\times \frac{N_M!}{n_{M1}!\ldots n_{Mk_M}!}} of the final distribution vector, were \eqn{N_i = n_{i1}+\ldots+n_{ik_i}} is the sample size of the pooled samples for the i-th block. Otherwise, it reverts to the simulation method using the provided \code{Nsim}. } \item{dist}{\code{FALSE} (default) or \code{TRUE}. If \code{TRUE}, the simulated or fully enumerated convolution vector \code{null.dist} is returned for the \eqn{QN} test statistic. Otherwise, \code{NULL} is returned. } \item{Nsim}{\code{= 10000} (default), number of simulation splits to use within each block of samples. It is only used when \code{method =} \code{"simulated"} or when \code{method =} \code{"exact"} reverts to \code{method =} \code{"simulated"}, as previously explained. Simulations are independent across blocks, using \code{Nsim} for each block. } } \details{ The rank score \eqn{QN} criterion \eqn{QN_i}{QN.i} for the \eqn{i}-th block of \eqn{k_i}{k.i} samples, is used to test the hypothesis that the samples in the \eqn{i}-th block all come from the same but unspecified continuous distribution function \eqn{F_i(x)}{F.i(x)}. See \code{\link{qn.test}} for the definition of the \eqn{QN} criterion and the exact calculation of its null distribution. The combined \eqn{QN} criterion \eqn{QN_{\rm comb} = QN_1 + \ldots + QN_M}{% QN.comb = QN.1 + \ldots + QN.M} is used to simultaneously test whether the samples in block i come from the same continuous distribution function \eqn{F_i(x)}{F.i(x)}. However, the unspecified common distribution function \eqn{F_i(x)}{F.i(x)} may change from block to block. The \eqn{k} for each block of \eqn{k} independent samples may change from block to block. The asymptotic approximating chi-square distribution has \eqn{f = (k_1-1)+\ldots+(k_M-1)}{f = (k.1-1)+\ldots+(k.M-1)} degrees of freedom. NA values are removed and the user is alerted with the total NA count. It is up to the user to judge whether the removal of NA's is appropriate. The continuity assumption can be dispensed with if we deal with independent random samples, or if randomization was used in allocating subjects to samples or treatments, independently from block to block, and if the asymptotic, simulated or exact \eqn{P}-values are viewed conditionally, given the tie patterns within each block. Under such randomization any conclusions are valid only with respect to the blocks of subjects that were randomly allocated. In case of ties the average rank scores are used across tied observations within each block. } \value{ A list of class \code{kSamples} with components \item{test.name}{\code{"Kruskal-Wallis"}, \code{"van der Waerden scores"}, or \code{"normal scores"}} \item{M}{number of blocks of samples being compared} \item{n.samples}{list of \code{M} vectors, each vector giving the sample sizes for each block of samples being compared} \item{nt}{vector of length \code{M} of total sample sizes involved in each of the \code{M} comparisons of \eqn{k_i}{k.i} samples, respectively} \item{n.ties}{vector giving the number of ties in each the \code{M} comparison blocks} \item{qn.list}{list of \code{M} matrices giving the \code{qn} results from \code{qn.test}, applied to the samples in each of the \code{M} blocks} \item{qn.c}{2 (or 3) vector containing the observed \eqn{QN_{\rm comb}}{QN.comb}, asymptotic \eqn{P}-value, (simulated or exact \eqn{P}-value).} \item{warning}{logical indicator, \code{warning = TRUE} when at least one \eqn{n_{ij} < 5}{n.ij < 5}.} \item{null.dist}{simulated or enumerated null distribution of the \eqn{QN_{\rm comb}}{QN.comb} statistic. It is \code{NULL} when \code{dist = FALSE} or when \code{method = "asymptotic"}.} \item{method}{The \code{method} used.} \item{Nsim}{The number of simulations used for each block of samples.} } \references{ Lehmann, E.L. (2006), \emph{Nonparametric, Statistical Methods Based on Ranks}, Springer Verlag, New York. Ch. 6, Sec. 5D. } \note{ These tests are useful in analyzing treatment effects of shift nature in randomized (incomplete) block experiments. } \seealso{ \code{\link{qn.test}} } \examples{ ## Create two lists of sample vectors. x1 <- list( c(1, 3, 2, 5, 7), c(2, 8, 1, 6, 9, 4), c(12, 5, 7, 9, 11) ) x2 <- list( c(51, 43, 31, 53, 21, 75), c(23, 45, 61, 17, 60) ) # and a corresponding data frame datx1x2 x1x2 <- c(unlist(x1),unlist(x2)) gx1x2 <- as.factor(c(rep(1,5),rep(2,6),rep(3,5),rep(1,6),rep(2,5))) bx1x2 <- as.factor(c(rep(1,16),rep(2,11))) datx1x2 <- data.frame(A = x1x2, G = gx1x2, B = bx1x2) ## Run qn.test.combined. set.seed(2627) qn.test.combined(x1, x2, method = "simulated", Nsim = 1000) # or with same seed # qn.test.combined(list(x1, x2), method = "simulated", Nsim = 1000) # or qn.test.combined(A~G|B,data=datx1x2,method="simulated",Nsim=1000) } \keyword{nonparametric} \keyword{htest} \keyword{design}