clusterGeneration/0000755000176000001440000000000012471155450014003 5ustar ripleyusersclusterGeneration/NAMESPACE0000644000176000001440000000046610700717035015224 0ustar ripleyusers export(genRandomClust, simClustDesign, rcorrmatrix, genPositiveDefMat, sepIndexTheory, sepIndexData, getSepProjTheory, getSepProjData, nearestNeighborSepVal, plot1DProjection, plot2DProjection, viewClusters ) import(MASS) clusterGeneration/NEWS0000644000176000001440000000510112470012662014473 0ustar ripleyusersv1.3.2 >>>> Feb 14, 2015 # (1) fixed a few bugs in function 'getSepProjData': # 'u.cl <- unique(cl)' should be # 'u.cl <- sort(unique(cl)' # 'yi <- y[cl == u.cl[i], , drop = FALSE]' # should be # 'yi <- y[which(cl == u.cl[i]), , drop = FALSE]' v1.3.1 >>>> Jan 7, 2013 # (1) added a space between 'Weiliang Qiu' and # '' in the 'Maintainer' slot # in the DESCRIPTION file # Thank Dr. Kurt Hornik for his kind help! # v1.3.0 >>>> Jan 6, 2013 # (1) rename 'log.txt' file to 'NEWS'. # Thanks for Mr. Suraj Gupta () for this suggestion! # v1.2.9 >>>> April 2, 2012 # (1) fixed a bug pointed by Dr. Anton Korobeynikov # # Dear Dr. Weiliang Qiu, # # Recently we tried to used your package clusterGeneration but found # that the behavior of genRandomClust() with clustszind == 3 is # definitely wrong compared to the one documented. # After looking into the implementation it became obvious that # genMemSize() function does wrong things: it tries to sample from 1:G # using provided clusterSizes as weights. Surely the output clusters # have wrong sizes (not the ones specified). # # The fix is pretty simple: change the code for clustszind == 3 to # something like this: # # mem <- sample(unlist(lapply(1:G, function(x) rep.int(x, times = # clustSizes[x])))) # N <- sum(clustSizes) # # Or, maybe if you want to keep the current behavior it'd be better to # introduce new clustszind variant. # # # # (2) add 'clustSizes<-as.integer(clustSizes)' before checking # '!is.integer(clustSizes[i])' # v1.2.8 >>>> March 19, 2012 (1) fixed a few warning messages: # (a)>> #* checking R code for possible problems ... NOTE #genNoisyMeanCov: warning in eigen(Sigma.noisy, sym = TRUE): partial # argument match of 'sym' to 'symmetric' # # (b)>>> #** running examples for arch 'i386' ... WARNING #Found the following significant warnings: # # Warning: sd() is deprecated. # Warning: sd() is deprecated. # Warning: sd() is deprecated. # Warning: sd() is deprecated. #Deprecated functions may be defunct as soon as of the next release of #R. #See ?Deprecated. #** running examples for arch 'x64' ... WARNING #Found the following significant warnings: # # Warning: sd() is deprecated. # Warning: sd() is deprecated. # Warning: sd() is deprecated. # Warning: sd() is deprecated. #Deprecated functions may be defunct as soon as of the next release of #R. #See ?Deprecated. # (2) add 'na.rm=TRUE' to functions 'min', 'max', 'sum', 'mean', 'median', etc. clusterGeneration/R/0000755000176000001440000000000012470015106014174 5ustar ripleyusersclusterGeneration/R/genCluster.R0000644000176000001440000027775011736333624016470 0ustar ripleyusers# v1.2.5 # (1) fixed a bug in function 'genMemSize'. The 'numClust' inside # the function 'genMemSize' should be 'G' # # Generating cluster data sets with specified degree of separation. # The separation between any cluster and its nearest neighboring cluster # can be set to a specified value. # The covariance matrices of clusters can have arbitrary diameters, # shapes and orientations. ################ # Cluster generating Algorithm. # (1) The covariance matrices of generated clusters can be arbitrary # positive definite matrices. # (2) The minimum separation index between a cluster and its nearest # neighboring cluster is equal to a user-specified value. # # References: # Joe, H. (2006) # Generating Random Correlation Matrices Based on Partial Correlations. # \emph{Journal of Multivariate Analysis}, \bold{97}, 2177--2189. # # Milligan G. W. (1985) # An Algorithm for Generating Artificial Test Clusters. # \emph{Psychometrika} \bold{50}, 123--127. # # Qiu, W.-L. and Joe, H. (2006a) # Generation of Random Clusters with Specified Degree of Separaion. # \emph{Journal of Classification}, \bold{23}(2), 315--334. # # Qiu, W.-L. and Joe, H. (2006b) # Separation Index and Partial Membership for Clustering. # \emph{Computational Statistics and Data Analysis}, \bold{50}, 585--603. # # Su, J. Q. and Liu, J. S. (1993) # Linear Combinations of Multiple Diagnostic Markers. # \emph{Journal of the American Statistical Association}, \bold{88}, 1350--1355 ################# ################# # The shifted vertex method is proposed by Dr. Harry Joe. The algorithm is: # # Let e1=(1,0,...,0). The first two vertices are -e1 and e1. # Let the numNonNoisy+1 vertices be labelled v(1),...,v(numNonNoisy+1). # numNonNoisy is the number of non-noisy variables. # v1=A* (-e1), v2=A*e1 # vi=sqrt(3)*A*e_{i-1}, i=2,...,numNonNoisy+1 # sqrt(3) make sure the triangle are equilateral # One way to get an arbitrary number G of clusters in numNonNoisy dimensions: # For G<=numNonNoisy+1, use vertices up to v(G). # For G>numNonNoisy+1, start adding vertices from the following sequence # after v(numNonNoisy+1). # # v(2)+2*A*e1, ..., v(numNonNoisy+1)+2*A*e1, # v(2)+4*A*e1, ..., v(numNonNoisy+1)+4*A*e1, # v(2)+6*A*e1, ..., v(numNonNoisy+1)+6*A*e1, # Essentially this just keeps on adding points on a shifted symmetric simplex. # # Algorithm: outline of the steps are: # #a) input G, numNonNoisy, sepVal, #noisy #b) generate the G vertices/centers in numNonNoisy dimensions #c) determine a multiplier A #d) generate G covariance matrices #e) generate a rotation matrix in dimension numNonNoisy #f) apply the rotation matrix to the G centers and covariance matrices #g) add noisy variables #h) apply random permutations to rows and columns of cluster data matrices ################# ################# # The degree of separation is based on the separation index. # To calibrate the concepts "well-separated", "separated", and "close", # we use two clusters from # two univariate normal distributions N(0, sigma1^2) and N(A, sigma2^2). # A=4 corresponds to "close" cluster structure; # A=6 corresponds to "separated" cluster structure; # A=8 corresponds to "well-separated" cluster structure; # The corresponding separation indices are: # 0.01011020, 0.2096862, 0.34229 ################# ################# # The arguments below are also inputs to other functions beside # the main function genRandomClust(). The documentation of the # the input arguments will not be repeated. # numClust -- the number of clusters. # numNonNoisy -- the number of non-noisy variables. # sepVal -- the minimum separation index specified as a priori. The default # value is 0.01 which is the value of the separation index for # two univariate clusters generated from N(0, 1) and N(0, A), # respectively, with A=4. # With A=4, sepVal=0.01 indicates a close cluster structure. # With A=6, sepVal=0.21 indicates a separate cluster structure. # With A=8, sepVal=0.34 indicates a well-separated cluster structure. # numReplicate -- the number of data sets to generate for each combination. # the default value is 3 as in the design in Milligan (1985) # (An algorithm for generating artificial test clusters, # Psychometrika, 50, 123-127) # numNoisy -- the number of noisy variables. The default values of 'numNoisy' # and 'numOutlier' are 0 so that we get 'clean' data sets. # numOutlier -- number/ratio of outliers. If numOutlier is a positive integer, # then numOutlier means the number of outliers. If numOutlier is a real # number in (0, 1), then numOutlier means the ratio of outliers, i.e. # the number of outliers is equal to round(numOutlier*N), # where N is the total number of non-outliers. # If numOutlier is a real number greater than 1, then numOutlier is rounded # to an integer. # The default values of 'numNoisy' and 'numOutlier' are 0 so that we # get 'clean' data sets. # fileName --- the fileNames of data sets will start with "fileName" and # followed by numbers, then followed by ".dat". # The log, membership, and noisy set files have the same format # except the file extension are ".log", ".mem", and ".noisy" # respectively. The default value is 'test'. # clustszind -- cluster size indicator. # clustszind=1 indicates that all cluster have equal size. # The size is specified by clustSizeEq. # clustszind=2 indicates that the cluster sizes are randomly # generated from the range rangeN. # clustszind=3 indicates that the cluster sizes are specified # via the vector clustSizes. # The default value is 2 so that the generated clusters are more realistic. # The default value is 2 so that the generated clusters are more realistic. # clustSizeEq -- if clustszind=1, then this is the constant cluster size # The default value 100 is a reasonable cluster size. # rangeN -- if clustszind=2, then the cluster sizes are randomly generaged # from the range 'rangeN'. The default range is [50, 200] which # can produce reasonable variability of cluster sizes. # clustSizes -- if clustszind=3, then the cluster sizes are specified via # clustSizes. An input is required with # the default value of 'clustSizes' set as 'NULL'. # covMethod -- methods to generate random covariance matrices. # 'eigen' method first generates the eigenvalues of the covariance matrix, # then generate eigenvectors to construct the covariance matrix. # 'unifcorrmat' method first generates a correlation # matrix via the method proposed by # Joe H (2006). Generating random correlation matrices based on # partial correlations. J. Mult. Anal. Vol. 97, 2177--2189 # Then, it generate variances from the range 'rangeVar' to # construct the covariance matrix. # 'onion' method # extension of onion method, using Cholesky instead of msqrt #Ghosh, S., Henderson, S. G. (2003). Behavior of the NORTA method for #correlated random vector generation as the dimension increases. #ACM Transactions on Modeling and Computer Simulation (TOMACS) #v 13 (3), 276-294. # # 'c-vine' method # # Random correlation with the C-vine (different order of partial # correlations). Reference for vines: Kurowicka and Cooke, 2006, # Uncertainty Analysis with High Dimensional Dependence Modelling, # Wiley, 2006. # The default method is 'eigen' so that the user can directly # specify the range of the 'diameters' of clusters. # rangeVar -- if 'covMethod="unifcorrmat"', then to generate a covariance # matrix, we first generate a correlation matrix, then randomly generate # variances from the range specified by 'rangeVar'. # The default range is [1, 10] which can generate reasonable # variability of variances. # lambdaLow -- if 'covMethod="eigen"', when generating a covariance matrix, # we first generate its eigenvalues. The eigenvalues are randomly # generated from the interval [lambdaLow, lambdaLow*ratioLambda]. # Note that lambdaLow should be positive. # In our experience, the range [lambdaLow=1, ratioLambda=10] # can give reasonable variability of the diameters of clusters. # ratioLambda -- if 'covMethod="eigen"', lambdaUpp=lambdaLow*ratioLambda, # is the upper bound of the eigenvalues of a covariance matrix # and lambdaLow is the lower bound. # In our experience, the range [lambdaLow=1, lambdaUpp=10] # can give reasonable variability of the diameters of clusters. # alphad -- parameter for c-vine and onion method to generate random correlation matrix # eta=1 for uniform. eta should be > 0 # eta -- parameter for c-vine and onion method to generate random correlation matrix # eta=1 for uniform. eta should be > 0 # rotateind-- if rotateind =TRUE, then rotate data so that we may not detect the # full cluster structure from scatterplots, otherwise do not rotate. # By default, 'rotateind=TRUE' to generate more realistic data sets. # iniProjDirMethod -- indicating the method to get initial projection direction # By default, the sample version of the Su and Liu (SL) projection # direction ((n1-1)*S1+(n2-1)*S2)^{-1}(mu2-mu1) is used, # where mui and Si are the mean vector and covariance # matrix of cluster i. Alternatively, the naive projection # direction (mu2-mu1) is used. # Su and Liu (1993) JASA 1993, vol. 88 1350-1355. # projDirMethod -- takes values "fixedpoint" or "newton" # "fixedpoint" means that we get optimal projection direction # by solving the equation d J(w) / d w = 0, where # J(w) is the separation index with projection direction 'w' # "newton" method means that we get optimal projection driection # by the method proposed in the appendix of Qiu and Joe (2006) # "Generation of random clusters with specified degree of separation", # Journal of Classification Vol 23(2), 315--334 # alpha -- tuning parameter for separation index to indicating the percentage # of data points to downweight. We set 'alpha=0.05' like we set # the significance level in hypothesis testing as 0.05. # ITMAX -- when calculating the projection direction, we need iterations. # ITMAX gives the maximum iteration allowed. # The actual #iterations is usually much less than the default value 20. # eps -- A small positive number to check if a quantitiy \eqn{q} is # equal to zero. If \eqn{|q|<}\code{eps}, then we regard \eqn{q} # as equal to zero. \code{eps} is used to check if an algorithm # converges. The default value is \eqn{1.0e-10}. # quiet -- a flag to switch on/off the outputs of intermediate results. # The default value is 'TRUE'. # outputEmpirical -- indicates if empirical projection directions # and separation indices should be output to files. # These information usually are useful to check the cluster # structures. Hence, by default, 'outputEmpirical=TRUE'. # outputInfo -- indicates if separation information dataframe should be output. # The file name has the format [fileName]_info.log ############# genRandomClust<-function(numClust, sepVal=0.01, numNonNoisy=2, numNoisy=0, numOutlier=0, numReplicate=3, fileName="test", clustszind=2, clustSizeEq=50, rangeN=c(50,200), clustSizes=NULL, covMethod=c("eigen", "onion", "c-vine", "unifcorrmat"), rangeVar=c(1, 10), lambdaLow=1, ratioLambda=10, alphad=1, eta=1, rotateind=TRUE, iniProjDirMethod=c("SL", "naive"), projDirMethod=c("newton", "fixedpoint"), alpha=0.05, ITMAX=20, eps=1.0e-10, quiet=TRUE, outputDatFlag=TRUE, outputLogFlag=TRUE, outputEmpirical=TRUE, outputInfo=TRUE) { iniProjDirMethod<-match.arg(arg=iniProjDirMethod, choices=c("SL", "naive")) projDirMethod<-match.arg(arg=projDirMethod, choices=c("newton", "fixedpoint")) covMethod<-match.arg(arg=covMethod, choices=c("eigen", "onion", "c-vine", "unifcorrmat")) numClust<-as.integer(numClust) # checks for valid inputs if(numClust<1 || !is.integer(numClust)) { stop("The number 'numClust' of clusters should be a positive integer!\n") } numNonNoisy<-as.integer(numNonNoisy) if(numNonNoisy<2 || !is.integer(numNonNoisy)) { stop("The number 'numNonNoisy' of non-noisy variables should be an integer greater than 1!\n") } if(sepVal<= -0.999 || sepVal >= 0.999) { stop("The desired separation index 'sepVal' should be in the range (-0.999, 0.999)!\n") } numReplicate<-as.integer(numReplicate) if(numReplicate<1 || !is.integer(numReplicate)) { stop("The number 'numReplicate' should be a positive integer!\n") } numNoisy<-as.integer(numNoisy) if(numNoisy<0 || !is.integer(numNoisy)) { stop("The number 'numNoisy' of noisy variables should be a non-negative integer!\n") } if(numOutlier<0) { stop("'numOutlier' should be non-negative!\n") } if(!is.element(clustszind, c(1,2,3))) { stop("Cluster size indicator 'clustszind' should be 1, 2, or 3!\n") } clustSizeEq<-as.integer(clustSizeEq) if(clustSizeEq<2 || !is.integer(clustSizeEq)) { stop("Cluster size 'clustSizeEq' should be an integer greater than 1!\n") } rangeN<-as.integer(rangeN) if(length(rangeN)!=2) { stop("The range 'rangeN' for cluster size should be a numeric vector of length 2!\n") } if(rangeN[1]>rangeN[2]) { stop("First element of 'rangeN' should be smaller than second!\n") } if(rangeN[1]<2 || !is.integer(rangeN[1])) { stop("The lower bound 'rangeN[1]' for the range of cluster sizes should be an integer greater than 1!\n") } if(!is.integer(rangeN[2])) { stop("The upper bound 'rangeN[2]' for the range of cluster size should be integer!\n") } if(clustszind==3) { len<-length(clustSizes) if(len!=numClust || is.null(clustSizes)) { stop("The number of elements in 'clustSizes' should be equal to the number 'numClust' of clusters when the value of 'clustszind' is equal to 3!\n") } clustSizes<-as.integer(clustSizes) for(i in 1:len) { if(clustSizes[i]<1 || !is.integer(clustSizes[i])) { stop(paste("The cluster size for the ", i, "-th cluster should be an integer greater than 1!\n", sep="")) } } } if(rangeVar[1]>rangeVar[2]) { stop("First element of 'rangeVar' should be smaller than second!\n") } if(rangeVar[1]<0) { stop("The lower bound 'rangeVar[1]' for the range of variances should be positive!\n") } if(lambdaLow<0) { stop("The lower bound 'lambdaLow' of eigenvalues of cluster covariance matrices should be greater than zero!\n") } if(ratioLambda<1) { stop("The ratio 'ratioLambda' of the upper bound of the eigenvalues to the lower bound of the eigenvalues of cluster covariance matrices should be greater than 1!\n") } alphad<-as.numeric(alphad) if(alphad<0) { stop("'alphad' should be positive!\n") } eta<-as.numeric(eta) if(eta<0) { stop("'eta' should be positive!\n") } if(!is.logical(rotateind)) { stop("The value of the rotation indicator 'rotateind' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } if(alpha<=0 || alpha>0.5) { stop("The tuning parameter 'alpha' should be in the range (0, 0.5]!\n") } ITMAX<-as.integer(ITMAX) if(ITMAX<=0 || !is.integer(ITMAX)) { stop("The maximum iteration number allowed 'ITMAX' should be a positive integer!\n") } if(eps<=0 || eps > 0.01) { stop("The convergence threshold 'eps' should be between (0, 0.01]!\n") } if(!is.logical(quiet)) { stop("The value of the quiet indicator 'quiet' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } if(!is.logical(outputDatFlag)) { stop("The value of the indicator 'outputDatFlag' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } if(!is.logical(outputLogFlag)) { stop("The value of the indicator 'outputLogFlag' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } if(!is.logical(outputEmpirical)) { stop("The value of the indicator 'outputEmpirical' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } if(!is.logical(outputInfo)) { stop("The value of the indicator 'outputInfo' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } # end of checks of valid inputs, algorithm begins if(!quiet) { cat(" *********** Begin generating ", numReplicate, " data sets *************\n") } # for each data set to be generated for(b in 1:numReplicate) { # get file name datafileName<-paste(fileName, "_", b, ".dat", sep="") if(!quiet) { cat(" *********** data set >> ", datafileName, " *************\n") cat(" *** Step 2.1: Generate membership and cluster sizes ***\n") } # total number of dimension p<-numNonNoisy+numNoisy # generate membership and cluster sizes tmpmem<-genMemSize(clustszind, numClust, clustSizeEq, rangeN, clustSizes, p, quiet) # mem is the set of the memberships of data points mem<-tmpmem$mem # size are the numbers of data points in clusters size<-tmpmem$size # N is the total number of data points N<-tmpmem$N if(!quiet) { cat(" *********** data set >> ", datafileName, " *************\n") cat(" *** Step 2.2: Generate covariance matrices ***\n") } # generate covariance matrices tmpms<-genMeanCov(numNonNoisy,numClust, rangeVar, sepVal, lambdaLow, ratioLambda, covMethod, alphad, eta, iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet) #generate mean and cov # 'thetaMat' is a 'numClust' by 'numNonNoisy' matrix # i-th row is the mean vector for the i-th cluster thetaMat<-tmpms$thetaMat # an array of covariance matrices; s[,,i] is the covariance matrix # for the i-th cluster s<-tmpms$s # The scalar 'A' is for the length of the simplex, # the vertexes of which allocate # cluster centers. The scalar 'A' is to adjust the distance between # two clusters so that the separation of the two clusters with # specified covariance matrices is equal to 'sepVal'. A<-tmpms$A # 'neighbors.mat' is a 'numClust' by 6 matrix. Each row corresponds to a cluster. # Columns # 1 -- cluster label # 2 -- cluster label of its nearest neighboring cluster # 3 -- separation index for the cluster and its nearest neighboring cluster # 4 -- cluster label of its farthest neighboring cluster # 5 -- separation index for the cluster and its farthest neighboring cluster # 6 -- median separation index for the cluster and its neighboring clusters neighbors.mat<-tmpms$neighbors.mat # 'egvaluesMat' is a 'numClust' by 'numNonNoisy' matrix. Each row correspond a cluster. # rows are the eigenvalues of covariance matrices of the cluster egvaluesMat<-tmpms$egvaluesMat # cluster fractions mypi<-size/sum(size, na.rm=TRUE) # Obtain the mean vector and covariance matrix for noisy variables tmp<-genNoisyMeanCov(numNoisy, mypi, numClust, numNonNoisy, thetaMat, s, covMethod, alphad, eta, rangeVar, eps, quiet) # 'thetaMat' is a 'numClust' by 'p' matrix # i-th row is the mean vector for the i-th cluster thetaMat<-tmp$thetaMat # an array of covariance matrices; s[,,i] is the covariance matrix # for the i-th cluster s<-tmp$s # number of variables = numNonNoisy+numNoisy p<-tmp$p # update mean vectors and covariance matrices of variables # e.g. rotating data and randomizing the order of variables tmp<-updateMeanCov(rotateind, thetaMat, s, numClust, p, numNoisy, quiet) # 'muMat' is a 'numClust' by 'p' matrix # i-th row is the mean vector for the i-th cluster muMat<-tmp$muMat # an array of covariance matrices; SigmaArray[,,i] is the covariance matrix # for the i-th cluster SigmaArray<-tmp$SigmaArray # set of noisy variables noisySet<-tmp$noisySet # Q is the rotation matrix (orthogonal matrix) Q<-tmp$Q; if(!quiet) { cat(" *********** data set >> ", datafileName, " *************\n") cat(" *** Step 2.3: Generate data set ***\n") } # generate data based on the mean vectors, covariance matrices # and cluster sizes. y<-genData(mem, N, numClust, p, muMat, SigmaArray, size, quiet) if(!quiet) { cat(" *********** data set >> ", datafileName, " *************\n") cat(" *** Step 2.4: Get separation indices and projection directions ***\n") } # get the theoretical separation index and projection direction tmpsep<-getSepProjTheory(muMat, SigmaArray, iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet) # 'sepValMat' is a 'numClust' by 'numClust' matrix # 'sepValMat[i,j]' is the separation index between clusters i and j sepValMat<-tmpsep$sepValMat # 'myprojDir' is a 'numClust' by 'numClust' by 'p' array # 'myprojDir[i,j,]' is the projection direction for clusters i and j myprojDir<-tmpsep$projDirArray if(outputEmpirical) { # get empirical separation indices and projection directions tmp<-getSepProjData(y, mem, iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet) # 'Jhat2' is a 'numClust' by 'numClust' matrix # 'Jhat2[i,j]' is the separation index between clusters i and j Jhat2<-tmp$sepValMat # 'empProjDir' is a 'numClust' by 'numClust' by 'p' array # 'empProjDir[i,j,]' is the projection direction for clusters i and j empProjDir<-tmp$projDirArray } else { empProjDir<-NULL Jhat2<-NULL } if(!quiet) { cat(" *********** data set >> ", datafileName, " *************\n") cat(" *** Step 2.5: Generating outliers ***\n") } # Generate outliers tmpout<-genOutliers(numOutlier, y) # number of outliers nOut<-tmpout$nOut # 'y.out' is a 'nOut' by 'p' matrix, where 'nOut' is the number of outliers y.out<-tmpout$y.out if(nOut>0) { # add outliers y<-rbind(y, y.out) # the memberships of outliers are zero mem<-c(mem, rep(0, nOut)) } if(!quiet) { cat(" *********** data set >> ", datafileName, " *************\n") cat(" *** Step 2.6: Output log information ***\n") } # output log information, e.g. mean vectors and covariance matrices, etc. if(outputLogFlag) { outputLog(b, fileName, alpha, sepVal, numClust, size, N, p, numNoisy, noisySet, nOut, thetaMat, s, muMat, SigmaArray, Q, sepValMat, Jhat2, myprojDir, empProjDir, egvaluesMat, quiet, outputEmpirical) } # output data set # output membership if(outputDatFlag) { memfileName<-paste(fileName, "_", b, ".mem", sep="") #membership write.table(mem,file=memfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) } # output noisy variables if(outputDatFlag) { noisyfileName<-paste(fileName, "_",b,".noisy",sep="") #noisy var write.table(t(noisySet),file=noisyfileName, quote=FALSE,row.names=FALSE, col.names=FALSE) } # record theoretical and sample separation indices infoMatTheory<-neighbors.mat if(outputEmpirical) { # for empirical values, we get the corresponding 'numClust' by '6' matrix infoMatData<-nearestNeighborSepVal(Jhat2) } else { infoMatData<-NULL } nn<-nrow(infoMatTheory) if(b>1) { infoFrameTheory<-rbind(infoFrameTheory, infoMatTheory) if(outputEmpirical) { infoFrameData<-rbind(infoFrameData, infoMatData) } else { infoFrameData<-NULL } fileNameVec<-c(fileNameVec, rep(paste(fileName, "_", b, sep=""), nn)) datList[[b]]<-outputData(b, fileName, y, p, outputDatFlag) memList[[b]]<-mem noisyList[[b]]<-noisySet } else { # the first data set infoFrameTheory<-infoMatTheory if(outputEmpirical) { infoFrameData<- infoMatData } else { infoFrameData<-NULL } fileNameVec<-rep(paste(fileName, "_", b, sep=""), nn) datList<-list(outputData(b, fileName, y, p, outputDatFlag)) memList<-list(mem) noisyList<-list(noisySet) } } infoFrameTheory<-data.frame(cluster=infoFrameTheory[,1], nearestClust=infoFrameTheory[,2], nearestSep=infoFrameTheory[,3], farthestClust=infoFrameTheory[,4], farthestSep=infoFrameTheory[,5], medianSep=infoFrameTheory[,6]) infoFrameTheory$fileName<-fileNameVec if(outputEmpirical) { infoFrameData<-data.frame(cluster=infoFrameData[,1], nearestClust=infoFrameData[,2], nearestSep=infoFrameData[,3], farthestClust=infoFrameData[,4], farthestSep=infoFrameData[,5], medianSep=infoFrameData[,6]) infoFrameData$fileName<-fileNameVec } else { infoFrameData<-NULL } tmpnames<-paste(fileName, "_", 1:numReplicate, sep="") names(datList)<-tmpnames names(memList)<-tmpnames names(noisyList)<-tmpnames if(outputInfo) { fileNameInfo<-paste(fileName, "_info.log", sep="") msg<-"Theoretical separation information data frame>>>>>>>>>>>\n" write.table(msg, append=FALSE, file=fileNameInfo, quote=FALSE, row.names=FALSE, col.names=FALSE) msg<-colnames(infoFrameTheory) write(msg, append=TRUE, file=fileNameInfo, ncolumns=7) tt<-infoFrameTheory tt[,1:6]<-round(tt[,1:6], 3) write.table(tt, file=fileNameInfo, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE, append=TRUE) msg<-"\nEmpirical separation information data frame>>>>>>>>>>>\n" write.table(msg, append=TRUE, file=fileNameInfo, quote=FALSE, row.names=FALSE, col.names=FALSE) msg<-colnames(infoFrameData) write(msg, append=TRUE, file=fileNameInfo, ncolumns=7) if(outputEmpirical) { tt<-infoFrameData tt[,1:6]<-round(tt[,1:6], 3) write.table(tt, file=fileNameInfo, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE, append=TRUE) } } if(!quiet) { cat(" *********** End of generating ", numReplicate, " data sets *************\n") } invisible(list(infoFrameTheory=infoFrameTheory, infoFrameData=infoFrameData, datList=datList, memList=memList, noisyList=noisyList)) } # Get numNonNoisy+1 vertexs of a simplex in numNonNoisy dimension # numNonNoisy is the non-noisy number of dimension genVertexes<-function(numNonNoisy) { numNonNoisy<-as.integer(numNonNoisy) if(numNonNoisy<=1 || !is.integer(numNonNoisy)) { stop("The number of non-noisy variables should be a positive integer that is greater than 1!\n") } mx<-numNonNoisy+1; # the number of vertices # The i-th row of the matrix "vert" is the coordinates of the i-th vertex vert<-matrix(0,nrow=mx, ncol=numNonNoisy) vert[1,1]<- -1. vert[2,1]<-1. for(p in 3:mx) { # Value of vert[p] to keep constant distance (of 2) between vertices # get centre of previous vertices, then add co-ordinate in pth dimension dd<-0.0 for(j in 1:(p-2)) { s<-sum(vert[1:(p-1),j], na.rm=TRUE) tem<-s/(p-1.) vert[p,j]<-tem if(j==1) tem<-1. dd<-dd+tem*tem } vert[p,p-1]<-sqrt(4.-dd) } return(vert) } # Get G vertices in numNonNoisy dimension, where G can be any integer > 0 genShiftedVertexes<-function(G, numNonNoisy) { G<-as.integer(G) if(G<1 || !is.integer(G)) { stop("The number of vertices 'G' should be a positive integer!\n") } numNonNoisy<-as.integer(numNonNoisy) if(numNonNoisy<=1 || !is.integer(numNonNoisy)) { stop("The number of non-noisy variables should be a positive integer that is greater than 1!\n") } # First get numNonNoisy+1 vertices in a numNonNoisy-dimensional space vertex<-genVertexes(numNonNoisy) if(G<=numNonNoisy+1) { return(vertex[1:G,]) } #For G>numNonNoisy+1, start adding vertices from the following sequence after v(numNonNoisy+1). #v(2)+2*e1, ..., v(numNonNoisy+1)+2*e1, #v(2)+4*e1, ..., v(numNonNoisy+1)+4*e1, #v(2)+6*e1, ..., v(numNonNoisy+1)+6*e1, #Essentially this just keeps on adding points on a shifted symmetric simplex. # e1 is the numNonNoisy x 1 vector whose elements are all zero except that e1[1]=1. vertex2<-matrix(0, nrow=G, ncol=numNonNoisy) vertex2[1:(numNonNoisy+1),]<-vertex e1<-rep(0,numNonNoisy) e1[1]<-1 tmpNumNonNoisy<-(G-numNonNoisy-1)/numNonNoisy nG<-floor(tmpNumNonNoisy) res<-(G-numNonNoisy-1)%%numNonNoisy m<-numNonNoisy+1 # m indicates the label of the current cluster # nG indicates how many shifted symmetric simplex we need if(nG>0) { for(j in 1:nG) { for(i in 1:numNonNoisy) { m<-m+1 vertex2[m,]<-vertex[i+1,]+2*j*e1 } } } # res is the number of remindar vertices. if(res>0) { for(i in 1:res) { m<-m+1 vertex2[m,]<-vertex[i+1,]+2*(nG+1)*e1 } } rownames(vertex2)<-paste("cluster", 1:G, sep="") colnames(vertex2)<-paste("variable", 1:numNonNoisy, sep="") return(vertex2) } # Generate mean vectors and covariance matrices for non-noisy variables # numNonNoisy -- number of non-noisy variables # G -- number of clusters # See documentation of genRandomClust for explanation of arguments: # rangeVar, sepVal, lambdaLow, ratioLambda, covMethod, iniProjDirMethod, # projDirMethod, alpha, ITMAX, eps, quiet genMeanCov<-function(numNonNoisy, G, rangeVar, sepVal, lambdaLow=1, ratioLambda=10, covMethod=c("eigen", "onion", "c-vine", "unifcorrmat"), alphad=1, eta=1, iniProjDirMethod=c("SL", "naive"), projDirMethod=c("newton", "fixedpoint"), alpha=0.05, ITMAX=20, eps=1.0e-10, quiet=TRUE) { covMethod<-match.arg(arg=covMethod, choices=c("eigen", "onion", "c-vine", "unifcorrmat")) iniProjDirMethod<-match.arg(arg=iniProjDirMethod, choices=c("SL", "naive")) projDirMethod<-match.arg(arg=projDirMethod, choices=c("newton", "fixedpoint")) numNonNoisy<-as.integer(numNonNoisy) # check for valid inputs if(numNonNoisy<2 || !is.integer(numNonNoisy)) { stop("The number 'numNonNoisy' of non-noisy variables should be a positive integer greater than 1!\n") } G<-as.integer(G) if(G<1 || !is.integer(G)) { stop("The number 'G' of clusters should be positive integer!\n") } if(rangeVar[1]>rangeVar[2]) { stop("First element of 'rangeVar' should be smaller than second!\n") } if(rangeVar[1]<0) { stop("The lower bound 'rangeVar[1]' for the range of variances should be positive!\n") } if(lambdaLow<0) { stop("The lower bound 'lambdaLow' of eigenvalues of cluster covariance matrices should be greater than zero!\n") } if(ratioLambda<1) { stop("The ratio 'ratioLambda' of the upper bound of the eigenvalues to the lower bound of the eigenvalues of cluster covariance matrices should be greater than 1!\n") } if(alpha<=0 || alpha>0.5) { stop("The tuning parameter 'alpha' should be in the range (0, 0.5]!\n") } if(sepVal<= -0.999 || sepVal >= 0.999) { stop("The desired separation index 'sepVal' should be in the range (-1, 1)!\n") } ITMAX<-as.integer(ITMAX) if(ITMAX<=0 || !is.integer(ITMAX)) { stop("The maximum iteration number allowed 'ITMAX' should be a positive integer!\n") } if(eps<=0 || eps > 0.01) { stop("The convergence threshold 'eps' should be between (0, 0.01]!\n") } if(!is.logical(quiet)) { stop("The value of the quiet indicator 'quiet' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } # end of checks of valid inputs s<-array(0, c(numNonNoisy,numNonNoisy,G)) dimnames(s)<-list(NULL, NULL, paste("cluster", 1:G, sep="")) egvaluesMat<-matrix(0, nrow=G, ncol=numNonNoisy) rownames(egvaluesMat)<-paste("cluster", 1:G, sep="") colnames(egvaluesMat)<-paste("variable", 1:numNonNoisy, sep="") if(G==1) # only one cluster { thetaMat<-matrix(0, nrow=1, ncol=numNonNoisy) rownames(thetaMat)<-"cluster1" colnames(thetaMat)<-paste("variable", 1:numNonNoisy, "\n") tmp<-genPositiveDefMat(numNonNoisy, covMethod, alphad, eta, rangeVar, lambdaLow, ratioLambda) s[,,1]<-tmp$Sigma egvaluesMat[1,]<-tmp$egvalue # The scalar 'A' is for the length of the simplex, # the vertexes of which allocate # cluster centers. The scalar 'A' is to adjust the distance between # two clusters so that the separation of the two clusters with # specified covariance matrices is equal to 'sepVal'. A<-0 neighbors.mat<-NULL if(!quiet) { cat("Warning: only one cluster in function 'genMeanCov'!\n 'neighbors.mat' is set to be 'NULL'\n") } } else { a<-rep(0,numNonNoisy) a[1]<-1 asa<-rep(0, G); # sqrt(a^T s a) for(i in 1:G) # generate covariance matrices { tmp<-genPositiveDefMat(numNonNoisy, covMethod, alphad, eta, rangeVar, lambdaLow, ratioLambda) s[,,i]<-tmp$Sigma asa[i]<-sqrt(as.vector(t(a)%*%s[,,i]%*%a)) # the sd of projected data egvaluesMat[i,]<-tmp$egvalues } # obtain the largest two standard deviations tmpsd<-sort(asa, decreasing=TRUE) s1<-sqrt(tmpsd[1]) s2<-sqrt(tmpsd[2]) if(!quiet) { cat(" *** Step 2.2: Get the suitable value of A.***\n") } # calculate the initial upper bound of A # The scalar 'A' is for the length of the edge of the simplex, # the vertexes of which allocate cluster centers. The scalar # 'A' is to adjust the distance between # two clusters so that the separation of the two clusters with # specified covariance matrices is equal to 'sepVal'. za<-qnorm(1-alpha/2) A2<-(1+sepVal)*za*(s1+s2)/(2*(1-sepVal)) A2<-A2+1 for(gg in 1:G) { eg<-eigen(s[,,gg])$values } tmp<-getA2(G, s, sepVal, A2, iniProjDirMethod, projDirMethod, alpha, ITMAX, eps) # get value of A if(!quiet) { cat(" *** Step 2.3: Generate mean vectors for clusters ***\n") } A<-tmp$minA thetaMat<-A*tmp$vertex # obtain the centers of the clusters tmp<-getSepProjTheory(thetaMat, s, iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet) sepValMat<-tmp$sepValMat #projDirArray<-tmp$projDirArray d<-as.matrix(dist(thetaMat)) neighbors.mat<-nearestNeighbor(d, A, sepValMat) if(!quiet) { # output intermediate results cat("true sepVal=", sepVal, "\n") cat("before scaling the covariance matrices, neighbors.mat>>\n"); cat("1st column has the labels of clusters\n"); cat("2nd column has the labels of its nearest neighboring cluster\n"); cat("3rd column has the separation indices of the clusters to their nearest neighbors\n") cat("4nd column has the labels of its farthest neighboring cluster\n") cat("5rd column has the separation indices of the clusters to their farthest neighbors\n") cat("6th column has the median separation indices of the clusters to their neighbors\n") print(neighbors.mat); } # Refine Covariance matrices so that the separation indices between # any cluster and its nearest neighboring cluster is equal to sepVal. tmp<-refineCov(thetaMat, s, A, sepVal, iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet) s<-tmp$s tmp<-getSepProjTheory(thetaMat, s, iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet) sepValMat<-tmp$sepValMat d<-as.matrix(dist(thetaMat)) neighbors.mat<-nearestNeighbor(d, A, sepValMat) if(!quiet) { # output intermediate results # recalcuate the separation index matrix cat("true sepVal=", sepVal, "\n") cat("before scaling the covariance matrices, neighbors.mat>>\n"); cat("1st column has the labels of clusters\n"); cat("2nd column has the labels of its nearest neighboring cluster\n"); cat("3rd column has the separation indices of the clusters to their nearest neighbors\n") cat("4nd column has the labels of its farthest neighboring cluster\n") cat("5rd column has the separation indices of the clusters to their farthest neighbors\n") cat("6th column has the median separation indices of the clusters to their neighbors\n") print(neighbors.mat); } } return(list(thetaMat=thetaMat, s=s, A=A, neighbors.mat=neighbors.mat, egvaluesMat=egvaluesMat)) } # Refine Covariance matrices so that the separation indices between # clusters and their nearest neighboring clusters are equal to sepVal. # thetaMat -- cluster center matrix, # thetaMat[i,] is the cluster center for the i-th cluster. # s -- array of covariance matrices. s[,,i] is the covariance matrix # for the i-th cluster # A -- scalar so that the minimum separation index among clusters is equal # to sepVal. # The scalar 'A' is for the length of the simplex, # the vertexes of which allocate cluster centers. # The scalar 'A' is used to adjust the distance between # two clusters so that the separation of the two clusters with # specified covariance matrices is equal to 'sepVal'. # sepVal -- the minimum separation index set as a priori # See documentation of genRandomClust for explanation of arguments: # iniProjDirMethod, projDirMethod, alpha, ITMAX, eps refineCov<-function(thetaMat, s, A, sepVal, iniProjDirMethod=c("SL", "naive"), projDirMethod=c("newton", "fixedpoint"), alpha=0.05, ITMAX=20, eps=1.0e-10, quiet=TRUE) { iniProjDirMethod<-match.arg(arg=iniProjDirMethod, choices=c("SL", "naive")) projDirMethod<-match.arg(arg=projDirMethod, choices=c("newton", "fixedpoint")) if(A<0) { stop("The scalar 'A' should be positive!\n") } if(sepVal<= -0.999 || sepVal >= 0.999) { stop("The desired separation index 'sepVal' should be in the range (-1, 1)!\n") } if(alpha<=0 || alpha>0.5) { stop("The tuning parameter 'alpha' should be in the range (0, 0.5]!\n") } ITMAX<-as.integer(ITMAX) if(ITMAX<=0 || !is.integer(ITMAX)) { stop("The maximum iteration number allowed 'ITMAX' should be a positive integer!\n") } if(eps<=0 || eps > 0.01) { stop("The convergence threshold 'eps' should be between (0, 0.01]!\n") } k.max.old<-0 myc<-1 while(1) { # We first find neighboring clusters for each cluster. Then we calculate # the separation index of each cluster to its nearest neighboring # cluster. Then we choose the cluster whose minimum separation index is # the largest. tmp<-neighborsMaxMin(thetaMat, s, A, iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet) k.maxmin<-tmp$k.maxmin neighbors.maxmin<-tmp$neighbors.maxmin if(k.maxmin==k.max.old) { break } k.max.old<-k.maxmin maxminJ<-tmp$maxminJ # If maxminJ is already equal to sepVal, then we do not need rescale # covariance matrices further. if(abs(maxminJ-sepVal)0.5) { stop("The tuning parameter 'alpha' should be in the range (0, 0.5]!\n") } ITMAX<-as.integer(ITMAX) if(ITMAX<=0 || !is.integer(ITMAX)) { stop("The maximum iteration number allowed 'ITMAX' should be a positive integer!\n") } d<-as.matrix(dist(thetaMat)) numClust<-nrow(thetaMat) p<-ncol(thetaMat) # calculate the separation index matrix tmp<-getSepProjTheory(thetaMat, s, iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet) sepValMat<-tmp$sepValMat # Find neighboring clusters of each cluster and find the cluster whose # minimum separation index with its neighboring clusters is the largest. tmp<-findNeighbors(d, A, sepValMat) k.maxmin<-tmp$k.maxmin; # the cluster which we are interested in maxminJ<-tmp$maxminJ; # the maximum minimum separation index neighbors<-tmp$neighbors # neighboring clusters of each cluster # find the neighboring clusters of the cluster k.maxmin, whose separation # indices with the cluster k.maxmin is equal to maxminJ tmpJ<-sepValMat[k.maxmin,] k2<-which(tmpJ==maxminJ) # To check if the neighboring clusters in the set k2 have larger minimum # separation indices with their neighboring clusters than maxminJ len<-length(k2) minJ<-rep(0,len) for(i in 1:len) { pos<-k2[i] tmpJ<-sepValMat[pos,] tmpJ[pos]<-1 minJ[i]<-min(tmpJ, na.rm=TRUE) } maxminJ2<-max(minJ, na.rm=TRUE) pos<-which(minJ==maxminJ2) pos<-k2[pos[1]] if(maxminJ0.5) { stop("The tuning parameter 'alpha' should be in the range (0, 0.5]!\n") } if(sepVal<= -0.999 || sepVal >= 0.999) { stop("The desired separation index 'sepVal' should be in the range (-0.999, 0.999)!\n") } ITMAX<-as.integer(ITMAX) if(ITMAX<=0 || !is.integer(ITMAX)) { stop("The maximum iteration number allowed 'ITMAX' should be a positive integer!\n") } myc<-2 numClust<-nrow(thetaMat) p<-ncol(thetaMat) while(1) { tmps<-s # scale the covariance matrix of the first cluster tmps[,,1]<-myc*s[,,1] # calculate the separation index matrix tmp<-getSepProjTheory(thetaMat, tmps, iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet) sepValMat<-tmp$sepValMat sepVal1<-sepValMat[1,] sepVal1[1]<-1 # find the minimum separation index of the first cluster minsepVal<-min(sepVal1, na.rm=TRUE) if(minsepVal0.5) { stop("The tuning parameter 'alpha' should be in the range (0, 0.5]!\n") } if(sepVal<= -0.999 || sepVal >= 0.999) { stop("The desired separation index 'sepVal' should be in the range (-0.999, 0.999)!\n") } ITMAX<-as.integer(ITMAX) if(ITMAX<=0 || !is.integer(ITMAX)) { stop("The maximum iteration number allowed 'ITMAX' should be a positive integer!\n") } tmps<-s tmps[,,1]<-myc*tmps[,,1] numClust<-nrow(thetaMat) p<-ncol(thetaMat) tmp<-getSepProjTheory(thetaMat, tmps, iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet) sepValMat<-tmp$sepValMat sepVal1<-sepValMat[1,] sepVal1[1]<-1 minsepVal<-min(sepVal1, na.rm=TRUE) res<-minsepVal-sepVal return(res) } # Find the neighboring cluster of the cluster k. # dk is set of the distances from cluster k to other clusters # dk[k,k]=0, i.e. the distance from cluster k to itself is zero. # A -- scalar so that the minimum separation index among clusters is equal # to sepVal myNeighbor<-function(d, k, A) { k<-as.integer(k) if(k<0 || !is.integer(k)) { stop("The cluster label 'k' should be positive integer!\n") } if(A<0) { stop("The scalar 'A' should be positive\n") } len<-length(d) if(len<1) { stop("The distance vector 'd' should contains at least 1 element!\n") } pos<-which(d<0) if(length(pos)>0) { stop("The element of the distance vector 'd' should be positive!\n") } dk<-as.vector(d[k,]) # In our design, the distances from a cluster to its neighboring clusters # are equal to A*2. tmp<-which(abs(dk-A*2)<1.0e-10) tmp<-sort(tmp) return(tmp) } # Find neighboring clusters of each cluster and find the cluster whose # minimum separation index with its neighboring clusters is the largest. # d -- distance matrix of numClust clusters # A -- scalar so that the minimum separation index among clusters is equal # to sepVal # sepValMat -- the separation index matrix findNeighbors<-function(d, A, sepValMat) { tmp<-as.vector(d) len<-length(tmp) if(len<1) { stop("The distance matrix 'd' should contains at least 1 element!\n") } pos<-which(tmp<0) if(length(pos)>0) { stop("The element of the distance matrix 'd' should be positive!\n") } if(A<0) { stop("The scalar 'A' should be positive\n") } tmp<-as.vector(sepValMat) len<-length(tmp) if(len<1) { stop("The separation index matrix 'sepValMat' should contains at least 1 element!\n") } pos<-which(tmp< -1 || tmp>1) if(length(pos)>0) { stop("The element of the separation index matrix 'sepValMat' should be between [-1, 1]!\n") } numClust<-nrow(sepValMat) # find the neighboring clusters of the first cluster neighbors<-list(myNeighbor(d, 1, A)) # the separation indices between 1st cluster and its neighboring clusters sepVal.neighbors<-list(sepValMat[1,neighbors[[1]]]) tmpJ<-as.vector(sepVal.neighbors[[1]]) maxminJ<-min(tmpJ, na.rm=TRUE) k.maxmin<-1 for(k in 2:numClust) { neighbors[[k]]<-myNeighbor(d, k, A) sepVal.neighbors[[k]]<-sepValMat[k,neighbors[[k]]] tmpJ<-as.vector(sepVal.neighbors[[k]]) tmpminJ<-min(tmpJ, na.rm=TRUE) if(tmpminJ>maxminJ) { k.maxmin<-k maxminJ<-tmpminJ } } res<-list(k.maxmin=k.maxmin, maxminJ=maxminJ, neighbors=neighbors, sepVal.neighbors=sepVal.neighbors) return(res) } # Separation information matrix containing # the nearest neighbor and farthest neighbor of each cluster # sepValMat -- the separation index matrix nearestNeighborSepVal<-function(sepValMat) { if(!is.matrix(sepValMat)) { stop("'sepValMat' should be a matrix!\n") } if(nrow(sepValMat) != ncol(sepValMat)) { stop("'sepValMat' should be a squared matrix!\n") } tmp<-as.vector(sepValMat) len<-length(tmp) if(len<1) { stop("The separation index matrix 'sepValMat' should contains at least 1 element!\n") } pos<-which(tmp< -1 || tmp>1) if(length(pos)>0) { stop("The element of the separation index matrix 'sepValMat' should be between [-1, 1]!\n") } numClust<-nrow(sepValMat) # 'neighbors.mat' is a 'numClust' by '6' matrix. # 1st column has the labels of clusters # 2nd column has the labels of its nearest neighboring cluster # 3rd column has the separation indices of the clusters to their nearest # neighbors # 4nd column has the labels of its farthest neighboring cluster # 5rd column has the separation indices of the clusters to their farthest # neighbors # 6th column has the median separation indices of the clusters to their # neighbors neighbors.mat<-matrix(0, nrow=numClust, ncol=6) rownames(neighbors.mat)<-1:numClust colnames(neighbors.mat)<-c("cluster", "neighbor_nearest", "sep_nearest", "neighbor_farthest", "sep_farthest", "sep_median") neighbors.mat[,1]<-1:numClust for(i in 1:numClust) { Jvec<-as.vector(sepValMat[i,]) pos.max<-which(Jvec==max(Jvec))[1] neighbors.mat[i,4]<-pos.max neighbors.mat[i,5]<-Jvec[pos.max] Jvec2<-Jvec[-i] minJ<-min(Jvec2, na.rm=TRUE) neighbors.mat[i,3]<-minJ pos.min<-which(Jvec==minJ)[1] neighbors.mat[i,2]<-pos.min medianJ<-median(Jvec2, na.rm=TRUE) neighbors.mat[i,6]<-medianJ } return(neighbors.mat) } # find the nearest neighbor and farthest neighbor of each cluster # d -- distance matrix of numClust clusters # A -- scalar so that the minimum separation index among clusters is equal # to sepVal # sepValMat -- the separation index matrix nearestNeighbor<-function(d, A, sepValMat) { # 1st column has the labels of clusters # 2nd column has the labels of its nearest neighboring cluster # 3rd column has the separation indices of the clusters to their nearest # neighbors # 4nd column has the labels of its farthest neighboring cluster # 5rd column has the separation indices of the clusters to their farthest # neighbors # 6th column has the median separation indices of the clusters to their # neighbors numClust<-nrow(sepValMat) neighbors.mat<-matrix(0, nrow=numClust, ncol=6) rownames(neighbors.mat)<-1:numClust colnames(neighbors.mat)<-c("cluster", "neighbor_nearest", "sep_nearest", "neighbor_farthest", "sep_farthest", "sep_median") neighbors.mat[,1]<-1:numClust # find neighbors of 1st cluster for(k in 1:numClust) { neighbork<-myNeighbor(d, k, A) tmpJ<-as.vector(sepValMat[k,neighbork]) maxJ<-max(tmpJ, na.rm=TRUE) pos.max<-neighbork[which(tmpJ==maxJ)] minJ<-min(tmpJ, na.rm=TRUE) pos.min<-neighbork[which(tmpJ==minJ)] medianJ<-median(tmpJ, na.rm=TRUE) neighbors.mat[k,2]<-pos.min[1] neighbors.mat[k,3]<-minJ neighbors.mat[k,4]<-pos.max[1] neighbors.mat[k,5]<-maxJ neighbors.mat[k,6]<-medianJ } return(neighbors.mat) } # Generate membership and cluster sizes # clustszind -- cluster size indicator. # clustszind=1 indicates that all cluster have equal size. # The size is specified by clustSizeEq. # clustszind=2 indicates that the cluster sizes are randomly # generated from the range rangeN. # clustszind=3 indicates that the cluster sizes are specified # via the vector clustSizes. # The default value is 2 so that the generated clusters are more realistic. # G -- the number of clusters # clustSizeEq -- if clustszind=1, then this is the constant cluster size # The default value 100 is a reasonable cluster size. # rangeN -- if clustszind=2, then the cluster sizes are randomly generaged # from the range 'rangeN'. The default range is [50, 200] which # can produce reasonable variability of cluster sizes. # clustSizes -- if clustszind=3, then the cluster sizes are specified via # clustSizes. An input is required with # the default value of 'clustSizes' set as 'NULL'. # p -- number of variables (non-noisy and noisy variables) # quiet -- a flag to switch on/off the outputs of intermediate results. # The default value is 'TRUE'. genMemSize<-function(clustszind, G, clustSizeEq, rangeN, clustSizes, p, quiet=TRUE) { if(!is.element(clustszind, c(1,2,3))) { stop("Cluster size indicator 'clustszind' should be 1, 2, or 3!\n") } clustSizeEq<-as.integer(clustSizeEq) if(clustSizeEq<2 || !is.integer(clustSizeEq)) { stop("Cluster size 'clustSizeEq' should be an integer greater than 1!\n") } if(length(rangeN)!=2) { stop("The range 'rangeN' for cluster size should be a numeric vector of length 2!\n") } if(rangeN[1]>rangeN[2]) { stop("First element of 'rangeN' should be smaller than second!\n") } rangeN<-as.integer(rangeN) if(rangeN[1]<2 || !is.integer(rangeN[1])) { stop("The lower bound 'rangeN[1]' for the range of cluster sizes should be an integer greater than 1!\n") } if(!is.integer(rangeN[2])) { stop("The upper bound 'rangeN[2]' for the range of cluster size should be integer!\n") } if(clustszind==3) { len<-length(clustSizes) if(len!=G || is.null(clustSizes)) { stop("The number of elements in 'clustSizes' should be equal the number 'G' of clusters when the value of 'clustszind' is equal to 3!\n") } clustSizes<-as.integer(clustSizes) for(i in 1:len) { if(clustSizes[i]<1 || !is.integer(clustSizes[i])) { stop(paste("The cluster size for the ", i, "-th cluster should be an integer greater than 1!\n", sep="")) } } } p<-as.integer(p) if(p<1 || !is.integer(p)) { stop("The number 'p' of variables should be positive integer!\n") } if(!is.logical(quiet)) { stop("The value of the quiet indicator 'quiet' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } if(!quiet) { cat(" *** Step 2.4: Generate cluster sizes and membership of each data point ***\n") } if(clustszind==1) { # N is the total sample size; mem is the set of the memberships of # data points. N<-G*clustSizeEq mem<-rep(1:G, rep(clustSizeEq, G)) mem<-sample(mem) # randomize the order of the membership } else if(clustszind==2) { n.low<-rangeN[1] n.upp<-rangeN[2] ratio<-n.upp/n.low if(n.low0) { stop("All elements of 'mypi' should be positive!\n") } tmp<-which(mypi>1) if(length(tmp)>0) { stop("All elements of 'mypi' should be less than 1!\n") } if(eps<=0 || eps > 0.01) { stop("The convergence threshold 'eps' should be in (0, 0.01]!\n") } if(abs(sum(mypi, na.rm=TRUE)-1.0)>eps) { stop("summation of 'mypi' should be equal to 1!\n") } G<-as.integer(G) if(G<1 || !is.integer(G)) { stop("The number of clusters should be positive integer!\n") } numNonNoisy<-as.integer(numNonNoisy) if(numNonNoisy<2 || !is.integer(numNonNoisy)) { stop("The number of non-noisy variables should be positive integer greater than 1!\n") } if(rangeVar[1]<0) { stop("The lower bound 'rangeVar[1]' for the range of variances should be positive!\n") } if(!is.logical(quiet)) { stop("The value of the quiet indicator 'quiet' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } if(numNoisy==0) { return(list(thetaMat=thetaMat, s=s, p=numNonNoisy)) } if(!quiet) { cat(" *** Step 2.5: Generate mean vector and covariance matrix for the noisy variables.***\n") } p2<-numNoisy # number of noisy variables # Obtain the mean vector and covariance matrix of the mixture of # distributions sum_{k=1}^{G}mypi[k]*f_k(X). tmp<-meanCovMixture(thetaMat, s, mypi) mu.noisy<-tmp$mu.mixture Sigma.noisy<-tmp$Sigma.mixture # Obtain the range of mu.noisy range.mu<-range(mu.noisy, na.rm=TRUE) # Generate the mean vector of noisy variables mu.noisy<-runif(n=p2, min=range.mu[1], max=range.mu[2]) # Obtain the range of the eigen values of Sigma.noisy egvalues<-eigen(Sigma.noisy, symmetric=TRUE)$values range.eg<-range(egvalues, na.rm=TRUE) # Generate the covariance matrix of noisy variables low<-range.eg[1]; upp<-range.eg[2]; # obtain covariance matrices tmp<-genPositiveDefMat(p2, covMethod, alphad, eta, rangeVar, low, upp/low) s.noisy<-tmp$Sigma # obtain covariance matrices p<-numNonNoisy+p2 # total number of variables # update the mean vectors and covariance matirces # The mean of noisy variables are zero mu.noisy2<-rep(mu.noisy, G) mu.mat<-matrix(mu.noisy2, nrow=p2, ncol=G) mu.mat<-t(mu.mat) tmpmu.mat<-matrix(0,nrow=G, ncol=p) tmpmu.mat[,1:numNonNoisy]<-thetaMat tmpmu.mat[,(numNonNoisy+1):p]<-mu.mat thetaMat<-tmpmu.mat tmps<-array(0, c(p,p,G)) dimnames(tmps)<-list(NULL, NULL, paste("cluster", 1:G, sep="")) for(i in 1:G) { tmps[1:numNonNoisy,1:numNonNoisy,i]<-s[,,i] tmps[(numNonNoisy+1):p, (numNonNoisy+1):p,i]<-s.noisy } return(list(thetaMat=thetaMat, s=tmps, p=p)) } # Obtain the mean vector and covariance matrix of the mixture of # distributions sum_{k=1}^{G}mypi[k]*f_k(X). # thetaMat -- cluster center matrix. thetaMat[i,] is the cluster center # for the i-th cluster # s -- array of covariance matrices. s[,,i] is the covariance matrix # for the i-th cluster # mypi -- the proportions of cluster sizes. sum(mypi)=1. meanCovMixture<-function(thetaMat, s, mypi) { tmp<-which(mypi<0) if(length(tmp)>0) { stop("All elements of 'mypi' should be positive!\n") } tmp<-which(mypi>1) if(length(tmp)>0) { stop("All elements of 'mypi' should be less than 1!\n") } G<-nrow(thetaMat) p<-ncol(thetaMat) mu.mixture<-as.vector(mypi[1]*thetaMat[1,]) Sigma.mixture<-mypi[1]*s[,,1] for(k in 2:G) { mu.mixture<-mu.mixture+as.vector(mypi[k]*thetaMat[k,]) Sigma.mixture<-Sigma.mixture+mypi[k]*s[,,k] } term2<-rep(0, p) for(k1 in 1:(G-1)) { muk1<-as.vector(thetaMat[k1,]) for(k2 in (k1+1):G) { muk2<-as.vector(thetaMat[k2,]) tmpk12<-as.vector(muk1-muk2) term2<-term2+mypi[k1]*mypi[k2]*tmpk12%*%t(tmpk12) } } Sigma.mixture<-Sigma.mixture+term2 return(list(mu.mixture=mu.mixture, Sigma.mixture=Sigma.mixture)) } # Update mean vectors and covariance matrices of non-noisy variables. # e.g. rotating data and randomizing the order of variables. # We should not rotate noisy variables in case that noisy variables are # changed to non-noisy via rotation. e.g. a bivariate data set, the second # variable is noisy variable and the first variable is non-noisy variable. # If we rotate data 90 degree, then the second variable is non-noisy while # the first variable is noisy. # We rotate data by the transformation Y=QX. So mu_Y=Q*mu_X, # Sigma_Y=Q*Sigma_X*Q^T. # # rotateind-- if rotateind =TRUE, then rotate data so that we may not detect the # full cluster structure from scatter plots, otherwise not rotate. # By default, 'rotateind=TRUE' to generate more realistic data sets. # thetaMat -- cluster center matrix. thetaMat[i,] is the cluster center # for the i-th cluster # s -- array of covariance matrices. s[,,i] is the covariance matrix # for the i-th cluster # G -- the number of clusters # p -- the number of variables (both non-noisy and noisy variables) # numNoisy -- the number of noisy variables # quiet -- a flag to switch on/off the outputs of intermediate results. # The default value is 'TRUE'. updateMeanCov<-function(rotateind, thetaMat, s, G, p, numNoisy, quiet=TRUE) { if(!is.logical(rotateind)) { stop("The value of the rotation indicator 'rotateind' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } G<-as.integer(G) if(G<1 || !is.integer(G)) { stop("The number 'G' of clusters should be positive integer!\n") } p<-as.integer(p) if(p<1 || !is.integer(p)) { stop("The number 'p' of variables should be positive integer!\n") } numNoisy<-as.integer(numNoisy) if(numNoisy<0 || !is.integer(numNoisy)) { stop("The number 'numNoisy' of noisy variables should be a non-negative integer!\n") } if(!is.logical(quiet)) { stop("The value of the quiet indicator 'quiet' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } if(!quiet) { cat(" *** Step 2.6: Generate the numNonNoisy x numNonNoisy orthogonal matrix Q ***\n") } numNonNoisy<-p-numNoisy # the number of non-noisy variables if(rotateind) { Q<-genOrthogonal(numNonNoisy) } else { Q<-diag(numNonNoisy) } if(!quiet) { cat(" *** Step 2.7: Obtain the rotated mean vectors and covariance matrices. The noisy variables will not be rotated. ***\n") } muMat<-as.matrix(thetaMat) muMat[,1:numNonNoisy]<-muMat[,1:numNonNoisy]%*%t(Q) SigmaArray<-s for(i in 1:G) { SigmaArray[1:numNonNoisy,1:numNonNoisy,i]<-Q %*% s[1:numNonNoisy,1:numNonNoisy,i]%*%t(Q) } if(!quiet) { cat(" *** Step 2.8: Randomize the order of variables (including noisy variables)***\n") } # randomize columns myset1<-1:p myset2<-sample(myset1, replace=FALSE) if(numNoisy>0) { mynoisy<-which(myset2>numNonNoisy) } else { mynoisy<-0 } muMat<-muMat[,myset2] for(i in 1:G) { SigmaArray[,,i]<-SigmaArray[myset2, myset2, i] } return(list(muMat=muMat, SigmaArray=SigmaArray, noisySet=mynoisy, Q=Q)) } # Generate Data # mem -- memberships of data points # N -- the total number of data points # G -- the number of clusters # p -- the total number of variables # muMat -- cluster center matrix. thetaMat[i,] is the cluster center # for the i-th cluster # SigmaArray -- array of covariance matrices. s[,,i] is the covariance matrix # for the i-th cluster # size -- cluster sizes # quiet -- indicates if the intermediate results should be output. genData<-function(mem, N, G, p, muMat, SigmaArray, size, quiet=TRUE) { if(length(unique(mem))!=G) { stop("The number of clusters obtained from the membership vector 'mem' is not equal to the specified number 'G' of clusters!\n") } G<-as.integer(G) if(G<1 || !is.integer(G)) { stop("The number 'G' of clusters should be positive integer!\n") } p<-as.integer(p) if(p<1 || !is.integer(p)) { stop("The number 'p' of variables should be positive integer!\n") } if(length(size)!=G) { stop("The length of 'size' is not equal to the number of clusters!\n") } tmp<-which(size<1) if(length(tmp)) { stop("cluster sizes should be positive!\n") } size<-as.integer(size) for(i in 1:length(size)) { if(!is.integer(size[i])) { stop("cluster sizes should be integer!\n") } } if(!is.logical(quiet)) { stop("The value of the quiet indicator 'quiet' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } if(!quiet) { cat(" *** Step 2.9: Generate data set according to the mean vectors and covariance matrices, membership, and cluster sizes.***\n") } y<-matrix(0, nrow=N, ncol=p) for(i in 1:G) { y[mem==i,]<-mvrnorm(size[i], mu=muMat[i,], Sigma=SigmaArray[,,i]) } return(y) } # Generate outliers. Outliers will be generated from a uniform distribution. # numOutlier -- number/ratio of outliers. If numOutlier is a positive integer, # then numOutlier means the number of outliers. If numOutlier is a real # number between (0, 1), then numOutlier means the ratio of outliers, i.e. # the number of outliers is equal to round(numOutlier*N), where N is the # total number of non-outliers. If numOutlier is a real number greater than # 1, then we round numOutlier to an integer. # y -- Nxp data matrix genOutliers<-function(numOutlier, y) { if(numOutlier<0) { stop("Number of outliers should be positive!\n") } N<-nrow(y) p<-ncol(y) # obtain the number of outliers nOut<-outlierSize(numOutlier, N) if(nOut>0) { # Find the cooridinate-wise mean and sd of the data matrix y mu<-apply(y, 2, mean, na.rm=TRUE) s<-apply(y, 2, sd, na.rm=TRUE) # Calculate the range of the uniform distribution in each dimension. # [mu[j]-5*s[j], mu[j]+5*s[j]], j=1,...,p. y.out<-matrix(0,nrow=nOut, ncol=p) for(j in 1:p) { low<-mu[j]-4*s[j] upp<-mu[j]+4*s[j] y.out[,j]<-runif(nOut, min=low, max=upp) } } else { y.out<-NULL } return(list(y.out=y.out, nOut=nOut)) } # Calculate the number of outliers # numOutlier -- number/ratio of outliers. If numOutlier is a positive integer, # then numOutlier means the number of outliers. If numOutlier is a real # number between (0, 1), then numOutlier means the ratio of outliers, i.e. # the number of outliers is equal to round(numOutlier*N), where N is the # total number of non-outliers. If numOutlier is a real number greater than # 1, then we round numOutlier to an integer. # N -- the total number of non-outliers outlierSize<-function(numOutlier, N) { if(numOutlier<0) { stop("The argument 'numOutlier' should be non-negative!\n") } N<-as.integer(N) if(N<0 || !is.integer(N)) { stop("The number 'N' of non-outliers should be positive integer!\n") } if(numOutlier>=1) { nOut=floor(numOutlier) } else if(numOutlier>0) { nOut<-round(numOutlier*N) } else { nOut<-0 } return(nOut) } # Output the number of data points, the number of non-noisy variables, the # number of noisy variables, the labels of noisy variables, the mean # vectors, covariance matrices, theoretical and empirical separation indices # and projection directions, asymptotic confidence lower bounds, rotation # matrix, etc. # b -- indicates the file is the b-th replicate # fileName -- the first part of the log file names # alpha -- the tuning parameter in the separation index # sepVal -- the minimum separation index set as a priori # G -- the number of cluster # size -- cluster sizes # N -- the total number of clusters # p -- the total number of variables # numNoisy -- the number of noisy variables # noisySet -- the labels of noisy variables # nOut -- the number of outliers # thetaMat -- the mean vectors before rotation and randomizing orders # s -- the array of covariance matrices before rotation and randomizing columns # muMat -- the mean vectors after rotation and randomizing columns # SigmaArray -- the array of covariance matrices after rotation and randomizing # columns # Q -- the rotation matrix # sepValMat -- the theoretical separation index matrix # Jhat2 -- the empirical separation index matrix # myprojDir -- the theoretical projection directions # empProjDir -- the empirical projection directions # egvaluesMat -- 'numClust' by 'p' eigenvalue matrix. Each row contains the # 'p' eigenvalues of a cluster, where 'numClust' is the number # of clusters, 'p' is the number of variables # quiet -- a flag to switch on/off the outputs of intermediate results. # The default value is 'TRUE'. # outputEmpirical -- indicates if empirical projection directions # and separation indices should be output to files. # These inforamtion usually are useful to check the cluster # structures. Hence, by default, 'outputEmpirical=TRUE'. outputLog<-function(b, fileName, alpha, sepVal, G, size, N, p, numNoisy, noisySet, nOut, thetaMat, s, muMat, SigmaArray, Q, sepValMat, Jhat2, myprojDir, empProjDir, egvaluesMat, quiet=TRUE, outputEmpirical=TRUE) { # output log information, e.g. mean vectors and covariance matrices. if(!quiet) { cat(" *** Step 2.11: Output results ***\n") } datafileName<-paste(fileName, "_", b, ".dat", sep="") logfileName<-paste(fileName, "_", b, ".log", sep="") tem<-paste("\n Log information for the data set ", datafileName, " >>", sep="") write.table(tem,append=FALSE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) # output parameters outputLogPar(logfileName,G,size,N,p,numNoisy,noisySet,nOut,alpha, sepVal, egvaluesMat) # output Mean vectors and covariance matrices before rotation outputLogMeanCov(logfileName, G, thetaMat, s, flag=1) # output rotation matrix outputLogQ(logfileName, Q) # output Mean vectors and covariance matrices after rotation outputLogMeanCov(logfileName, G, muMat, SigmaArray, flag=2) # output theoretical Separation index matrix and projection directions outputLogSepProj(logfileName, G, sepValMat, myprojDir) if(outputEmpirical) { # output empirical Separation index matrix and projection directions outputLogSepProjData(logfileName,G,alpha,Jhat2,empProjDir) } tem<-paste("\n *********** end file ", datafileName, " **********\n", sep="") } # output parameters # logfileName -- file name of the log file # G -- the number of cluster # size -- cluster sizes # N -- the total number of clusters # p -- the total number of variables # numNoisy -- the number of noisy variables # noisySet -- the labels of noisy variables # nOut -- the number of outliers # alpha -- the tuning parameter in the separation index # sepVal -- the minimum separation index which is set a priori # egvaluesMat -- 'numClust' by 'p' eigenvalue matrix. Each row contains the # 'p' eigenvalues of a cluster, where 'numClust' is the number # of clusters, 'p' is the number of variables outputLogPar<-function(logfileName,G,size,N,p,numNoisy,noisySet,nOut,alpha, sepVal, egvaluesMat) { if(!is.numeric(alpha)) { tt1<-alpha } else { tt1<-round(alpha, 3) } if(!is.numeric(sepVal)) { tt2<-sepVal } else { tt2<-round(sepVal, 3) } tem<-paste("\n alpha = ", tt1, " sepVal = ", tt2, sep="") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) tem<-paste("\n Number of clusters >> ", G, sep="") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) tem<-paste("\n cluster sizes >>") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) write.table(t(size),append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) tem<-paste("\n Number of data points >> ", N, sep="") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) tem<-paste("\n Number of dimensions >> ", p, sep="") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) tem<-paste("\n Number of noisy dimensions >> ", numNoisy, sep="") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) tem<-paste("\n noisy variables >>") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) write.table(t(noisySet),append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) tem<-paste("\n Number of outliers >> ", nOut, sep="") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) tem<-paste("\n initial eigenvalues for each cluster >> \n (rows correspond to clusters)", sep="") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) if(!is.numeric(egvaluesMat)) { tt<-egvaluesMat } else { tt<-round(egvaluesMat, 3) } write.table(tt,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) } # output Mean vectors and covariance matrices before/after rotation # logfileName -- name of the logfile # G -- the number of clusters # thetaMat -- the cluster center matrix # s -- the array of covariance matrices # flag -- flag =1 means before rotation and randomization # flag =2 means after rotation and randomization outputLogMeanCov<-function(logfileName, G, thetaMat, s, flag=1) { p<-nrow(s[,,1]) egvaluesMat<-matrix(0,nrow=G, ncol=p) rownames(egvaluesMat)<-paste("cluster", 1:G, sep="") colnames(egvaluesMat)<-paste("variable", 1:p, sep="") for(i in 1:G) { if(flag==1) # before rotation { tem<-paste("\n Mean vectors and covariance matrices before rotation --- ", i, "th cluster >>", sep="") } else { tem<-paste("\n Mean vectors and covariance matrices after rotation --- ", i, "th cluster >>", sep="") } write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) tem<-paste("\n mean vector (mu'=Q*mu)>>") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) if(!is.numeric(thetaMat[i,])) { tt<-thetaMat[i,] write.table(tt,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) } else { tt<-round(thetaMat[i,],3) write.table(t(tt),append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) } tem<-paste("\n covariance matrix (Sigma'=Q*Sigma*t(Q))>>\n") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) if(!is.numeric(s[,,i])) { tt<-s[,,i] } else { tt<-round(s[,,i], 3) } write.table(tt,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) egvaluesMat[i,]<-eigen(s[,,i])$values } tem<-paste("\n Final eigenvalues for each cluster >> \n (rows correspond to clusters)", sep="") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) if(!is.numeric(egvaluesMat)) { tt<-egvaluesMat } else { tt<-round(egvaluesMat, 3) } write.table(tt,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) } # output rotation matrix # logfileName -- name of the logfile # Q -- the rotation matrix outputLogQ<-function(logfileName, Q) { tem<-paste("\n *************************************************\n", sep="") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) tem<-paste(" ********************* rotation ******************\n", sep="") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) tem<-paste(" *************************************************\n", sep="") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) tem<-paste("\n The orthogonal matrix Q >>") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) if(!is.numeric(Q)) { tt<-Q } else { tt<-round(Q, 3) } write.table(tt,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) } # output Separation index matrix and projection directions # logfileName -- the name of the logfile # G -- the number of clusters # sepValMat -- the theoretical separation index matrix # myprojDir -- the theoretical projection direction outputLogSepProj<-function(logfileName, G, sepValMat, myprojDir) { tem<-paste("\n theoretical Separation index matrix >>\n") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) if(!is.numeric(sepValMat)) { tt<-sepValMat } else { tt<-round(sepValMat, 3) } write.table(tt,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) tem<-paste("\n theoretical projection directions >>") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) for(i in 1:(G-1)) { for(j in (i+1):G) { tem<-paste("\n**** cluster ",i," and ", j,sep=""); write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) if(!is.numeric(as.vector(myprojDir[i,j,]))) { tt<-as.vector(myprojDir[i,j,]) write.table(tt,append=TRUE, file=logfileName, quote=FALSE, row.names=FALSE,col.names=FALSE) } else { tt<-round(as.vector(myprojDir[i,j,]),3) write.table(t(tt),append=TRUE, file=logfileName, quote=FALSE, row.names=FALSE,col.names=FALSE) } } } } # output empirical Separation index matrix and projection directions # logfileName -- the name of the logfile # G -- the number of clusters # alpha -- the tuning parameter in the separation index # Jhat2 -- the empirical separation index matrix # empProjDir -- the empirical projection directions outputLogSepProjData<-function(logfileName,G,alpha, Jhat2,empProjDir) { tem<-paste("\n empirical Separation index matrix (alpha=",alpha,")>>\n") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) if(!is.numeric(Jhat2)) { tt<-Jhat2 } else { tt<-round(Jhat2, 3) } write.table(tt,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) tem<-paste("\n empirical projection directions >>") write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) for(i in 1:(G-1)) { for(j in (i+1):G) { tem<-paste("\n**** cluster ",i," and ", j,sep=""); write.table(tem,append=TRUE, file=logfileName,quote=FALSE, row.names=FALSE,col.names=FALSE) if(!is.numeric(as.vector(empProjDir[i,j,]))) { tt<-as.vector(empProjDir[i,j,]) write.table(tt,append=TRUE, file=logfileName, quote=FALSE, row.names=FALSE,col.names=FALSE) } else { tt<-round(as.vector(empProjDir[i,j,]), 3) write.table(t(tt),append=TRUE, file=logfileName, quote=FALSE, row.names=FALSE,col.names=FALSE) } } } } # output data set # b -- indicates the data set is the b-th replicate # fileName -- the first part of the name of the data set # y -- Nxp data matrix # p -- the total number of variables outputData<-function(b, fileName, y, p, outputDatFlag=TRUE) { tem<-paste("x", 1:p, sep="") if(outputDatFlag) { datafileName<-paste(fileName, "_", b, ".dat", sep="") write.table(y,file=datafileName,quote=FALSE,row.names=FALSE,col.names=tem) } colnames(y)<-tem return(y) } # Generate a Positive Definite Matrix # covMethod -- methods to generate random covariance matrices. # 'eigen' method first generates the eigenvalues of the covariance matrix, # then generate eigenvectors to construct the covariance matrix. # 'unifcorrmat' method first generates a correlation # matrix via the method proposed by # Joe H (2006). Generating random correlation matrices based on # partial correlations. J. Mult. Anal. Vol. 97, 2177--2189 # Then, it generate variances from the range 'rangeVar' to # construct the covariance matrix. # There is no default method. # rangeVar -- if 'covMethod="unifcorrmat"', then to generate a covariance # matrix, we first generate a correlation matrix, then randomly generate # variances from the range specified by 'rangeVar'. # The default range is [1, 10] which can generate reasonable # variability of variances. # lambdaLow -- if 'covMethod="eigen"', when generating a covariance matrix, # we first generate its eigenvalues. The eigenvalues are randomly # generated from the interval [lambdaLow, lambdaLow*ratioLambda]. # Note that lambdaLow should be positive. # In our experience, the range [lambdaLow=1, ratioLambda=10] # can give reasonable variability of the diameters of clusters. # ratioLambda -- if 'covMethod="eigen"', lambdaUpp=lambdaLow*ratioLambda, # is the upper bound of the eigenvalues of a covariance matrix # and lambdaLow is the lower bound. # In our experience, the range [lambdaLow=1, lambdaUpp=10] # can give reasonable variability of the diameters of clusters. # dim -- the dimension of this positive definite matrix genPositiveDefMat<-function(dim, covMethod=c("eigen", "onion", "c-vine", "unifcorrmat"), alphad=1, eta=1, rangeVar=c(1,10), lambdaLow=1, ratioLambda=10) { covMethod<-match.arg(arg=covMethod, choices=c("eigen", "onion", "c-vine", "unifcorrmat")) if(rangeVar[1]>rangeVar[2]) { stop("First element of 'rangeVar' should be smaller than second!\n") } if(rangeVar[1]<0) { stop("The lower bound 'rangeVar[1]' for the range of variances should be positive!\n") } if(lambdaLow<0) { stop("The lower bound 'lambdaLow' of eigenvalues of cluster covariance matrices should be greater than zero!\n") } if(ratioLambda<1) { stop("The ratio 'ratioLambda' of the upper bound of the eigenvalues to the lower bound of the eigenvalues of cluster covariance matrices should be greater than 1!\n") } dim<-as.integer(dim) if(dim<1 || !is.integer(dim)) { stop("The dimension 'dim' should be an integer greater than 1!\n") } if(covMethod=="eigen") { low<-lambdaLow upp<-lambdaLow*ratioLambda u<-matrix(0, dim,dim) # u is an diagonal matrix egvalues<-runif(dim,min=low,max=upp) diag(u)<-egvalues #the diagonal elements of u are positive Sigma<-u if(dim>1) { Q<-genOrthogonal(dim) # generate an orthogonal matrix Sigma<-Q%*%u%*%t(Q) # the final positive definite matrix } } else if (covMethod=="onion") { # use 'onion' method for random correlation matrix # and uniform distributions for random variances # Reference: # Joe H (2006). Generating random correlation matrices based on partial # correlations. J. Mult. Anal. Vol. 97, 2177--2189 rr<-rcoronion(dim,eta) sigma2<-runif(dim, min=rangeVar[1], max=rangeVar[2]) if(dim>1) { dd<-diag(sqrt(sigma2)) } else { dd<-sqrt(sigma2) } Sigma<-dd%*%rr%*%dd egvalues<-eigen(Sigma)$values } else if (covMethod=="c-vine"){ # use 'c-vine' method for random correlation matrix # and uniform distributions for random variances # Reference: # Joe H (2006). Generating random correlation matrices based on partial # correlations. J. Mult. Anal. Vol. 97, 2177--2189 rr<-rcorcvine(dim, eta) sigma2<-runif(dim, min=rangeVar[1], max=rangeVar[2]) if(dim>1) { dd<-diag(sqrt(sigma2)) } else { dd<-sqrt(sigma2) } Sigma<-dd%*%rr%*%dd egvalues<-eigen(Sigma)$values } else { # use hjrancor.R for random correlation matrix # and uniform distributions for random variances # Reference: # Joe H (2006). Generating random correlation matrices based on partial # correlations. J. Mult. Anal. Vol. 97, 2177--2189 rr<-rcorrmatrix(dim, alphad=1) sigma2<-runif(dim, min=rangeVar[1], max=rangeVar[2]) if(dim>1) { dd<-diag(sqrt(sigma2)) } else { dd<-sqrt(sigma2) } Sigma<-dd%*%rr%*%dd egvalues<-eigen(Sigma)$values } return(list(egvalues=egvalues, Sigma=Sigma)) } # generate orthogonal matrix # dim -- dimension genOrthogonal<-function(dim) { Q<-MOrthogonal(runif(dim)) return(Q) } # Construct an orthogonal matrix whose first few columns are standardized 'M' # where columns of 'M' are orthogonal. # Here "standardized 'M'" means each its columns has length 1. MOrthogonal<-function(M) { # can set the parameter "tol" of "qr" to decide how small value should be 0 tmp<-qr(M) Q<-qr.Q(tmp,complete=TRUE) if(is.vector(M)) { if(Q[1]*M[1]<0) Q<- -Q } else { if(Q[1,1]*M[1,1]<0) Q<- - Q } return(Q) } # Get a scalar 'A' such that the minimum separation index between clusters # and their nearest neighboring clusters is equal to sepVal. # The mean vector of the cluster 1 is (0,...,0). Other mean vectors # have the form (0,...,0,A), (0,...,A,0),...,(A,0,...,0). # We want make sure the separation # index between the cluster i and other clusters is at least sepVal, # i=1,...,G, where G is the number of clusters. # G -- the number of clusters # sArray -- the array contains the covariance matrices of the clusters # sepVal -- the minimum separation index which is set as a priori # A2 -- the upper bound of A. # See documentation of genRandomClust for explanation of arguments: # iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet getA2<-function(G, sArray, sepVal, A2, iniProjDirMethod=c("SL","naive"), projDirMethod=c("newton", "fixedpoint"), alpha=0.05, ITMAX=20, eps=1.0e-10, quiet=TRUE) { iniProjDirMethod<-match.arg(arg=iniProjDirMethod, choices=c("SL", "naive")) projDirMethod<-match.arg(arg=projDirMethod, choices=c("newton", "fixedpoint")) G<-as.integer(G) if(G<1 || !is.integer(G)) { stop("The number 'G' of clusters should be positive integer!\n") } if(sepVal<= -0.999 || sepVal>=0.999) { stop("The separation index 'sepVal' should be in (-0.999, 0.999)!\n") } if(A2<0) { stop("The initial upper bound 'A2' for the vertex-edge scalar 'A' should be positive!\n") } if(alpha<=0 || alpha>0.5) { stop("'alpha' should be between (0, 0.5]!\n") } ITMAX<-as.integer(ITMAX) if(ITMAX<1 || !is.integer(ITMAX)) { stop("The maximum iteration allowed 'ITMAX' should be positive integer!\n") } if(eps<=0 || eps > 0.01) { stop("The convergence threshold 'eps' should be in (0, 0.01]!\n") } if(!is.logical(quiet)) { stop("The value of the quiet indicator 'quiet' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } # minimum scalar allowed so that the separation between a cluster and # its direct neighboring clusters is at least sepVal minA<- -3 numNonNoisy<-ncol(sArray[,,1]) # get the vertices of the simplex in numNonNoisy dimensional space # edge lengths of the simplex are all equal to 2 vertex<-genShiftedVertexes(G, numNonNoisy) d<-as.matrix(dist(vertex)) myS<-1:G while(length(myS)>1) { i<-myS[1] myS<-myS[-1] myD<-which(abs(d[i, myS]-2)= sepVal for(j in myD) { sj<-sArray[,,j] vertexj<-vertex[j,]; # get interval [A1, A2] so that J^*_{ij}(A1)<0 and J^*_{ij}(A2)>0 tmp<-getAInterval(vertexi,vertexj,si,sj,A2,sepVal, iniProjDirMethod, projDirMethod, alpha,ITMAX, eps, quiet) A1<-tmp$A1 A2<-tmp$A2 # find the value of A so that the separation index J^*_{ij}=sepVal. tmpA<-getA(A1, A2, vertexi, vertexj, si, sj, sepVal, iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet) if(minA0 # vertexi -- vertex 1 # vertexj -- vertex 2 # si -- covariance matrix for vertex 1 # sj -- covariance matrix for vertex 2 # A2 -- the upper bound of A. # sepVal -- the desired separation index between cluster i and cluster j # See documentation of genRandomClust for explanation of arguments: # iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet getAInterval<-function(vertexi, vertexj, si, sj, A2, sepVal, iniProjDirMethod, projDirMethod, alpha=0.05, ITMAX=20, eps=1.0e-10, quiet=TRUE) { iniProjDirMethod<-match.arg(arg=iniProjDirMethod, choices=c("SL", "naive")) projDirMethod<-match.arg(arg=projDirMethod, choices=c("newton", "fixedpoint")) if(A2<0) { stop("The initial upper bound 'A2' for the vertex-edge scalar 'A' should be positive!\n") } if(sepVal<= -0.999 || sepVal>=0.999) { stop("The separation index 'sepVal' should be in (-1, 1)!\n") } if(alpha<=0 || alpha>0.5) { stop("'alpha' should be between (0, 0.5]!\n") } ITMAX<-as.integer(ITMAX) if(ITMAX<1 || !is.integer(ITMAX)) { stop("The maximum iteration allowed 'ITMAX' should be positive integer!\n") } if(eps<=0 || eps > 0.01) { stop("The convergence threshold 'eps' should be in (0, 0.01]!\n") } if(!is.logical(quiet)) { stop("The value of the quiet indicator 'quiet' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } tmpsi<-min(diag(si), na.rm=TRUE) tmpsj<-min(diag(sj), na.rm=TRUE) A1<-min(tmpsi, tmpsj, na.rm=TRUE) # get A1 so that the separation index between cluster i and j <0. while(1) { tmp<-funSepVal(A1, vertexi, vertexj, si, sj, sepVal, iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet) if(tmp>0) { A1<-A1/2 } else { break } } # get A2 so that the separation index between cluster i and j >0. tmpsi<-max(diag(si), na.rm=TRUE) tmpsj<-max(diag(sj), na.rm=TRUE) mystep<-max(tmpsi, tmpsj, na.rm=TRUE) while(1) { tmp2<-funSepVal(A2, vertexi, vertexj, si, sj, sepVal, iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet) if(tmp2<0) { A2<-A2+mystep } else { break } } return(list(A1=A1, A2=A2)) } # estimate the value of A so that the separation index between cluster 1 and cluster 2 is sepVal # A1 -- lower bound of A # A2 -- upper bound of A # i -- indicate we are dealing with the separation index between cluster 1 and cluster i # vertex1 -- vertex 1 # vertex2 -- vertex i # Sigma1 -- covariance matrix of the cluster 1 # Sigma2 -- covariance matrix of the cluster i # sepVal -- the desired separation index between cluster 1 and cluster i # See documentation of genRandomClust for explanation of arguments: # iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet getA<-function(A1, A2, vertex1, vertex2, Sigma1, Sigma2, sepVal, iniProjDirMethod=c("SL", "naive"), projDirMethod=c("newton", "fixedpoint"), alpha=0.05, ITMAX=20, eps=1.0e-10, quiet=TRUE) { iniProjDirMethod<-match.arg(arg=iniProjDirMethod, choices=c("SL", "naive")) projDirMethod<-match.arg(arg=projDirMethod, choices=c("newton", "fixedpoint")) if(A1<0) { stop("The initial lower bound 'A1' for the vertex-edge scalar 'A' should be positive!\n") } if(A2<0) { stop("The initial upper bound 'A2' for the vertex-edge scalar 'A' should be positive!\n") } if(A1>A2) { stop("'A1' should be less than 'A2'!\n") } if(sepVal<= -0.999 || sepVal>=0.999) { stop("The separation index 'sepVal' should be between (-1, 1)!\n") } if(alpha<=0 || alpha>0.5) { stop("'alpha' should be between (0, 0.5]!\n") } ITMAX<-as.integer(ITMAX) if(ITMAX<1 || !is.integer(ITMAX)) { stop("The maximum iteration allowed 'ITMAX' should be positive integer!\n") } if(eps<=0 || eps > 0.01) { stop("The convergence threshold 'eps' should be between (0, 0.01]!\n") } if(!is.logical(quiet)) { stop("The value of the quiet indicator 'quiet' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } newfit<-0 class(newfit)<-"try-error" newfit<-try( tmp<-uniroot(f=funSepVal, lower=A1, upper=A2, vertex1=vertex1, vertex2=vertex2, Sigma1=Sigma1, Sigma2=Sigma2, sepVal=sepVal, iniProjDirMethod=iniProjDirMethod, projDirMethod=projDirMethod, alpha=alpha, ITMAX=ITMAX, eps=eps, quiet=quiet) ) if(sum(class(newfit)=="try-error", na.rm=TRUE)) { warning("Could not find suitable upper bound of 'myc'!\n 'myc' is set to be 'A2'!\n") A <-A2 } else { A<-tmp$root } return(A) # A<-tmp$root # return(A) } # given mui, Sigma1, Sigma2, first to calculate the separation index. # then get the difference between this separation index and the given # separation index. # Sigma1 -- the covariance matrix of the first cluster. The mean vector of # the first cluster is 0. # Sigma2 -- the covariance matrix of the second cluster. The mean vector of # the second cluser is (0,...,0, A, 0, ..., 0), where A is the # ith element. # A -- see Sigma2 # i -- see Sigma2 # sepVal -- the given separation index # See documentation of genRandomClust for explanation of arguments: # iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet funSepVal<-function(A, vertex1, vertex2, Sigma1, Sigma2, sepVal, iniProjDirMethod=c("SL", "naive"), projDirMethod=c("newton", "fixedpoint"), alpha=0.05, ITMAX=20, eps=1.0e-10, quiet=TRUE) { iniProjDirMethod<-match.arg(arg=iniProjDirMethod, choices=c("SL", "naive")) projDirMethod<-match.arg(arg=projDirMethod, choices=c("newton", "fixedpoint")) if(sepVal<= -0.999 || sepVal>=0.999) { stop("The separation index 'sepVal' should be in (-0.999, 0.999)!\n") } if(alpha<=0 || alpha>0.5) { stop("'alpha' should be in (0, 0.5]!\n") } ITMAX<-as.integer(ITMAX) if(ITMAX<1 || !is.integer(ITMAX)) { stop("The maximum iteration allowed 'ITMAX' should be positive integer!\n") } if(eps<=0 || eps > 0.01) { stop("The convergence threshold 'eps' should be in (0, 0.01]!\n") } p<-ncol(Sigma1) mu1<-A*vertex1 mu2<-A*vertex2 # get the initial projection direction iniProjDir<-getIniProjDirTheory(mu1, Sigma1, mu2, Sigma2, iniProjDirMethod, eps) # get the projection direction tmpa<-projDirTheory(iniProjDir, mu1, Sigma1, mu2, Sigma2, projDirMethod, alpha, ITMAX, eps, quiet) a<-tmpa$projDir # get separation index tmpsepVal<-tmpa$sepVal res<-tmpsepVal-sepVal return(res) } clusterGeneration/R/visual.R0000644000176000001440000004531111731723336015640 0ustar ripleyusers# v1.2.5 # (1) fixed a bug in the function 'plot2DProjection'. The 'lim' should be # 'ylim' # # plot separation plane a'(y-eta)=0, i.e. a'y-b=0, where b=a'eta. # y1 -- cluster 1. Rows correspond to observations. # Columns correspond to variables # y2 -- cluster 2. Rows correspond to observations. # Columns correspond to variables # projDir -- projection direction # bw -- the smoothing bandwidth to be used by the R function density() # alpha -- tuning parameter # xlab, ylab -- labels of x and y coordinates # title -- title of the plot plot1DProjection<-function(y1, y2, projDir, sepValMethod=c("normal", "quantile"), bw="nrd0", xlim=NULL, ylim=NULL, xlab="1-D projected clusters", ylab="density estimates", title="1-D Projected Clusters and their density estimates", font=2, font.lab=2, cex=1.2, cex.lab=1.2, cex.main=1.5, lwd=4, lty1=1, lty2=2, pch1=18, pch2=19, col1=2, col2=4, type="l", alpha=0.05, eps=1.0e-10, quiet=TRUE) { sepValMethod<-match.arg(sepValMethod, choices=c("normal", "quantile")) if(alpha<=0 || alpha>0.5) { stop("The tuning parameter 'alpha' should be in the range (0, 0.5]!\n") } if(eps<=0 || eps > 0.01) { stop("The convergence threshold 'eps' should be between (0, 0.01]!\n") } x1<-y1%*%projDir x2<-y2%*%projDir x<-c(x1, x2) n1<-length(x1) n2<-length(x2) n<-n1+n2 m1<-mean(x1, na.rm=TRUE) m2<-mean(x2, na.rm=TRUE) if(m1>m2) { projDir<- -projDir x1<- -x1 x2<- -x2 if(!quiet) { cat("Warning: Projected cluster 1 is on the right-hand side of projected cluster 2!\n") cat("'projDir' is replaced by '-projDir'!\n") } } x<-c(x1, x2) n1<-length(x1) n2<-length(x2) xx1<-x1 xx2<-x2 if(sepValMethod=="quantile") { if(n1>1) { L1<-quantile(xx1, prob=alpha/2, na.rm=TRUE) U1<-quantile(xx1, prob=1-alpha/2, na.rm=TRUE) } else { L1<-xx1 U1<-xx1 } if(n2>1) { L2<-quantile(xx2, prob=alpha/2, na.rm=TRUE) U2<-quantile(xx2, prob=1-alpha/2, na.rm=TRUE) } else { L2<-xx2 U2<-xx2 } } else { za<-qnorm(1-alpha/2) if(n1>1) { m1<-mean(xx1, na.rm=TRUE) sd1<-sd(c(xx1), na.rm=TRUE) } else { m1<-xx1 sd1<-0 } if(n2>1) { m2<-mean(xx2, na.rm=TRUE) sd2<-sd(c(xx2), na.rm=TRUE) } else { m2<-xx2 sd2<-0 } L1<-m1-za*sd1 U1<-m1+za*sd1 L2<-m2-za*sd2 U2<-m2+za*sd2 } numer<-(L2-U1) denom<-(U2-L1) if(abs(denom)1) { tmp1<-density(xx1, bw=bw) tmpx1<-tmp1$x tmpy1<-tmp1$y } else { tmpx1<-xx1; tmpy1<-1; } if(n2>1) { tmp2<-density(xx2, bw=bw) tmpx2<-tmp2$x tmpy2<-tmp2$y } else { tmpx2<-xx2 tmpy2<-1; } nn1<-length(tmpx1) nn2<-length(tmpx2) if(is.null(xlim)) { xlim<-range(c(xx1, xx2, tmpx1, tmpx2)) } if(is.null(ylim)) { ylim<-range(c(tmpy1, tmpy2, tmpy1, tmpy2)) } plotCluster(nn1, nn2, tmpx1, tmpy1, tmpx2, tmpy2, xlim, ylim, xlab, ylab, title, font, font.lab, cex, cex.lab, cex.main, lwd, lty1, lty2, pch1, pch2, col1, col2, type) plotTickLabel(xx1, xx2, L1,U1,L2,U2, 1, font, lwd, lty1, lty2, col1, col2) #ticks and labels invisible(list(sepVal=sepVal, projDir=projDir)) } # function called by plot1DProjection and plot2DProjection plotCluster<-function(nn1, nn2, tmpx1, tmpy1, tmpx2, tmpy2, xlim, ylim, xlab, ylab, title, font, font.lab, cex, cex.lab, cex.main, lwd, lty1, lty2, pch1, pch2, col1, col2, type="l") { if(nn1>1 && nn2>1) { plot(tmpx1, tmpy1, type=type, xlim=xlim, ylim=ylim, xlab=xlab, ylab=ylab, col=col1, font=font, cex=cex, font.lab=font.lab, cex.lab=cex.lab, lwd=lwd, lty=lty1, pch=pch1) title(title, cex.main=cex.main) points(tmpx2, tmpy2, type=type, col=col2, lwd=lwd, lty=lty2, pch=pch2) } else if (nn1>1 && nn2==1) { plot(tmpx1, tmpy1, type=type, xlim=xlim, ylim=ylim, xlab=xlab, ylab=ylab, col=col1, font=font, cex=cex, font.lab=font.lab, cex.lab=cex.lab, lwd=lwd, lty=lty1, pch=pch1) title(title, cex.main=cex.main) points(tmpx2, tmpy2, type="p", col=col2, lty=lty2, pch=pch2) } else if (nn1==1 && nn2>1) { plot(tmpx2, tmpy2, type=type, xlim=xlim, ylim=ylim, xlab=xlab, ylab=ylab, col=col2, font=font, cex=cex, font.lab=font.lab, cex.lab=cex.lab, lwd=lwd, lty=lty1, pch=pch1) title(title, cex.main=cex.main) points(tmpx1, tmpy1, type="p", col=col1, lty=lty2, pch=pch2) } else { plot(x=c(tmpx1), y=c(tmpy1), type="p", xlim=xlim, ylim=ylim, xlab=xlab, ylab=ylab, col=col1, font=font, cex=cex, font.lab=font.lab, cex.lab=cex.lab, lty=lty1, pch=pch1) title(title, cex.main=cex.main) points(x=c(tmpx2), y=c(tmpy2), type="p", col=col2, lty=lty2, pch=pch2) } } # Plot ticks and labels along rotated 1st axis # x1, x2 -- projected cluster 1 and 2 along the 1st axis # L1, U1 -- lower and upper 1-alpha/2 percentile of x1 # L2, U2 -- lower and upper 1-alpha/2 percentile of x2 plotTickLabel<-function(x1, x2, L1, U1, L2, U2, axis=1, font=2, lwd=4, lty1=1, lty2=2, col1=2, col2=4) { axis(side=axis,at=x1, labels=FALSE, tick=TRUE,col=col1) axis(side=axis,at=x2, labels=FALSE, tick=TRUE,col=col2) par(mgp=c(3,2,0)) axis(side=axis,at=c(L1, U1), labels=c("L1", "U1"), tick=TRUE,col=col1, lwd=lwd,font=font, lty=lty1) axis(side=axis,at=c(L2, U2), labels=c("L2", "U2"), tick=TRUE,col=col2, lwd=lwd,font=font, lty=lty2) par(mgp=c(3,1,0)) } # plot 2-D projection # sepValMethod -- quantile or normal # quantile means use upper and lower alpha quantiles of univariate projections # normal means use mean +/- z(alpha)*sd for mean,sd of projections # y1 -- cluster 1 # y2 -- cluster 2 # alpha -- tuning parameter # xlab, ylab -- labels of x and y coordinates # title -- title of the plot plot2DProjection<-function(y1, y2, projDir, sepValMethod=c("normal", "quantile"), iniProjDirMethod=c("SL", "naive"), projDirMethod=c("newton", "fixedpoint"), xlim=NULL, ylim=NULL, xlab="1st projection direction", ylab="2nd projection direction", title="Scatter plot of 2-D Projected Clusters", font=2, font.lab=2, cex=1.2, cex.lab=1, cex.main=1.5, lwd=4, lty1=1, lty2=2, pch1=18, pch2=19, col1=2, col2=4, alpha=0.05, ITMAX=20, eps=1.0e-10, quiet=TRUE) { sepValMethod<-match.arg(sepValMethod, choices=c("normal", "quantile")) iniProjDirMethod<-match.arg(arg=iniProjDirMethod, choices=c("SL", "naive")) projDirMethod<-match.arg(arg=projDirMethod, choices=c("newton", "fixedpoint")) if(alpha<=0 || alpha>0.5) { stop("The tuning parameter 'alpha' should be in the range (0, 0.5]!\n") } ITMAX<-as.integer(ITMAX) if(ITMAX<=0 || !is.integer(ITMAX)) { stop("The maximum iteration number allowed 'ITMAX' should be a positive integer!\n") } if(eps<=0 || eps > 0.01) { stop("The convergence threshold 'eps' should be in (0, 0.01]!\n") } if(!is.logical(quiet)) { stop("The value of the quiet indicator 'quiet' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } # obtain first two rotated coordinates tmp<-getRotateData(y1, y2, projDir, iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet) Q2<-tmp$Q2 # rotation matrix cl<-tmp$cl x<-tmp$x x1<-x[cl==1,1] x2<-x[cl==2,1] tmpy1<-x[cl==1,2] tmpy2<-x[cl==2,2] mx1<-mean(x1, na.rm=TRUE) mx2<-mean(x2, na.rm=TRUE) if(mx1>mx2) { x1<- -x1 x2<- -x2 Q2[,1]<- -Q2[,1] if(!quiet) { cat("Warning: Projected cluster 1 is on the right-hand side of projected cluster 2 along the 1st projection direction!\n") cat("'projDir' is replaced by '-projDir'!\n") } } tmpx<-c(x1, x2) my1<-mean(tmpy1, na.rm=TRUE) my2<-mean(tmpy2, na.rm=TRUE) if(my1>my2) { tmpy1<- -tmpy1 tmpy2<- -tmpy2 Q2[,1]<- -Q2[,1] if(!quiet) { cat("Warning: Projected cluster 1 is on the right-hand side of projected cluster 2 along the 1st projection direction!\n") cat("'projDir' is replaced by '-projDir'!\n") } } tmpy<-c(tmpy1, tmpy2) xx1<-x1 xx2<-x2 yy1<-tmpy1 yy2<-tmpy2 n1<-length(xx1) n2<-length(xx2) if(sepValMethod=="quantile") { if(n1>1) { Lx1<-quantile(xx1, prob=alpha/2, na.rm=TRUE) Ux1<-quantile(xx1, prob=1-alpha/2, na.rm=TRUE) Ly1<-quantile(yy1, prob=alpha/2, na.rm=TRUE) Uy1<-quantile(yy1, prob=1-alpha/2, na.rm=TRUE) } else { Lx1<-xx1 Ux1<-xx1 Ly1<-yy1 Uy1<-yy1 } if(n2>1) { Lx2<-quantile(xx2, prob=alpha/2, na.rm=TRUE) Ux2<-quantile(xx2, prob=1-alpha/2, na.rm=TRUE) Ly2<-quantile(yy2, prob=alpha/2, na.rm=TRUE) Uy2<-quantile(yy2, prob=1-alpha/2, na.rm=TRUE) } else { Lx2<-xx2 Ux2<-xx2 Ly2<-yy2 Uy2<-yy2 } } else { za<-qnorm(1-alpha/2) if(n1>1) { mx1<-mean(xx1, na.rm=TRUE) sdx1<-sd(c(xx1), na.rm=TRUE) my1<-mean(yy1, na.rm=TRUE) sdy1<-sd(c(yy1), na.rm=TRUE) } else { mx1<-xx1 sdx1<-0 my1<-yy1 sdy1<-0 } if(n2>1) { mx2<-mean(xx2, na.rm=TRUE) sdx2<-sd(c(xx2), na.rm=TRUE) my2<-mean(yy2, na.rm=TRUE) sdy2<-sd(c(yy2), na.rm=TRUE) } else { mx2<-xx2 sdx2<-0 my2<-yy2 sdy2<-0 } Lx1<-mx1-za*sdx1 Ux1<-mx1+za*sdx1 Lx2<-mx2-za*sdx2 Ux2<-mx2+za*sdx2 Ly1<-my1-za*sdy1 Uy1<-my1+za*sdy1 Ly2<-my2-za*sdy2 Uy2<-my2+za*sdy2 } numerx<-(Lx2-Ux1) denomx<-(Ux2-Lx1) numery<-(Ly2-Uy1) denomy<-(Uy2-Ly1) if(abs(denomx)2) { ry1<-ry[,-1] # delete the first dimension # we will find a optimal direction in the rest of dimension. # the obtained optimal direction will orthogonal with "projDir" if(n1>1){tmpy1<-ry1[cl==1,]}else{tmpy1<-matrix(ry1,nrow=1,ncol=p-1)} if(n2>1){tmpy2<-ry1[cl==1,]}else{tmpy2<-matrix(ry1,nrow=1,ncol=p-1)} tmp<-projDirData(tmpy1, tmpy2, iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet) aa<-c(0, tmp$projDir) aa<-as.vector(Q%*%aa) # aa^T a=0 Q2<-cbind(Q[,1], aa) # data will project into these two dimension } else { Q2<-Q } x<-y%*%Q2 return(list(x=x,cl=cl,Q2=Q2)) } # Visualize clusters by projecting them into a 2-dimensional space. # The 2-dimensional space is spanned by the first 2 eigenvectors of the # matrix B: #\begin{eqnarray*} # B&=&{2\over # k_0(k_0-1)}\sum_{i=1}^{k_0-1}\sum_{j=i+1}^{k_0}\E\left[(\Y_i-\Y_j) # (\Y_i-\Y_j)^T\right]\\ # &=&{2\over k_0}\sum_{i=1}^{k_0}\Sigma_i+{2\over # k_0(k_0-1)}\sum_{i0) { cl2<-cl[-out.label] y2<-y[-out.label,,drop=FALSE] } cl2.u<-unique(cl2) k0<-length(cl2.u) # project data to the 2-dimensional space spanned by the first 2 # eigenvectors of the between cluster distance matrix B. if(projMethod=="DMS") { tmp<-DMSProj(y2, cl2, outlierLabel) } else { tmp<-eigenProj(y2, cl2, outlierLabel) } #ev<-tmp$ev # eigenvalues of the matrix B Q<-tmp$Q # the all eigenvectors of B B<-tmp$B # the between distance matrix B # we only plot the first 2 dimension, using all data points # including outliers proj<-y%*%Q[,1:2] if(missing(xlim)) { xlim<-range(proj[,1]) } else { xlim<-xlim } if(missing(ylim)) { ylim<-range(proj[,2]) } else { ylim<-ylim } # plot projected clusters plot(proj[cl==cl2.u[1], 1], proj[cl==cl2.u[1], 2], xlim=xlim, ylim=ylim, xlab=xlab,ylab=ylab,col=1, pch=1,font=font,cex=cex, font.lab=font.lab,cex.lab=cex.lab) # plot outliers if(length(out.label)>0) { points(proj[cl==outlierLabel,1], proj[cl==outlierLabel,2], col=2, pch=2) } title(title,cex.main=1.5); start<-1 # plot the data points in the remaining clusters if(k0>1) { for(i in 2:k0) { points(proj[cl==cl2.u[i],1], proj[cl==cl2.u[i],2], col=start+i,pch=start+i) } } res<-list(B=B, Q=Q, proj=proj) invisible(res) } # Eigen approach to find the projection directions to visualize clusters. # Project data to the 2-dimensional space spanned by the first two # eigenvectors of the between cluster distance matrix B # B=(2/ k0)*sum_{i=1}^{k_0}Sigmai # +(2/(k_0(k_0-1))sum_{i0) { cl2<-cl[-out.label]; y2<-y[-out.label,]; } else { cl2<-cl; y2<-y; } cl.set<-sort(unique(cl2)) k0<-length(cl.set) # obtain cluster centers and covariance matrices s<-array(0,c(p,p,k0)) mu.mat<-matrix(0, nrow=k0, ncol=p) # W = sum_{i=1}^{k0} Sigma_i /k0 W<-matrix(0,nrow=p, ncol=p) for(i in 1:k0) { yi<-y2[cl==cl.set[i],] mu.mat[i,]<-apply(yi, 2, mean, na.rm=TRUE) s[,,i]<-cov(yi) W<-W+s[,,i] } W<-W/k0 if(k0>1) { # B=sum_{i=1}^{k0-1}sum_{j=(i+1)}^{k0} # (theta_i-theta_j)*(theta_i-theta_j)^T / (k0*(k0-1)) B<-matrix(0, nrow=p, ncol=p) for(i in 1:(k0-1)) { mui<-mu.mat[i,] for(j in (i+1):k0) { muj<-mu.mat[j,] B<-B+outer(mui-muj, mui-muj) } } B<-W+(B/(k0*(k0-1))) } else { B<-W } B<-2*B eg<-eigen(B, symmetric=TRUE) Q<-eg$vectors # The i-th column of Q is the eigenvector of B # corresponding to the i-th eigenvalue res<-list(Q=Q, B=B) return(res) } # Dhillon et al.'s (2002) approach to find the projection directions to # visualize clusters. # Dhillon I. S., Modha, D. S. and Spangler, W. S. (2002) # Class visualization of high-dimensional data with applications. # \emph{computational Statistics and Data Analysis}, \bold{41}, 59--90. # # project data to the t-dimensional space spanned by the first two eigen # vectors of the between cluster distance matrix B # B=\sum_{i=2}^{k_0}\sum_{j=1}^{i-1} # n_in_j(\theta_i-\theta_j)(\theta_i-\theta_j)^T # # y -- nxp data points # cl -- a partition of the n data points in y # outlierLabel -- integer (default 0) for detected outliers DMSProj<-function(y, cl, outlierLabel=0) { y<-as.matrix(y) p<-ncol(y); # remove outliers out.label<-which(cl==outlierLabel) if(length(out.label)>0) { cl2<-cl[-out.label]; y2<-y[-out.label,]; } else { cl2<-cl; y2<-y; } cl.set<-sort(unique(cl2)) k0<-length(cl.set) n.set<-tapply(rep(1,length(cl2)), cl2, sum, na.rm=TRUE) # obtain cluster centers and covariance matrices mu.mat<-matrix(0, nrow=k0, ncol=p) for(i in 1:k0) { yi<-y2[cl==cl.set[i],] if(n.set[i]>1) { mu.mat[i,]<-apply(yi, 2, mean, na.rm=TRUE) } else { mu.mat[i,]<-yi } } if(k0>1) { # B=\sum_{i=2}^{k_0}\sum_{j=1}^{i-1} # n_in_j(\theta_i-\theta_j)(\theta_i-\theta_j)^T B<-matrix(0, nrow=p, ncol=p) for(i in 2:k0) { mui<-as.vector(mu.mat[i,]) for(j in 1:(i-1)) { muj<-as.vector(mu.mat[j,]) ni<-n.set[i]; nj<-n.set[j]; tem1<-(mui-muj) tem2<-tem1%*%t(tem1) B<-B+n.set[i]*n.set[j]*tem2 } } } else { stop("the number of clusters is 0!\n") } eg<-eigen(B, symmetric=TRUE) Q<-eg$vectors # The i-th column of Q is the eigenvector of B # corresponding to the i-th eigenvalue res<-list(Q=Q, B=B) return(res) } clusterGeneration/R/hjrancor.R0000644000176000001440000000322610700717035016134 0ustar ripleyusers# Reference: # Joe H (2006). Generating random correlation matrices based on partial # correlations. J. Mult. Anal. Vol. 97, 2177--2189. # Generate random correlation matrix based on random partial correlations # choice of alpha parameter lead to invariance to index order. # d = dimension, # alphad = alpha parameter for partial of 1,d given 2,...,d-1 # default value alphad = 1 leads to random matrix uniform over # space of positive definite correlation matrices # Other each correlation has a Beta(a,a) distribution on (-1,1) where # a=alphad+(d-2)/2 rcorrmatrix<-function(d,alphad=1) { d<-as.integer(d) if(d<=0 || !is.integer(d)) { stop("The dimension 'd' should be a positive integer!\n") } if(alphad<=0) { stop("'alphad' should be positive!\n") } # handling of d=1 and d=2 if(d==1) { rr<-matrix(1,1,1); return(rr) } if(d==2) { rho<-runif(1,-1,1) rr<-matrix(c(1,rho,rho,1),2,2); return(rr) } rr<-matrix(0,d,d) diag(rr)<-1 for(i in 1:(d-1)) { alp<-alphad+(d-2)/2 rr[i,i+1]<-2*rbeta(1,alp,alp)-1 rr[i+1,i]<-rr[i,i+1] } for(m in 2:(d-1)) { for(j in 1:(d-m)) { rsub<-rr[j:(j+m),j:(j+m)] #print(rsub) alp<-alphad+(d-1-m)/2 rr[j,j+m]<-rjm(rsub,alp) rr[j+m,j]<-rr[j,j+m] } } return(rr) } # rsub is a symmetrix matrix, alp is a beta parameter # generate the correlation for the (j,j+m) component rjm<-function(rsub,alp) { b<-nrow(rsub) ii<-2:(b-1) r1<-rsub[ii,1] r3<-rsub[ii,b] R2<-rsub[ii,ii] Ri<-solve(R2) rcond<-2*rbeta(1,alp,alp)-1 tem13<-t(r1)%*%Ri%*%r3 tem11<-t(r1)%*%Ri%*%r1 tem33<-t(r3)%*%Ri%*%r3 res<-tem13+rcond*sqrt((1-tem11)*(1-tem33)) return(res) } clusterGeneration/R/simClustDesign.R0000644000176000001440000004344011736333655017301 0ustar ripleyusers # Generating data sets via a factorial design (Qiu and Joe, 2006), # which has factors degree of separation, number of clusters, # number of non-noisy variables, number of noisy variables, # number of outliers, number of replicates, etc. # The separation between any cluster and its nearest neighboring clusters # can be set to a specified value. The covariance matrices of clusters # can have arbitrary diameters, shapes and orientations. # Qiu, W.-L. and Joe, H. (2006) # Generation of Random Clusters with Specified Degree of Separaion. # \emph{Journal of Classification}, \bold{23}(2), 315--334. # # Factors Levels #--------------------------------------------- # Number of clusters 3, 6, 9 # Degree of separation close, separated, well-separated # Number of non-noisy variables 4, 8, 20 # Number of noisy variables 1, 0.5p1, p1 #--------------------------------------------- # p1 is the number of non-noisy variables # # sepVal -- Levels of the degree of separation. # It has been "calibrated" against two univariate # clusters generated from N(0, 1) and N(0, A), respectively, # With A=4, sepVal=0.01 indicates a close cluster structure. # With A=6, sepVal=0.21 indicates a separate cluster structure. # With A=8, sepVal=0.34 indicates a well-separated cluster structure. # 'close' -- 'sepVal=0.01' # 'separated' -- 'sepVal=0.21' # 'well-separated' -- 'sepVal=0.342' # sepLabels -- labels for 'close', 'separated', and 'well-separated' # numClust -- levels of numbers of clusters # numNonNoisy -- levels of numbers of non-noisy variables # numNoisy -- levels of numbers of noisy variables # numOutlier -- number/ratio of outliers. If numOutlier is a positive integer, # then numOutlier means the number of outliers. If numOutlier is a real # number in (0,1), then numOutlier means the ratio of outliers, i.e. # the number of outliers is equal to round(numOutlier*N), # where N is the total number of non-outliers. # numReplicate -- the number of data sets to generate for each combination. # the default value is 3 as in the design in Milligan (1985) # (An algorithm for generating artificial test clusters, # Psychometrika, 50, 123-127) # fileName -- the output data file names have the format # [fileName]J[j]G[g]v[p1]nv[p2]out[numOutlier]_[numReplicate].[extension] # where # 'extension' can be 'dat', 'log', 'mem', or 'noisy', # 'J' means separation index, 'G' means number of clusters, # 'v' means the number of non-noisy variables, # 'nv' means the number of noisy variables, # 'out' means the number of outliers # clustszind -- cluster size indicator. # clustszind=1 indicates that all cluster have equal size. # The size is specified by clustSizeEq. # clustszind=2 indicates that the cluster sizes are randomly # generated from the range rangeN. # clustszind=3 indicates that the cluster sizes are specified # via the vector clustSizes. # The default value is 2 so that the generated clusters are more realistic. # clustSizeEq -- if clustszind=1, then this is the constant cluster size # The default value 100 is a reasonable cluster size. # rangeN -- if clustszind=2, then the cluster sizes are randomly generaged # from the range 'rangeN'. The default range is [50, 200] which # can produce reasonable variability of cluster sizes. # clustSizes -- if clustszind=3, then the cluster sizes are specified via # clustSizes. An input is required with # the default value of 'clustSizes' set as 'NULL'. # covMethod -- methods to generate random covariance matrices. # 'eigen' method first generates the eigenvalues of the covariance matrix, # then generate eigenvectors to construct the covariance matrix. # 'unifcorrmat' method first generates a correlation # matrix via the method proposed by # Joe H (2006). Generating random correlation matrices based on # partial correlations. J. Mult. Anal. Vol. 97, 2177--2189 # Then, it generate variances from the range 'rangeVar' to # construct the covariance matrix. # # 'onion' method # extension of onion method, using Cholesky instead of msqrt #Ghosh, S., Henderson, S. G. (2003). Behavior of the NORTA method for #correlated random vector generation as the dimension increases. #ACM Transactions on Modeling and Computer Simulation (TOMACS) #v 13 (3), 276-294. # # 'c-vine' method # # Random correlation with the C-vine (different order of partial # correlations). Reference for vines: Kurowicka and Cooke, 2006, # Uncertainty Analysis with High Dimensional Dependence Modelling, # Wiley, 2006. # # The default method is 'eigen' so that the user can directly # specify the range of the 'diameters' of clusters. # rangeVar -- if 'covMethod="unifcorrmat"', then to generate a covariance # matrix, we first generate a correlation matrix, then randomly generate # variances from the range specified by 'rangeVar'. # The default range is [1, 10] which can generate reasonable # variability of variances. # lambdaLow -- if 'covMethod="eigen"', when generating a covariance matrix, # we first generate its eigenvalues. The eigenvalues are randomly # generated from the interval [lambdaLow, lambdaLow*ratioLambda]. # Note that lambdaLow should be positive. # In our experience, the range [lambdaLow=1, ratioLambda=10] # can give reasonable variability of the diameters of clusters. # ratioLambda -- if 'covMethod="eigen"', lambdaUpp=lambdaLow*ratioLambda, # is the upper bound of the eigenvalues of a covariance matrix # and lambdaLow is the lower bound. # In our experience, the range [lambdaLow=1, lambdaUpp=10] # can give reasonable variability of the diameters of clusters. # alphad -- parameter for c-vine and onion method to generate random correlation matrix # eta=1 for uniform. eta should be > 0 # eta -- parameter for c-vine and onion method to generate random correlation matrix # eta=1 for uniform. eta should be > 0 # rotateind-- if rotateind =TRUE, then rotate data so that we may not detect the # full cluster structure from scatterplots, otherwise do not rotate. # By default, 'rotateind=TRUE' to generate more realistic data sets. # iniProjDirMethod -- indicating the method to get initial projection direction # By default, the sample version of the Su and Liu (SL) projection # direction ((n1-1)*S1+(n2-1)*S2)^{-1}(mu2-mu1) is used, # where mui and Si are the mean vector and covariance # matrix of cluster i. Alternatively, the naive projection # direction (mu2-mu1) is used. # Su and Liu (1993) JASA 1993, vol. 88 1350-1355. # projDirMethod -- takes values "fixedpoint" or "newton" # "fixedpoint" means that we get optimal projection direction # by solving the equation d J(w) / d w = 0, where # J(w) is the separation index with projection direction 'w' # "newton" method means that we get optimal projection driection # by the method proposed in the appendix of Qiu and Joe (2006) # "Generation of random clusters with specified degree of separation", # Journal of Classification Vol 23(2), 315-334. # alpha -- tuning parameter for separation index to indicating the percentage # of data points to downweight. We set 'alpha=0.05' like we set # the significance level in hypothesis testing as 0.05. # ITMAX -- when calculating the projection direction, we need iterations. # ITMAX gives the maximum iteration allowed. # The actual #iterations is usually much less than the default value 20. # eps -- A small positive number to check if a quantitiy \eqn{q} is # equal to zero. If \eqn{|q|<}\code{eps}, then we regard \eqn{q} # as equal to zero. \code{eps} is used to check if an algorithm # converges. The default value is \eqn{1.0e-10}. # quiet -- a flag to switch on/off the outputs of intermediate results. # The default value is 'TRUE'. # outputEmpirical -- indicates if empirical projection directions # and separation indices should be output to files. # These information usually are useful to check the cluster # structures. Hence, by default, 'outputEmpirical=TRUE'. # outputInfo -- indicates if separation information dataframe should be output. # The file name extension is .log simClustDesign<-function(numClust=c(3,6,9), sepVal=c(0.01, 0.21, 0.342), sepLabels=c("L", "M", "H"), numNonNoisy=c(4,8,20), numNoisy=NULL, numOutlier=0, numReplicate=3, fileName="test", clustszind=2, clustSizeEq=50, rangeN=c(50,200), clustSizes=NULL, covMethod=c("eigen", "onion", "c-vine", "unifcorrmat"), rangeVar=c(1, 10), lambdaLow=1, ratioLambda=10, alphad=1, eta=1, rotateind=TRUE, iniProjDirMethod=c("SL", "naive"), projDirMethod=c("newton", "fixedpoint"), alpha=0.05, ITMAX=20, eps=1.0e-10, quiet=TRUE, outputDatFlag=TRUE, outputLogFlag=TRUE, outputEmpirical=TRUE, outputInfo=TRUE) { iniProjDirMethod<-match.arg(arg=iniProjDirMethod, choices=c("SL", "naive")) projDirMethod<-match.arg(arg=projDirMethod, choices=c("newton", "fixedpoint")) covMethod<-match.arg(arg=covMethod, choices=c("eigen", "onion", "c-vine", "unifcorrmat")) numClust<-as.integer(numClust) # checks for valid inputs if(prod(numClust<1) || !is.integer(numClust)) { stop("The number 'numClust' of clusters should be a positive integer!\n") } numNonNoisy<-as.integer(numNonNoisy) if(prod(numNonNoisy<2) || !is.integer(numNonNoisy)) { stop("The number 'numNonNoisy' of non-noisy variables should be an integer greater than 1!\n") } if(prod(sepVal<= -0.999) || prod(sepVal >= 0.999)) { stop("The desired separation index 'sepVal' should be in the range (-0.999, 0.999)!\n") } numReplicate<-as.integer(numReplicate) if(numReplicate<1 || !is.integer(numReplicate)) { stop("The number 'numReplicate' should be a positive integer!\n") } if(!is.null(numNoisy)) { numNoisy<-as.integer(numNoisy) if(numNoisy<0 || !is.integer(numNoisy)) { stop("The number 'numNoisy' of noisy variables should be a non-negative integer!\n") } } if(numOutlier<0) { stop("'numOutlier' should be positive!\n") } if(!is.element(clustszind, c(1,2,3))) { stop("Cluster size indicator 'clustszind' should be 1, 2, or 3!\n") } clustSizeEq<-as.integer(clustSizeEq) if(clustSizeEq<2 || !is.integer(clustSizeEq)) { stop("Cluster size 'clustSizeEq' should be an integer greater than 1!\n") } rangeN<-as.integer(rangeN) if(length(rangeN)!=2) { stop("The range 'rangeN' for cluster sizes should be a numeric vector of length 2!\n") } if(rangeN[1]>rangeN[2]) { stop("First element of 'rangeN' should be smaller than second!\n") } if(rangeN[1]<2 || !is.integer(rangeN[1])) { stop("The lower bound 'rangeN[1]' for the range of cluster sizes should be an integer greater than 1!\n") } if(!is.integer(rangeN[2])) { stop("The upper bound 'rangeN[2]' for the range of cluster sizes should be integer!\n") } if(clustszind==3) { len<-length(clustSizes) if(len!=numClust || is.null(clustSizes)) { stop("The number of elements in 'clustSizes' should be equal the number of elements in 'numClust' when the value of 'clustszind' is equal to 3!\n") } clustSizes<-as.integer(clustSizes) for(i in 1:len) { if(clustSizes[i]<1 || !is.integer(clustSizes[i])) { stop(paste("The cluster size for the ", i, "-th cluster should be an integer greater than 1!\n", sep="")) } } } if(rangeVar[1]>rangeVar[2]) { stop("First element of 'rangeVar' should be smaller than second!\n") } if(rangeVar[1]<0) { stop("The lower bound 'rangeVar[1]' for the range of variances should be positive!\n") } if(lambdaLow<0) { stop("The lower bound 'lambdaLow' of eigenvalues of cluster covariance matrices should be greater than zero!\n") } if(ratioLambda<1) { stop("The ratio 'ratioLambda' of the upper bound of the eigenvalues to the lower bound of the eigenvalues of cluster covariance matrices should be greater than 1!\n") } alphad<-as.numeric(alphad) if(alphad<0) { stop("'alphad' should be positive!\n") } eta<-as.numeric(eta) if(eta<0) { stop("'eta' should be positive!\n") } if(!is.logical(rotateind)) { stop("The value of the rotation indicator 'rotateind' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } if(alpha<=0 || alpha>0.5) { stop("The tuning parameter 'alpha' should be in the range (0, 0.5]!\n") } ITMAX<-as.integer(ITMAX) if(ITMAX<=0 || !is.integer(ITMAX)) { stop("The maximum iteration number allowed 'ITMAX' should be a positive integer!\n") } if(eps<=0 || eps > 0.01) { stop("The convergence threshold 'eps' should be between (0, 0.01]!\n") } if(!is.logical(quiet)) { stop("The value of the quiet indicator 'quiet' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } if(!is.logical(outputDatFlag)) { stop("The value of the indicator 'outputDatFlag' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } if(!is.logical(outputLogFlag)) { stop("The value of the indicator 'outputLogFlag' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } if(!is.logical(outputEmpirical)) { stop("The value of the indicator 'outputEmpirical' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } if(!is.logical(outputInfo)) { stop("The value of the indicator 'outputInfo' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } # end of checks of valid inputs, loop begins loop<-0 cat("Generating data sets. Please wait ...\n") for(j in 1:length(sepVal)) { for(g in numClust) { for(p1 in numNonNoisy) { if(is.null(numNoisy)) { numNoisy<-c(1, round(p1/2), p1) } for(p2 in numNoisy) { p<-p1+p2 # total number of variables loop<-loop+1 tmpfileName<-paste(fileName,"J", sepLabels[j],"G",g, "v", p1, "nv",p2, "out", numOutlier, sep="") res<-genRandomClust(numClust=g, numNonNoisy=p1, sepVal=sepVal[j], numNoisy=p2, numReplicate=numReplicate, numOutlier=numOutlier, fileName=tmpfileName, clustszind=clustszind, clustSizeEq=clustSizeEq, rangeN=rangeN, clustSizes=clustSizes, covMethod=covMethod, rangeVar=rangeVar, lambdaLow=lambdaLow, ratioLambda=ratioLambda, rotateind=rotateind, iniProjDirMethod=iniProjDirMethod, projDirMethod=projDirMethod, alpha=alpha, ITMAX=ITMAX, eps=eps, quiet=quiet, outputDatFlag=outputDatFlag, outputLogFlag=outputLogFlag, outputEmpirical=outputEmpirical, outputInfo=FALSE) if(loop>1) { infoFrameTheory<-rbind(infoFrameTheory, res$infoFrameTheory) if(outputEmpirical) { infoFrameData<-rbind(infoFrameData, res$infoFrameData) } else { infoFrameData<-NULL } datList[[loop]]<-res$datList memList[[loop]]<-res$memList noisyList[[loop]]<-res$noisyList } else { infoFrameTheory<-res$infoFrameTheory if(outputEmpirical) { infoFrameData<-res$infoFrameData } else { infoFrameData<-NULL } datList<-list(res$datList) memList<-list(res$memList) noisyList<-list(res$noisyList) } } } } } infoFrameTheory<-data.frame(infoFrameTheory) nr<-nrow(infoFrameTheory) rownames(infoFrameTheory)<-1:nr if(outputEmpirical) { infoFrameData<-data.frame(infoFrameData) nr<-nrow(infoFrameData) rownames(infoFrameData)<-1:nr } else { infoFrameData<-NULL } names(datList)<-1:loop names(memList)<-1:loop names(noisyList)<-1:loop if(outputInfo) { fileNameInfo<-paste(fileName, "_info.log", sep="") msg<-"Theoretical separation information data frame>>>>>>>>>>>\n" write.table(msg, append=FALSE, file=fileNameInfo, quote=FALSE, row.names=FALSE, col.names=FALSE) msg<-colnames(infoFrameTheory) write(msg, append=TRUE, file=fileNameInfo, ncolumns=7, sep=" ") tt<-infoFrameTheory tt[,1:6]<-round(tt[,1:6], 3) write.table(tt, file=fileNameInfo, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE, append=TRUE) if(outputEmpirical) { msg<-"\nEmpirical separation information data frame>>>>>>>>>>>\n" write.table(msg, append=TRUE, file=fileNameInfo, quote=FALSE, row.names=FALSE, col.names=FALSE) msg<-colnames(infoFrameData) write(msg, append=TRUE, file=fileNameInfo, ncolumns=7, sep=" ") tt<-infoFrameData tt[,1:6]<-round(tt[,1:6], 3) write.table(tt, file=fileNameInfo, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE, append=TRUE) } } cat("The process is completed successfully!\n") invisible(list(infoFrameTheory=infoFrameTheory, infoFrameData=infoFrameData, datList=datList, memList=memList, noisyList=noisyList)) } clusterGeneration/R/separation.R0000644000176000001440000010401312470014257016471 0ustar ripleyusers# # Get initial projection direction # if iniProjDirMethod="SL", iniProjDir=(Sigma1+Sigma2)^{-1}(mu_2-mu_1) # else if iniProjDirMethod="naive", iniProjDir=(mu_2-mu_1) # otherwise iniProjDir is randomly generated such that iniProjDir^%(mu2-mu1)>0 # # mu1, Sigma1 -- mean vector and covariance matrix for cluster 1 # mu2, Sigma2 -- mean vector and covariance matrix for cluster 2 # iniProjDirMethod -- projDirMethod used to construct initial projection direction # projDirMethod="SL" => iniProjDir<-(Sigma1+Sigma2)^{-1}(mu2-mu1) # projDirMethod="naive" => iniProjDir<-(mu2-mu1) # eps -- a small positive number used to check if a quantity is equal to zero. # 'eps' is mainly used to check if an iteration algorithm converges. getIniProjDirTheory<-function(mu1, Sigma1, mu2, Sigma2, iniProjDirMethod=c("SL", "naive"), eps=1.0e-10, quiet=TRUE) { iniProjDirMethod<-match.arg(arg=iniProjDirMethod, choices=c("SL", "naive")) if(iniProjDirMethod=="SL") { # Su and Liu (1993) projection direction, JASA 1993, vol88 1350-1355 tmp<-Sigma1+Sigma2 if(abs(det(tmp))eps) { iniProjDir<-iniProjDir/tmp } else { if(!quiet) { cat("Warning: iniProjDir=0!\n") } } return(iniProjDir) } # Projection direction via iteration formula derived by # iteratively solving the equation d J(projDir) / d projDir = 0 # projDir -- projection direction at the t-th step # output projection direction at the (t+1)-th step # See documentation of getIniProjDirTheory for explanation of arguments: # mu1, Sigma1, mu2, Sigma2, eps getProjDirIter<-function(projDir, mu1, Sigma1, mu2, Sigma2, eps=1.0e-10, quiet=TRUE) { tmp1<-as.numeric((abs(t(projDir)%*%Sigma1%*%projDir))) tmp2<-as.numeric((abs(t(projDir)%*%Sigma2%*%projDir))) b1ProjDir<-sqrt(tmp1) b2ProjDir<-sqrt(tmp2) diff<-mu2-mu1 if(abs(tmp1)eps) # two clusters are totally separated { projDir2[pos]<-1 projDir2<-sign(diff[pos])*projDir2 } res<-list(stop=TRUE, projDir=projDir2) return(res) } if(abs(b1ProjDir)ITMAX) { if(!quiet) { cat("Warning: Iterations did not converge!\n") } break } projDirOld<-projDir } if(!quiet) { cat("number of iterations=", loop, " diff=", diff, "\n") cat("standardized iniProjDir>>\n"); print(iniProjDir); cat("\n"); cat("standardized projDir>>\n"); print(projDir); cat("\n"); } return(projDir) } # Separation index given a projection direction 'projDir' # alpha -- tuning parameter for separation index to indicating the percentage # of data points to downweight. We set 'alpha=0.05' like we set # the significance level in hypothesis testing as 0.05. # See documentation of getIniProjDirTheory for explanation of arguments: # mu1, Sigma1, mu2, Sigma2, eps sepIndexTheory<-function(projDir, mu1, Sigma1, mu2, Sigma2, alpha=0.05, eps=1.0e-10, quiet=TRUE) { if(as.vector(crossprod(projDir, mu2-mu1))<0) { if(!quiet) { cat("Warning: 'projDir*(mu2-mu1)<0'! '-projDir' will be used!\n") } projDir<- - projDir } # standardize projDir denom<-sqrt(sum(projDir^2)) if(abs(denom)mu1 before using this function # mu1, tau1 -- mle of the mean and standard devitation of cluster 1 # mu2, tau2 -- mle of mean and standard devitation of cluster 2 # alpha -- tuning parameter sepIndex<-function(mu1, tau1, mu2, tau2, alpha=0.05, eps=1.0e-10) { Za<-qnorm(1-alpha/2) L1<-mu1-Za*tau1; U1<-mu1+Za*tau1; L2<-mu2-Za*tau2; U2<-mu2+Za*tau2; denom<-U2-L1 if(abs(denom)0.5) { stop("The tuning parameter 'alpha' should be in the range (0, 0.5]!\n") } if(eps<=0 || eps > 0.01) { stop("The convergence threshold 'eps' should be in (0, 0.01]!\n") } if(is.vector(y1)) { len<-length(y1) mu1<-y1 Sigma1<-matrix(0, nrow=len, ncol=len) } else { mu1<-apply(y1, 2, mean, na.rm=TRUE) Sigma1<-cov(y1) } if(is.vector(y2)) { len<-length(y2) mu2<-y2 Sigma2<-matrix(0, nrow=len, ncol=len) } else { mu2<-apply(y2, 2, mean, na.rm=TRUE) Sigma2<-cov(y2) } res<-sepIndexTheory(projDir, mu1, Sigma1, mu2, Sigma2, alpha, eps, quiet) return(res) } # Use multiple initial projection directions to get multiple (local) optimal # projection directions and corresponding separation indices. # We also calculate separation indices for these initial projection directions # directly. # Then we choose the maximum separation index and its corresponding projection # direction as output. This function is used to handle the case where # one of or both of covariance matrices are singular # We consider 2+2*p initial projection directions. The first two projection # directions are 'SL' direction (Sigma1+Sigma2)^{-1}(Mu2-Mu1) and # 'naive' direction (Mu2-Mu1), respectively. # The remaining projection directions are the eigenvectors of Sigma1 and Sigma2 # If iniProjDir^T(Mu2-Mu1)<0, iniProjDir = - iniProjDir # Projection direction will be standardized so that it's length is equal to 1. # # ITMAX -- maximum iteration allowed # quiet -- a flag to switch on/off the outputs of intermediate results. # The default value is 'TRUE'. # See documentation of getIniProjDirTheory and sepIndexTheory for explanation # of arguments: # mu1, Sigma1, mu2, Sigma2, alpha, eps optimProjDirIterMulti<-function(mu1, Sigma1, mu2, Sigma2, alpha=0.05, ITMAX=20, eps=1.0e-10, quiet=TRUE) { p<-length(mu1) num<-2+2*p projDirMat<-matrix(0, nrow=num, ncol=p) # get initial projection directions # Su and Liu (1993)'s direction projDirMat[1,]<-getIniProjDirTheory(mu1, Sigma1, mu2, Sigma2, "SL", eps, quiet) # naive direction projDirMat[2,]<-getIniProjDirTheory(mu1, Sigma1, mu2, Sigma2, "naive", eps, quiet) start<-2+1 end<-2+p # eigenvectors of Sigma1 projDirMat[start:end,]<-t(eigen(Sigma1)$vectors) start<-end+1 end<-end+p # eigenvectors of Sigma2 projDirMat[start:end,]<-t(eigen(Sigma2)$vectors) # for each initial direction, we get separation index pos<-0 maxSep<- -2 for(i in 1:num) { tmpProjDir<-projDirMat[i,] if(sum(tmpProjDir*(mu2-mu1))<0) { tmpProjDir<- -tmpProjDir } sepVal1<-sepIndexTheory(tmpProjDir, mu1, Sigma1, mu2, Sigma2, alpha, eps, quiet) tmpProjDir2<-optimProjDirIter(tmpProjDir, mu1, Sigma1, mu2, Sigma2, ITMAX, eps, quiet) sepVal2<-sepIndexTheory(tmpProjDir2, mu1, Sigma1, mu2, Sigma2, alpha, eps, quiet) if(sepVal1>maxSep) { maxSep<-sepVal1 # record maximum separation index projDirOpt<-tmpProjDir } if(sepVal2>maxSep) { maxSep<-sepVal2 # record maximum separation index projDirOpt<-tmpProjDir2 } } return(list(projDir=projDirOpt, sepVal=maxSep)) } ################################################################# # Finding optimal projection direction # created by Weiliang Qiu, Jan. 23, 2005 # Obtain optimal projection direction for two sets of data points # y1 -- n1 x p matrix for cluster 1 # y2 -- n2 x p matrix for cluster 2 # iniProjDirMethod -- method used to construct initial projection direction # iniProjDirMethod="SL" => iniProjDir<-(Sigma1+Sigma2)^{-1}(mu2-mu1) # iniProjDirMethod="naive" => iniProjDir<-(mu2-mu1) # projDirMethod -- takes values "fixedpoint" or "newton" # "fixedpoint" means that we get optimal projection direction # by solving the equation d J(projDir) / d projDir = 0, where # J(projDir) is the separation index with projection direction 'projDir' # "newton" method means that we get optimal projection driection # by the method proposed in the appendix of Qiu and Joe (2006) # "Generation of random clusters with specified degree of separation", # Journal of Classification Vol 23(2), 315--334 # ITMAX -- maximum iteration allowed # eps -- threshold for convergence criterion. If |new-old|0.5) { stop("The tuning parameter 'alpha' should be in the range (0, 0.5]!\n") } if(eps<=0 || eps > 0.01) { stop("The convergence threshold 'eps' should be in (0, 0.01]!\n") } if(!is.logical(quiet)) { stop("The value of the quiet indicator 'quiet' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } if(is.vector(y1)) { len<-length(y1) mu1<-y1 Sigma1<-matrix(0, nrow=len, ncol=len) } else { mu1<-apply(y1, 2, mean, na.rm=TRUE) Sigma1<-cov(y1) } if(is.vector(y2)) { len<-length(y2) mu2<-y2 Sigma2<-matrix(0, nrow=len, ncol=len) } else { mu2<-apply(y2, 2, mean, na.rm=TRUE) Sigma2<-cov(y2) } # get initial projection direction 'a' so that 'a^T*a=1' iniProjDir<-getIniProjDirTheory(mu1, Sigma1, mu2, Sigma2, iniProjDirMethod, eps, quiet) # get optimal projection direction res<-projDirTheory(iniProjDir=iniProjDir, mu1=mu1, Sigma1=Sigma1, mu2=mu2, Sigma2=Sigma2, projDirMethod=projDirMethod, alpha=alpha, ITMAX=ITMAX, eps=eps, quiet=quiet) return(res) } # Obtain optimal projection direction for two sets of data points # iniProjDir -- initial projection direction such that 't(iniProjDir)*(mu2-mu1)=1' # mu1 -- mean vector of group 1 # Sigma1 -- covariance matrix of group 1 # mu2 -- mean vector of group 2 # Sigma2 -- covariance matrix of group 2 # projDirMethod -- takes values "fixedpoint" or "newton" # "fixedpoint" means that we get optimal projection direction # by solving the equation d J(projDir) / d projDir = 0, where # J(projDir) is the separation index with projection direction 'projDir' # "newton" method means that we get optimal projection driection # by the method proposed in the appendix of Qiu and Joe (2006) # "Generation of random clusters with specified degree of separation", # Journal of Classification Vol 23(2), 315--334 # ITMAX -- maximum iteration allowed # eps -- threshold for convergence criterion. If |new-old|>\n"); print(anorm); } return(list(projDir=anorm, sepVal=sepVal)) } # iniProjDir proportional to (mu2-mu1) # or iniProjDir proportional to LDA direction # or randomly generate an initial projection direction # y1 -- data for cluster 1 # y2 -- data for cluster 2 # iniProjDirMethod -- method used to construct initial projection direction # iniProjDirMethod="SL" => iniProjDir<-(Sigma1+Sigma2)^{-1}(mu2-mu1) # iniProjDirMethod="naive" => iniProjDir<-(mu2-mu1) # eps -- threshold for convergence criterion. getIniProjDirData<-function(y1, y2, iniProjDirMethod=c("SL", "naive"), eps=1.0e-10, quiet=TRUE) { iniProjDirMethod<-match.arg(arg=iniProjDirMethod, choices=c("SL", "naive")) if(!(is.numeric(y1) || is.matrix(y1))) { stop("The argument 'y1' should be a numeric vector or matrix!\n") } if(!(is.numeric(y2) || is.matrix(y2))) { stop("The argument 'y2' should be a numeric vector or matrix!\n") } if(eps<=0 || eps > 0.01) { stop("The convergence threshold 'eps' should be in (0, 0.01]!\n") } if(is.vector(y1)) { len<-length(y1) mu1<-y1 Sigma1<-matrix(0, nrow=len, ncol=len) } else { mu1<-apply(y1, 2, mean, na.rm=TRUE) Sigma1<-cov(y1) } if(is.vector(y2)) { len<-length(y2) mu2<-y2 Sigma2<-matrix(0, nrow=len, ncol=len) } else { mu2<-apply(y2, 2, mean, na.rm=TRUE) Sigma2<-cov(y2) } iniProjDir<-getIniProjDirTheory(mu1, Sigma1, mu2, Sigma2, iniProjDirMethod, eps, quiet) return(iniProjDir) } # find matrix 'Q1' such that 't(Q1)*Sigma1*Q1=I_p' # require that Sigma1 is positive definite. # # What if Sigma1 is semi-positive definite, not positive definite? # i.e., what if there are zero-value eigenvalues of Sigma1? Q1Fun<-function(Sigma1) { p<-nrow(Sigma1) # obtain eigenvalues and eigenvectors of 'Sigma1' eg<-eigen(Sigma1) eu<-eg$values # eigenvalues 'lambda_i, i=1,...,p' ieu2<-matrix(0,nrow=p, ncol=p) # diagonal elements are equal to '1/sqrt(lambda_i)' diag(ieu2)<-1.0/sqrt(eu) # columns of 'et' correspond to eigenvectors of 'Sigma1' et<-eg$vectors # 'Sigma1=et*diag(eu)*t(et)' # 'Q1=et*diag(eu)^{-1/2}*t(et)' Q1<-et%*%ieu2%*%t(et) return(Q1) } # find matrix 'Q2' and the scalar 'c1' such that # 't(Q2)*t(Q1)*(mu2-mu1)=c1*e1', # where 'e1'=c(1,0,...,0)' # # Q1 -- matrix such that 't(Q1)*Sigma1*Q1=I_p', # where 'Sigma1' is the covariance matrix for group 1 # mu1 -- mean vector for group 1 # mu2 -- mean vector for group 2 Q2c1Fun<-function(Q1, mu1, mu2) { theta<-as.vector(t(Q1)%*%(mu2-mu1)) # Q2=(q_{21}, q_{22}, ..., q_{2p}) # t(q_{21})*theta=c1 # t(q_{2i})*theta=0, i=2,\ldots, p Q2<-MOrthogonal(theta) # Q2 is an orthogonal matrix c1<-as.vector(Q2[,1]%*%theta) return(list(Q2=Q2, c1=c1)) } # calculate the matrix 'V=t(Q2)*t(Q1)*Sigma2*Q1*Q2' VFun<-function(Q1, Q2, Sigma2) { V<-t(Q2)%*%t(Q1)%*%Sigma2%*%Q1%*%Q2 return(V) } # quadratic form 't(y)*A*y' quadraticFun<-function(y, A) { y<-as.vector(y) A<-as.matrix(A) res<-t(y)%*%A%*%y res<-as.vector(res) return(res) } # the function g1() = t(y)*y+1 # y -- a (p-1) by 1 vector g1Fun<-function(y) { res<-crossprod(y)+1 return(as.vector(res)) } # the function g2() = (y+V22^{-1}v21)^T*V22*(y+V22^{-1}v21)+c2 # where c2=v11-v21^T*V22^{-1}*v21. # By simplification, we can get # g2() = y^T*V22*y+2*y^T*v21+v11 g2Fun<-function(y, V) { tmp<-V2Fun(V) V22<-tmp$V22 v21<-tmp$v21 v11<-V[1,1] # part1 = y^T*V22*y part1<-quadraticFun(y, V22) # part2 = 2*y^T*v21 part2<-2*as.vector(crossprod(y, v21)) res<-part1+part2+v11 return(as.vector(res)) } # the function g() = sqrt(g1(y))+sqrt(g2(y)) gFun<-function(y, V) { g1<-g1Fun(y) g2<-g2Fun(y, V) g<-sqrt(g1)+sqrt(g2) return(g) } # the 1st derivative of g() = y/sqrt(g1(y))+(V22*y+v21)/sqrt(g2(y)) d1gFun<-function(y, V) { g1<-g1Fun(y) g2<-g2Fun(y, V) tmp<-V2Fun(V) V22<-tmp$V22 v21<-tmp$v21 y<-as.vector(y) part1<-as.vector(y/sqrt(g1)) part2<-(V22%*%y+v21)/sqrt(as.vector(g2)) part2<-as.vector(part2) res<-part1+part2 return(res) } # the 2nd derivative of g() = I/sqrt(g1(y))-y*y^T/[g1(y)*sqrt(g1(y))] # + V22/sqrt(g2(y))-(V22*y+v21)*(V22*y+v21)^T/[g2(y)*sqrt(g2(y))] d2gFun<-function(y, V) { tmp<-V2Fun(V) V22<-tmp$V22 v21<-tmp$v21 g1<-as.vector(g1Fun(y)) g2<-as.vector(g2Fun(y, V)) p<-nrow(V) myI<-diag(p-1) part1<-(g1*myI-y%*%t(y))/as.vector(g1^(3/2)) tmp<-V22%*%y+v21 tmp<-tmp%*%t(tmp) part2<-(g2*V22-tmp)/as.vector(g2^(3/2)) res<-part1+part2 return(res) } # obtain V_{22} and v_{21} V2Fun<-function(V) { V22<-V[-1,-1] v21<-as.vector(V[,1]) v21<-v21[-1] return(list(V22=V22, v21=v21)) } # Solve projection direction via modified Newton-Raphson # Convergence to eps=1.e-10 is often in 3 iterations # yini -- initial point for iterations # V -- 'V=t(Q2)*t(Q1)*Sigma2*Q1*Q2' # ITMAX -- the maximum iteration allowed # eps -- convergence tolerance # quiet -- a flag to switch on/off the outputs of intermediate results newtonRaphson<-function(yini, V, ITMAX=20, eps=1.0e-10, quiet=TRUE) { code<-0 loop<-0 y<-yini while(1) { loop<-loop+1 if(loop>ITMAX) { code<-1 break } d1g<-d1gFun(y, V) d2g<-d2gFun(y, V) tem<-solve(d2g,d1g) tem<-as.vector(tem) ynew <- (y-tem) diff<-as.vector(sqrt(crossprod(tem))) if(diff0.5) { tem<-tem/2; y<-y+tem; diff<-diff/2 } } if(code==1) { cat("warning! newtonRaphson() did not converge!\n") cat("loop=", loop, " ITMAX=", ITMAX, " diff=", diff, " eps=", eps, " code=", code, "\n") } if(!quiet) { cat("loop=", loop, " ITMAX=", ITMAX, " diff=", diff, " eps=", eps, " code=", code, "\n") } return(list(y=y, code=code)) } # Calculate the theoretical separation index and projection direction # muMat -- cluster center matrix. muMat[i,] is the cluster center # for the i-th cluster # SigmaArray -- array of covariance matrices. SigmaArray[,,i] is the # covariance matrix for the i-th cluster # iniProjDirMethod -- indicating the method to get initial projection direction # By default, the sample version of the Su and Liu (SL) projection # direction ((n1-1)*S1+(n2-1)*S2)^{-1}(mu2-mu1) is used, # where mui and Si are the mean vector and covariance # matrix of cluster i. Alternatively, the naive projection # direction (mu2-mu1) is used. # Su and Liu (1993) JASA 1993, vol. 88 1350-1355. # projDirMethod -- takes values "fixedpoint" or "newton" # "fixedpoint" means that we get optimal projection direction # by solving the equation d J(progDir) / d progDir = 0, where # J(progDir) is the separation index with projection direction 'progDir' # "newton" method means that we get optimal projection driection # by the method proposed in the appendix of Qiu and Joe (2006) # "Generation of random clusters with specified degree of separation", # Journal of Classification Vol 23(2) 315--334 # alpha -- tuning parameter for separation index to indicating the percentage # of data points to downweight. We set 'alpha=0.05' like we set # the significance level in hypothesis testing as 0.05. # ITMAX -- when calculating the projection direction, we need iterations. # ITMAX gives the maximum iteration allowed. # The actual #iterations is usually much less than the default value 20. # eps -- A small positive number to check if a quantitiy \eqn{q} is # equal to zero. If \eqn{|q|<}\code{eps}, then we regard \eqn{q} # as equal to zero. \code{eps} is used to check if an algorithm # converges. The default value is \eqn{1.0e-10}. # quiet -- a flag to switch on/off the outputs of intermediate results. # The default value is 'TRUE'. getSepProjTheory<-function(muMat, SigmaArray, iniProjDirMethod=c("SL", "naive"), projDirMethod=c("newton", "fixedpoint"), alpha=0.05, ITMAX=20, eps=1.0e-10, quiet=TRUE) { iniProjDirMethod<-match.arg(arg=iniProjDirMethod, choices=c("SL", "naive")) projDirMethod<-match.arg(arg=projDirMethod, choices=c("newton", "fixedpoint")) if(alpha<=0 || alpha>0.5) { stop("The tuning parameter 'alpha' should be in the range (0, 0.5]!\n") } ITMAX<-as.integer(ITMAX) if(ITMAX<=0 || !is.integer(ITMAX)) { stop("The maximum iteration number allowed 'ITMAX' should be a positive integer!\n") } if(!is.logical(quiet)) { stop("The value of the quiet indicator 'quiet' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } if(!quiet) { cat(" *** Step 2.10.1: Calculate the theoretical separation index matrix and projection directions.***\n") } G<-nrow(muMat) # number of clusters p<-ncol(muMat) # number of dimensions projDirArray<-array(0, c(G,G,p)) dimnames(projDirArray)<-list(NULL, NULL, paste("variable", 1:p, sep="")) sepValMat<-matrix(-1, nrow=G, ncol=G) rownames(sepValMat)<-paste("cluster", 1:G, sep="") colnames(sepValMat)<-paste("cluster", 1:G, sep="") for(i in 2:G) { mui<-muMat[i,] si<-SigmaArray[,,i] for(j in 1:(i-1)) { muj<-muMat[j,] sj<-SigmaArray[,,j] #**** begin 1 WQ 09/22/2007 # in case some variances are equal to zero dsi<-as.numeric(diag(si)) dsj<-as.numeric(diag(sj)) tmppos<-which(abs(dsi-dsj)0) { # if difference of means is also equal to zero, then this dimension is noisy # we should set the corresonding elements in the projection direction as zero myset<-1:p # variables have zero variances in both clusters myset2<-myset[tmppos] diffmu<-abs(mui-muj) diffmu2<-diffmu[myset2] tmppos2<-which(diffmu20 && tmplen==tmpn) { if(tmpn==p) { # all varialbes are noisy, i.e., the points in the two clusters have # the same coordinates. # then any projection direction will produce the same results. # So we set the optimal direction as (1,0,0,0...0) a<-rep(0, p) a[1]<-1 tmpaSepVal<- -1 } else if(tmpn==(p-1)) { # only one variable is non-noisy myset3<-myset[-tmppos] a<-rep(0, p) a[myset3]<-1 nui<-mui[myset3] nuj<-muj[myset3] taui<-si[myset3, myset3] tauj<-sj[myset3, myset3] tmpaSepVal<-sepIndex(nui, taui, nuj, tauj, alpha, eps) } else { # we get projection direction for other dimensions myset3<-myset[-tmppos] nui<-mui[myset3] nuj<-muj[myset3] taui<-si[myset3, myset3] tauj<-sj[myset3, myset3] # get the initial projection direction iniProjDir<-getIniProjDirTheory(nui, taui, nuj, tauj, iniProjDirMethod, eps, quiet) # get the projection direction tmpa<-projDirTheory(iniProjDir, nui, taui, nuj, tauj, projDirMethod, alpha, ITMAX, eps, quiet) a2<-tmpa$projDir tmpaSepVal<-tmpa$sepVal a<-rep(0, p) a[myset3]<-a2 } } else { # some variables are non-noisy, some variable are noisy # we choose a dimension has the maximum separation tmppos3<-which(diffmu2==max(diffmu2, na.rm=TRUE))[1] mydim<-myset2[tmppos3] a<-rep(0, tmpn) a[mydim]<-1 tmpaSepVal<-1 } } else { # In all variables, variance of two clusters are not all equal to zero # get the initial projection direction iniProjDir<-getIniProjDirTheory(mui, si, muj, sj, iniProjDirMethod, eps, quiet) # get the projection direction tmpa<-projDirTheory(iniProjDir, mui, si, muj, sj, projDirMethod, alpha, ITMAX, eps, quiet) a<-tmpa$projDir tmpaSepVal<-tmpa$sepVal } projDirArray[i,j,]<-a projDirArray[j,i,]<- -a # get the separation index sepValMat[i,j]<-tmpaSepVal sepValMat[j,i]<-tmpaSepVal #**** end 1 WQ 09/22/2007 } } return(list(sepValMat=sepValMat, projDirArray=projDirArray)) } # get empirical separation indices and projection directions # y -- the Nxp data matrix # cl -- the memberships of data points # iniProjDirMethod -- indicating the method to get initial projection direction # By default, the sample version of the Su and Liu (SL) projection # direction ((n1-1)*S1+(n2-1)*S2)^{-1}(mu2-mu1) is used, # where mui and Si are the mean vector and covariance # matrix of cluster i. Alternatively, the naive projection # direction (mu2-mu1) is used. # Su and Liu (1993) JASA 1993, vol. 88 1350-1355. # projDirMethod -- takes values "fixedpoint" or "newton" # "fixedpoint" means that we get optimal projection direction # by solving the equation d J(progDir) / d progDir = 0, where # J(progDir) is the separation index with projection direction 'progDir' # "newton" method means that we get optimal projection driection # by the method proposed in the appendix of Qiu and Joe (2006) # "Generation of random clusters with specified degree of separation", # Journal of Classification Vol 23(2) 315--334 # alpha -- tuning parameter for separation index to indicating the percentage # of data points to downweight. We set 'alpha=0.05' like we set # the significance level in hypothesis testing as 0.05. # ITMAX -- when calculating the projection direction, we need iterations. # ITMAX gives the maximum iteration allowed. # The actual #iterations is usually much less than the default value 20. # eps -- convergence tolerance # quiet -- a flag to switch on/off the outputs of intermediate results. # The default value is 'TRUE'. getSepProjData<-function(y, cl, iniProjDirMethod=c("SL", "naive"), projDirMethod=c("newton", "fixedpoint"), alpha=0.05, ITMAX=20, eps=1.0e-10, quiet=TRUE) { # QWL: should handle univariate case in the next version! if(!(is.matrix(y) || is.data.frame(y))) { stop("Error! The argument 'y' should be a matrix or data frame with row number (observation number) greater than 1!\n") } if(nrow(y)!=length(cl)) { stop("The row number of 'y' does not match the length of 'cl'!\n") } p<-ncol(y) if(alpha<=0 || alpha>0.5) { stop("The tuning parameter 'alpha' should be in the range (0, 0.5]!\n") } iniProjDirMethod<-match.arg(arg=iniProjDirMethod, choices=c("SL", "naive")) projDirMethod<-match.arg(arg=projDirMethod, choices=c("newton", "fixedpoint")) ITMAX<-as.integer(ITMAX) if(ITMAX<=0 || !is.integer(ITMAX)) { stop("The maximum iteration number allowed 'ITMAX' should be a positive integer!\n") } if(eps<=0 || eps > 0.01) { stop("The convergence threshold 'eps' should be in (0, 0.01]!\n") } if(!is.logical(quiet)) { stop("The value of the quiet indicator 'quiet' should be logical, i.e., either 'TRUE' or 'FALSE'!\n") } if(!quiet) { cat(" *** Step 2.10.2: Calculate the empirical separation index matrix and projection directions.***\n") } u.cl<-sort(unique(cl)) G<-length(u.cl) if(G<2) { stop("Error! There is only one cluster in the data set!\n") } # calculate the matrix of mean vectors and the array of covariance matrices muMat<-matrix(0, nrow=G, ncol=p) SigmaArray<-array(0, c(p,p,G)) dimnames(SigmaArray)<-list(NULL, NULL, paste("cluster", 1:G, sep="")) for(i in 1:G) { yi<-y[which(cl==u.cl[i]),,drop=FALSE] muMat[i,]<-apply(yi, 2, mean, na.rm=TRUE) if(nrow(yi)>1) { SigmaArray[,,i]<-cov(yi) } else { SigmaArray[,,i]<-matrix(0, nrow=p, ncol=p) } } res<-getSepProjTheory(muMat, SigmaArray, iniProjDirMethod, projDirMethod, alpha, ITMAX, eps, quiet) return(res) } clusterGeneration/R/rcvine.R0000644000176000001440000000410110700717035015605 0ustar ripleyusers# General Reference: # Joe H (2006). Generating random correlation matrices based on partial # correlations. J. Mult. Anal. Vol. 97, 2177--2198. # Random correlation with the C-vine (different order of partial # correlations). Reference for vines: Kurowicka and Cooke, 2006, # Uncertainty Analysis with High Dimensional Dependence Modelling, # Wiley, 2006. # qq plot against beta(alp,alp) distribution qqbeta=function(x,alp) { xs=sort(x) n=length(x) pp=(1:n)/(n+1) qq=qbeta(pp,alp,alp) plot(qq,xs,ylab="corr",xlab="beta quantile") title(paste("Beta quantile plot with a=b=",alp)) 0 } # input d>=2, eta>0 (eta=1 for uniform) # output correlation matrix rr[][], density proportional to # det(R)^{eta-1} rcorcvine<-function(d,eta=1) { d<-as.integer(d) if(d<=0 || !is.integer(d)) { stop("The dimension 'd' should be a positive integer!\n") } if(eta<=0) { stop("'eta' should be positive!\n") } #handling of d=1 and d=2 if(d==1) { rr<-matrix(1,1,1); return(rr) } if(d==2) { rho<-2*rbeta(1,eta,eta)-1 rr<-matrix(c(1,rho,rho,1),2,2); return(rr) } rr<-matrix(0,d,d) # matrix of partial correlations as generated prr<-matrix(0,d,d) diag(rr)<-1 for(i in 2:d) { alp<-eta+(d-2)/2 rr[1,i]<-2*rbeta(1,alp,alp)-1 rr[i,1]<-rr[1,i] prr[1,i]<-rr[1,i] } for(m in 2:(d-1)) { alp<-eta+(d-1-m)/2 for(i in (m+1):d) { prr[m,i]<-2*rbeta(1,alp,alp)-1 # back calculate thru lower order partials tem<-prr[m,i] for(k in (m-1):1) { tem<-prr[k,m]*prr[k,i]+tem*sqrt((1-prr[k,m]^2)*(1-prr[k,i]^2)) } rr[m,i]<-tem rr[i,m]<-rr[m,i] } } return(rr) } #set.seed(1234) #nsim=5000 ##nsim=200 #d=5 #eta=d/2 ##eta=2.2 ##eta=3.5 #out5=matrix(0,nsim,d^2) #for(i in 1:nsim) #{ out5[i,]=c(rcorcvine(d,eta)) } ##d=5, OK #par(mfrow=c(3,3)) #qqbeta(out5[,2],eta) #qqbeta(out5[,3],eta) #qqbeta(out5[,4],eta) #qqbeta(out5[,5],eta) #qqbeta(out5[,8],eta) #qqbeta(out5[,9],eta) #qqbeta(out5[,10],eta) #qqbeta(out5[,14],eta) #qqbeta(out5[,15],eta) # clusterGeneration/R/ronion.R0000644000176000001440000000364610700717035015640 0ustar ripleyusers# extension of onion method, using Cholesky instead of msqrt #Ghosh, S., Henderson, S. G. (2003). Behavior of the NORTA method for #correlated random vector generation as the dimension increases. #ACM Transactions on Modeling and Computer Simulation (TOMACS) #v 13 (3), 276-294. # qq plot against beta(alp,alp) distribution qqbeta=function(x,alp) { xs=sort(x) n=length(x) pp=(1:n)/(n+1) qq=qbeta(pp,alp,alp) plot(qq,xs,ylab="corr",xlab="beta quantile") title(paste("Beta quantile plot with a=b=",alp)) 0 } # input d>=2, eta>0 (eta=1 for uniform) # output correlation matrix rr[][], density proportional to # det(R)^{eta-1} rcoronion<-function(d,eta=1) { d<-as.integer(d) if(d<=0 || !is.integer(d)) { stop("The dimension 'd' should be a positive integer!\n") } if(eta<=0) { stop("'eta' should be positive!\n") } #handling of d=1 and d=2 if(d==1) { rr<-matrix(1,1,1); return(rr) } if(d==2) { rho<-2*rbeta(1,eta,eta)-1 rr<-matrix(c(1,rho,rho,1),2,2); return(rr) } rr<-matrix(0,d,d) beta<-eta+(d-2)/2 # step 1 r12<-2*rbeta(1,beta,beta)-1 rr<-matrix(c(1,r12,r12,1),2,2) # iterative steps for(m in 2:(d-1)) { beta<-beta-0.5 y<-rbeta(1,m/2,beta) z<-rnorm(m,0,1) znorm<-sqrt(sum(z^2)) # random on surface of unit sphere z<-z/znorm w=sqrt(y)*z # can spped up by programming incremental Cholesky? rhalf<-chol(rr) qq<-w%*%rhalf rr<-cbind(rr,t(qq)) rr<-rbind(rr,c(qq,1)) } # return rr rr } #set.seed(1234) #nsim=5000 ##nsim=200 #d=5 #eta=d/2 ##eta=2.2 ##eta=3.5 #out5=matrix(0,nsim,d^2) #for(i in 1:nsim) #{ out5[i,]=c(rcoronion(d,eta)) } ##d=5, OK #par(mfrow=c(3,3)) #qqbeta(out5[,2],eta) #qqbeta(out5[,3],eta) #qqbeta(out5[,4],eta) #qqbeta(out5[,5],eta) #qqbeta(out5[,8],eta) #qqbeta(out5[,9],eta) #qqbeta(out5[,10],eta) #qqbeta(out5[,14],eta) #qqbeta(out5[,15],eta) # clusterGeneration/MD50000644000176000001440000000204512471155450014314 0ustar ripleyusers6111d861599016804af2f7a59e1ec9b0 *DESCRIPTION e68616c36fae901fbbf08bf61e39edfe *INDEX d22f0cf868305353c32c788bed2f9690 *NAMESPACE ba17f44e1bb44aff00fff410b1f6a0e6 *NEWS 2a3a63d1df2302bf4e45dd8b50dc92e8 *R/genCluster.R b62e0df5c5b0c10f01b99c4d5b41b5e1 *R/hjrancor.R 124087eaa0f3a417b7fb5e533a5d54a8 *R/rcvine.R bdd1569ce6f6b4a30892d55cd24c39c6 *R/ronion.R 79a682edae05640d478e4aa017615f97 *R/separation.R 019cca6ba356a16034cc99d657cee10a *R/simClustDesign.R d0c487193e72f91e833abc496c1bccfc *R/visual.R 681294ea01f64915bff7dcaa95bb96a5 *man/genPositiveDefMat.Rd 09fba5eb49c111e46b0306453b02a8de *man/genRandomClust.Rd 740741792a3283d9e7651446fb8da1b6 *man/getSepProj.Rd b0d0a324acf28657cdf9b1f7e294659c *man/nearestNeighborSepVal.Rd 91883e571a09c2698a974af69f07c3ba *man/plot1DProjection.Rd 7539341fa2f4a2830b144fda842937de *man/plot2DProjection.Rd 144264bec2faf9cb9ea74814909e6b8c *man/rcorrmatrix.Rd d8659b0b86a03c83dd9b5d719769d5c1 *man/sepIndex.Rd 2313075224988165c2d0ff311561e87b *man/simClustDesign.Rd 101f65dae139d55432752a71567f3203 *man/viewClusters.Rd clusterGeneration/DESCRIPTION0000644000176000001440000000202212471155450015505 0ustar ripleyusersPackage: clusterGeneration Version: 1.3.4 Date: 2015-02-18 Title: Random Cluster Generation (with Specified Degree of Separation) Author: Weiliang Qiu , Harry Joe . Maintainer: Weiliang Qiu Depends: R (>= 2.9.1), MASS Description: We developed the clusterGeneration package to provide functions for generating random clusters, generating random covariance/correlation matrices, calculating a separation index (data and population version) for pairs of clusters or cluster distributions, and 1-D and 2-D projection plots to visualize clusters. The package also contains a function to generate random clusters based on factorial designs with factors such as degree of separation, number of clusters, number of variables, number of noisy variables. License: GPL (>= 2) Packaged: 2015-02-18 16:55:16 UTC; stwxq Repository: CRAN Date/Publication: 2015-02-18 19:34:16 NeedsCompilation: no clusterGeneration/man/0000755000176000001440000000000011142401067014546 5ustar ripleyusersclusterGeneration/man/rcorrmatrix.Rd0000644000176000001440000000234610700717035017422 0ustar ripleyusers\name{rcorrmatrix} \alias{rcorrmatrix} \title{GENERATE A RANDOM CORRELATION MATRIX BASED ON RANDOM PARTIAL CORRELATIONS} \description{ Generate a random correlation matrix based on random partial correlations. } \usage{ rcorrmatrix(d,alphad=1) } \arguments{ \item{d}{ Dimension of the matrix. \code{d} should be a non-negative integer. } \item{alphad}{ \eqn{\alpha} parameter for partial of \eqn{1,d} given \eqn{2,\ldots,d-1}, for generating random correlation matrix based on the method proposed by Joe (2006), where \eqn{d} is the dimension of the correlation matrix. The default value \code{alphad}\eqn{=1} leads to a random matrix which is uniform over space of positive definite correlation matrices. Each correlation has a \eqn{Beta(a,a)} distribution on \eqn{(-1,1)} where \eqn{a=alphad+(d-2)/2}. \code{alphad} should be a positive number. } } \value{ A correlation matrix. } \references{ Joe, H. (2006) Generating Random Correlation Matrices Based on Partial Correlations. \emph{Journal of Multivariate Analysis}, \bold{97}, 2177--2189. } \author{ Weiliang Qiu \email{stwxq@channing.harvard.edu}\cr Harry Joe \email{harry@stat.ubc.ca} } \examples{ rcorrmatrix(3) rcorrmatrix(5) rcorrmatrix(5,alphad=2.5) } \keyword{cluster} clusterGeneration/man/viewClusters.Rd0000644000176000001440000000655310700717035017551 0ustar ripleyusers\name{viewClusters} \alias{viewClusters} \title{PLOT ALL CLUSTERS IN A 2-D PROJECTION SPACE} \description{ Plot all clusters in a 2-D projection space. } \usage{ viewClusters(y, cl, outlierLabel=0, projMethod="Eigen", xlim=NULL, ylim=NULL, xlab="1st projection direction", ylab="2nd projection direction", title="Scatter plot of 2-D Projected Clusters", font=2, font.lab=2, cex=1.2, cex.lab=1.2) } \arguments{ \item{y}{ Data matrix. Rows correspond to observations. Columns correspond to variables. } \item{cl}{ Cluster membership vector. } \item{outlierLabel}{ Label for outliers. Outliers are not involved in calculating the projection directions. Outliers will be represented by red triangles in the plot. By default, \code{outlierLabel=0}. } \item{projMethod}{ Method to construct 2-D projection directions. \code{projMethod="Eigen"} indicates that we project data to the 2-dimensional space spanned by the first two eigenvectors of the between cluster distance matrix \eqn{B={2\over k_0}\sum_{i=1}^{k_0}\Sigma_i+{2\over k_0(k_0-1)}\sum_{i\lambda_p}) for the covariance matrix (\eqn{\boldsymbol{\Sigma}}), then uses columns of a randomly generated orthogonal matrix (\eqn{\boldsymbol{Q}=(\boldsymbol{\alpha}_1,\ldots,\boldsymbol{\alpha}_p)}) as eigenvectors. The covariance matrix \eqn{\boldsymbol{\Sigma}} is then contructed as \eqn{\boldsymbol{Q}*diag(\lambda_1,\dots,\lambda_p)*\boldsymbol{Q}^T}. The second method, denoted as \code{unifcorrmat}, first generates a random correlation matrix (\eqn{\boldsymbol{R}}) via the method proposed in Joe (2006), then randomly generates variances (\eqn{\sigma_1^2,\ldots, \sigma_p^2}) from an interval specified by the argument \code{rangeVar}. The covariance matrix \eqn{\boldsymbol{\Sigma}} is then constructed as \eqn{diag(\sigma_1,\ldots,\sigma_p)*\boldsymbol{R}*diag(\sigma_1,\ldots,\sigma_p)}. For each data set generated, the function \code{simClustDesign} outputs four files: data file, log file, membership file, and noisy set file. All four files have the same format: \cr \file{[fileName]J[j]G[g]v[p1]nv[p2]out[numOutlier]\_[numReplicate].[extension]} \cr where \file{extension} can be \file{dat}, \file{log}, \file{mem}, or \file{noisy}. \sQuote{J} indicates separation index, with \sQuote{j} indicating the level of the factor \sQuote{separation index}; \sQuote{G} indicates number of clusters, with \sQuote{g} indicating the level of the factor \sQuote{number of clusters}; \sQuote{v} indicates the number of non-noisy variables, with \sQuote{p1} indicating the level of the factor \sQuote{number of non-noisy variables}; \sQuote{nv} indicates the number of noisy variables, with \sQuote{p2} indicating the level of the factor \sQuote{number of noisy variables}; \sQuote{out} indicates number of outliers, with \sQuote{numOutlier} indicating the value of the argument \code{numOutlier} of the function \code{simClustDesign}; \sQuote{numReplicate} indicates the value of the argument \code{numReplicate} of the function \code{simClustDesign}. The data file with file extension \file{dat} contains \eqn{n+1} rows and \eqn{p} columns, where \eqn{n} is the number of data points and \eqn{p} is the number of variables. The first row is the variable names. The log file with file extension \file{log} contains information such as cluster sizes, mean vectors, covariance matrices, projection directions, separation index matrices, etc. The membership file with file extension \file{mem} contains \eqn{n} rows and one column of cluster memberships for data points. The noisy set file with file extension \file{noisy} contains a row of labels of noisy variables. When generating clusters, population covariance matrices are all positive-definite. However sample covariance matrices might be semi-positive-definite due to small cluster sizes. In this case, the function \code{genRandomClust} will automatically use the \dQuote{fixedpoint} method to search the optimal projection direction. } \value{ The function outputs four data files for each data set (see details). This function also returns separation information data frames \code{infoFrameTheory} and \code{infoFrameData} based on population and empirical mean vectors and covariance matrices of clusters for all the data sets generated. Both \code{infoFrameTheory} and \code{infoFrameData} contain the following seven columns: \item{Column 1:}{ Labels of clusters (\eqn{1, 2, \ldots, numClust}), where \eqn{numClust} is the number of clusters for the data set. } \item{Column 2:}{ Labels of the corresponding nearest neighbors. } \item{Column 3:}{ Separation indices of the clusters to their nearest neighboring clusters. } \item{Column 4:}{ Labels of the corresponding farthest neighboring clusters. } \item{Column 5:}{ Separation indices of the clusters to their farthest neighbors. } \item{Column 6:}{ Median separation indices of the clusters to their neighbors. } \item{Column 7:}{ Data file names with format \file{[fileName]J[j]G[g]v[p1]nv[p2]out[numOutlier]\_[numReplicate]} (see details). } The function also returns three lists: \code{datList}, \code{memList}, and \code{noisyList}. \item{datList:}{ a list of lists of data matrices for generated data sets. } \item{memList:}{ a list of lists of cluster memberships for data points for generated data sets. } \item{noisyList:}{ a list of lists of sets of noisy variables for generated data sets. } } \references{ Joe, H. (2006) Generating Random Correlation Matrices Based on Partial Correlations. \emph{Journal of Multivariate Analysis}, \bold{97}, 2177--2189. Milligan G. W. (1985) An Algorithm for Generating Artificial Test Clusters. \emph{Psychometrika} \bold{50}, 123--127. Qiu, W.-L. and Joe, H. (2006a) Generation of Random Clusters with Specified Degree of Separaion. \emph{Journal of Classification}, \bold{23}(2), 315-334. Qiu, W.-L. and Joe, H. (2006b) Separation Index and Partial Membership for Clustering. \emph{Computational Statistics and Data Analysis}, \bold{50}, 585--603. Su, J. Q. and Liu, J. S. (1993) Linear Combinations of Multiple Diagnostic Markers. \emph{Journal of the American Statistical Association}, \bold{88}, 1350--1355 } \note{The speed of this function might be slow.} \author{ Weiliang Qiu \email{stwxq@channing.harvard.edu}\cr Harry Joe \email{harry@stat.ubc.ca} } \examples{ \dontrun{tmp<-simClustDesign(numClust=3, sepVal=c(0.01,0.21), sepLabels=c("L","M"), numNonNoisy=4, numOutlier=0, numReplicate=2, clustszind=2)} } \keyword{cluster} clusterGeneration/man/genRandomClust.Rd0000644000176000001440000004300311142376133017767 0ustar ripleyusers\name{genRandomClust} \alias{genRandomClust} \title{RANDOM CLUSTER GENERATION WITH SPECIFIED DEGREE OF SEPARATION} \description{ Generate cluster data sets with specified degree of separation. The separation between any cluster and its nearest neighboring cluster can be set to a specified value. The covariance matrices of clusters can have arbitrary diameters, shapes and orientations. } \usage{ genRandomClust(numClust, sepVal=0.01, numNonNoisy=2, numNoisy=0, numOutlier=0, numReplicate=3, fileName="test", clustszind=2, clustSizeEq=50, rangeN=c(50,200), clustSizes=NULL, covMethod=c("eigen", "onion", "c-vine", "unifcorrmat"), rangeVar=c(1, 10), lambdaLow=1, ratioLambda=10, alphad=1, eta=1, rotateind=TRUE, iniProjDirMethod=c("SL", "naive"), projDirMethod=c("newton", "fixedpoint"), alpha=0.05, ITMAX=20, eps=1.0e-10, quiet=TRUE, outputDatFlag=TRUE, outputLogFlag=TRUE, outputEmpirical=TRUE, outputInfo=TRUE) } \arguments{ \item{numClust}{ Number of clusters in a data set. } \item{sepVal}{ Desired value of the separation index between a cluster and its nearest neighboring cluster. Theoretically, \code{sepVal} can take values within the interval \eqn{[-1, 1)} (In practice, we set \code{sepVal} in \eqn{(-0.999, 0.999)}). The closer to \eqn{1} \code{sepVal} is, the more separated clusters are. The default value is \eqn{0.01} which is the value of the separation index for two univariate clusters generated from \eqn{N(0, 1)} and \eqn{N(0, A)}, respectively, where \eqn{A=4}. \code{sepVal}\eqn{=0.01} indicates a close cluster structure. \code{sepVal}\eqn{=0.21 (A=6)} indicates a separated cluster structure. \code{sepVal}\eqn{=0.34 (A=8)} indicates a well-separated cluster. } \item{numNonNoisy}{ Number of non-noisy variables. } \item{numNoisy}{ Number of noisy variables. The default values of \code{numNoisy} and \code{numOutlier} are \eqn{0} so that we get \dfn{clean} data sets. } \item{numOutlier}{ Number or ratio of outliers. If \code{numOutlier} is a positive integer, then \code{numOutlier} means the number of outliers. If \code{numOutlier} is a real number between \eqn{(0, 1)}, then \code{numOutlier} means the ratio of outliers, i.e. the number of outliers is equal to \code{round}(\code{numOutlier}\eqn{*n_1}), where \eqn{n_1} is the total number of non-outliers. If \code{numOutlier} is a real number greater than \eqn{1}, then \code{numOutlier} to rounded to an integer. The default values of \code{numNoisy} and \code{numOutlier} are \eqn{0} so that we get \sQuote{clean} data sets. } \item{numReplicate}{ Number of data sets to be generated for the same cluster structure specified by the other arguments of the function \code{genRandomClust}. The default value \eqn{3} follows the design in Milligan (1985). } \item{fileName}{ The first part of the names of data files that record the generated data sets and associated information, such as cluster membership of data points, labels of noisy variables, separation index matrix, projection directions, etc. (see details). The default value of \code{fileName} is \file{test}. } \item{clustszind}{ Cluster size indicator. \code{clustszind}\eqn{=1} indicates that all cluster have equal size. The size is specified by the argument \code{clustSizeEq}. \code{clustszind}\eqn{=2} indicates that the cluster sizes are randomly generated from the range specified by the argument \code{rangeN}. \code{clustszind}\eqn{=3} indicates that the cluster sizes are specified via the vector \code{clustSizes}. The default value is \eqn{2} so that the generated clusters are more realistic. } \item{clustSizeEq}{ Cluster size. If the argument \code{clustszind}\eqn{=1}, then all clusters will have the equal number \code{clustSizeEq} of data points. The value of \code{clustSizeEq} should be large enough to get non-singular cluster covariance matrices. We recommend the \code{clustSizeEq} is at least \eqn{10*p}, where \eqn{p} is the total number of variables (including both non-noisy and noisy variables). The default value \eqn{100} is a reasonable cluster size. } \item{rangeN}{ The range of cluster sizes. If \code{clustszind}\eqn{=2}, then cluster sizes will be randomly generated from the range specified by \code{rangeN}. The lower bound of the number of clusters should be large enough to get non-singular cluster covariance matrices. We recommend the minimum cluster size is at least \eqn{10*p}, where \eqn{p} is the total number of variables (including both non-noisy and noisy variables). The default range is \eqn{[50, 200]} which can produce reasonable variability of cluster sizes. } \item{clustSizes}{ The sizes of clusters. If \code{clustszind}\eqn{=3}, then cluster sizes will be specified via the vector \code{clustSizes}. We recommend the minimum cluster size is at least \eqn{10*p}, where \eqn{p} is the total number of variables (including both non-noisy and noisy variables). The user needs to specify the value of \code{clustSizes}. Therefore, we set the default value of \code{clustSizes} as \code{NULL}. } \item{covMethod}{ Method to generate covariance matrices for clusters (see details). The default method is 'eigen' so that the user can directly specify the range of the \dfn{diameters} of clusters. } \item{rangeVar}{ Range for variances of a covariance matrix (see details). The default range is \eqn{[1, 10]} which can generate reasonable variability of variances. } \item{lambdaLow}{ Lower bound of the eigenvalues of cluster covariance matrices. If the argument \dQuote{covMethod="eigen"}, we need to generate eigenvalues for cluster covariance matrices. The eigenvalues are randomly generated from the interval [\code{lambdaLow}, \code{lambdaLow}\eqn{*}\code{ratioLambda}]. In our experience, \code{lambdaLow}\eqn{=1} and \code{ratioLambda}\eqn{=10} can give reasonable variability of the diameters of clusters. \code{lambdaLow} should be positive. } \item{ratioLambda}{ The ratio of the upper bound of the eigenvalues to the lower bound of the eigenvalues of cluster covariance matrices. If the argument \code{covMethod="eigen"}, we need to generate eigenvalues for cluster covariance matrices. The eigenvalues are randomly generated from the interval [\code{lambdaLow}, \code{lambdaLow}\eqn{*}\code{ratioLambda}]. In our experience, \code{lambdaLow}\eqn{=1} and \code{ratioLambda}\eqn{=10} can give reasonable variability of the diameters of clusters. \code{ratioLambda} should be larger than \eqn{1}. } \item{alphad}{parameter for unifcorrmat method to generate random correlation matrix \code{alphad=1} for uniform. \code{alphad} should be positive.} \item{eta}{parameter for \dQuote{c-vine} and \dQuote{onion} methods to generate random correlation matrix \code{eta=1} for uniform. \code{eta} should be positive.} \item{rotateind}{ Rotation indicator. \code{rotateind=TRUE} indicates randomly rotating data in non-noisy dimensions so that we may not detect the full cluster structure from pair-wise scatter plots of the variables. } \item{iniProjDirMethod}{ Indicating the method to get initial projection direction when calculating the separation index between a pair of clusters (c.f. Qiu and Joe, 2006a, 2006b). \cr \code{iniProjDirMethod}=\dQuote{SL}, the default, indicates the initial projection direction is the sample version of the SL's projection direction (Su and Liu, 1993, JASA) \eqn{\left(\boldsymbol{\Sigma}_1+\boldsymbol{\Sigma}_2\right)^{-1}\left(\boldsymbol{\mu}_2-\boldsymbol{\mu}_1\right)}\cr \code{iniProjDirMethod}=\dQuote{naive} indicates the initial projection direction is \eqn{\boldsymbol{\mu}_2-\boldsymbol{\mu}_1} } \item{projDirMethod}{ Indicating the method to get the optimal projection direction when calculating the separation index between a pair of clusters (c.f. Qiu and Joe, 2006a, 2006b). \cr \code{projDirMethod}=\dQuote{newton} indicates we use the modified Newton-Raphson method to search the optimal projection direction (c.f. Qiu and Joe, 2006a). This requires the assumptions that both covariance matrices of the pair of clusters are positive-definite. If this assumption is violated, the \dQuote{fixedpoint} method could be used. The \dQuote{fixedpoint} method iteratively searches the optimal projection direction based on the first derivative of the separation index to the projection direction (c.f. Qiu and Joe, 2006b). } \item{alpha}{ Tuning parameter reflecting the percentage in the two tails of a projected cluster that might be outlying. We set \code{alpha}\eqn{=0.05} like we set the significance level in hypothesis testing as \eqn{0.05}. } \item{ITMAX}{ Maximum iteration allowed when iteratively calculating the optimal projection direction. The actual number of iterations is usually much less than the default value 20. } \item{eps}{ Convergence threshold. A small positive number to check if a quantitiy \eqn{q} is equal to zero. If \eqn{|q|<}\code{eps}, then we regard \eqn{q} is equal to zero. \code{eps} is used to check if an algorithm converges. The default value is \eqn{1.0e-10}. } \item{quiet}{ A flag to switch on/off the outputs of intermediate results and/or possible warning messages. The default value is \code{TRUE}. } \item{outputDatFlag}{ Indicates if data set should be output to file. } \item{outputLogFlag}{ Indicates if log info should be output to file. } \item{outputEmpirical}{ Indicates if empirical separation indices and projection directions should be calculated. This option is useful when generating clusters with sizes which are not large enough so that the sample covariance matrices may be singular. Hence, by default, \code{outputEmpirical=TRUE}. } \item{outputInfo}{ Indicates if theoretical and empirical separation information data frames should be output to a file with format \file{[fileName]\_info.log}. } } \details{ The function \code{genRandomClust} is an implementation of the random cluster generation method proposed in Qiu and Joe (2006a) which improve the cluster generation method proposed in Milligan (1985) so that the degree of separation between any cluster and its nearest neighboring cluster could be set to a specified value while the cluster covariance matrices can be arbitrary positive definite matrices, and so that clusters generated might not be visualized by pair-wise scatterplots of variables. The separation between a pair of clusters is measured by the separation index proposed in Qiu and Joe (2006b). The current version of the function \code{genRandomClust} implements two methods to generate covariance matrices for clusters. The first method, denoted by \code{eigen}, first randomly generates eigenvalues (\eqn{\lambda_1,\ldots>\lambda_p}) for the covariance matrix (\eqn{\boldsymbol{\Sigma}}), then uses columns of a randomly generated orthogonal matrix (\eqn{\boldsymbol{Q}=(\boldsymbol{\alpha}_1,\ldots,\boldsymbol{\alpha}_p)}) as eigenvectors. The covariance matrix \eqn{\boldsymbol{\Sigma}} is then contructed as \eqn{\boldsymbol{Q}*diag(\lambda_1,\dots, \lambda_p)*\boldsymbol{Q}^T}. The second method, denoted as \dQuote{unifcorrmax}, first generates a random correlation matrix (\eqn{\boldsymbol{R}}) via the method proposed in Joe (2006), then randomly generates variances (\eqn{\sigma_1^2,\ldots, \sigma_p^2}) from an interval specified by the argument \code{rangeVar}. The covariance matrix \eqn{\boldsymbol{\Sigma}} is then constructed as \eqn{diag(\sigma_1,\ldots,\sigma_p)*\boldsymbol{R}*diag(\sigma_1,\ldots,\sigma_p)}. For each data set generated, the function \code{genRandomClust} outputs four files: data file, log file, membership file, and noisy set file. All four files have the same format: \file{[fileName]\_[i].[extension]}, where \eqn{i} indicates the replicate number, and \file{extension} can be \file{dat}, \file{log}, \file{mem}, and \file{noisy}. The data file with file extension \file{dat} contains \eqn{n+1} rows and \eqn{p} columns, where \eqn{n} is the number of data points and \eqn{p} is the number of variables. The first row is the variable names. The log file with file extension \file{log} contains information such as cluster sizes, mean vectors, covariance matrices, projection directions, separation index matrices, etc. The membership file with file extension \file{mem} contains \eqn{n} rows and one column of cluster memberships for data points. The noisy set file with file extension \file{noisy} contains a row of labels of noisy variables. When generating clusters, population covariance matrices are all positive-definite. However sample covariance matrices might be semi-positive-definite due to small cluster sizes. In this case, the function \code{genRandomClust} will automatically use the \dQuote{fixedpoint} method to search the optimal projection direction. The current version of the function \code{genPositiveDefMat} implements four methods to generate random covariance matrices. The first method, denoted by \dQuote{eigen}, first randomly generates eigenvalues (\eqn{\lambda_1,\ldots,\lambda_p}) for the covariance matrix (\eqn{\boldsymbol{\Sigma}}), then uses columns of a randomly generated orthogonal matrix (\eqn{\boldsymbol{Q}=(\boldsymbol{\alpha}_1,\ldots,\boldsymbol{\alpha}_p)}) as eigenvectors. The covariance matrix \eqn{\boldsymbol{\Sigma}} is then contructed as \eqn{\boldsymbol{Q}*diag(\lambda_1,\ldots,\lambda_p)*\boldsymbol{Q}^T}. The remaining methods, denoted as \dQuote{onion}, \dQuote{c-vine}, and \dQuote{unifcorrmat} respectively, first generates a random correlation matrix (\eqn{\boldsymbol{R}}) via the method mentioned and proposed in Joe (2006), then randomly generates variances (\eqn{\sigma_1^2,\ldots,\sigma_p^2}) from an interval specified by the argument \code{rangeVar}. The covariance matrix \eqn{\boldsymbol{\Sigma}} is then constructed as \eqn{diag(\sigma_1,\ldots,\sigma_p)*\boldsymbol{R}*diag(\sigma_1,\ldots,\sigma_p)}. } \value{ The function outputs four data files for each data set (see details). This function also returns separation information data frames \code{infoFrameTheory} and \code{infoFrameData} based on population and empirical mean vectors and covariance matrices of clusters for all the data sets generated. Both \code{infoFrameTheory} and \code{infoFrameData} contain the following seven columns: \item{Column 1:}{ Labels of clusters (\eqn{1, 2, \ldots, numClust}), where \eqn{numClust} is the number of clusters for the data set. } \item{Column 2:}{ Labels of the corresponding nearest neighbors. } \item{Column 3:}{ Separation indices of the clusters to their nearest neighboring clusters. } \item{Column 4:}{ Labels of the corresponding farthest neighboring clusters. } \item{Column 5:}{ Separation indices of the clusters to their farthest neighbors. } \item{Column 6:}{ Median separation indices of the clusters to their neighbors. } \item{Column 7:}{ Data file names with format \file{[fileName]\_[i]}, where \eqn{i} indicates the replicate number. } The function also returns three lists: \code{datList}, \code{memList}, and \code{noisyList}. \item{datList:}{ a list of data matrices for generated data sets. } \item{memList:}{ a list of luster memberships for data points for generated data sets. } \item{noisyList:}{ a list of sets of noisy variables for generated data sets. } } \references{ Joe, H. (2006) Generating Random Correlation Matrices Based on Partial Correlations. \emph{Journal of Multivariate Analysis}, \bold{97}, 2177--2189. Milligan G. W. (1985) An Algorithm for Generating Artificial Test Clusters. \emph{Psychometrika} \bold{50}, 123--127. Qiu, W.-L. and Joe, H. (2006a) Generation of Random Clusters with Specified Degree of Separaion. \emph{Journal of Classification}, \bold{23}(2), 315-334. Qiu, W.-L. and Joe, H. (2006b) Separation Index and Partial Membership for Clustering. \emph{Computational Statistics and Data Analysis}, \bold{50}, 585--603. Su, J. Q. and Liu, J. S. (1993) Linear Combinations of Multiple Diagnostic Markers. \emph{Journal of the American Statistical Association}, \bold{88}, 1350--1355. Ghosh, S., Henderson, S. G. (2003). Behavior of the NORTA method for correlated random vector generation as the dimension increases. \emph{ACM Transactions on Modeling and Computer Simulation (TOMACS)}, \bold{13(3)}, 276--294. Kurowicka and Cooke, 2006. \emph{Uncertainty Analysis with High Dimensional Dependence Modelling}, Wiley, 2006. } \note{This function might be take a while to complete.} \author{ Weiliang Qiu \email{stwxq@channing.harvard.edu}\cr Harry Joe \email{harry@stat.ubc.ca} } \examples{ \dontrun{tmp1 <- genRandomClust(numClust=7, sepVal=0.3, numNonNoisy=5, numNoisy=3, numOutlier=5, numReplicate=2, fileName="chk1")} \dontrun{tmp2 <- genRandomClust(numClust=7, sepVal=0.3, numNonNoisy=5, numNoisy=3, numOutlier=5, numReplicate=2, covMethod="unifcorrmat", fileName="chk2")} \dontrun{tmp3 <- genRandomClust(numClust=2, sepVal=-0.1, numNonNoisy=2, numNoisy=6, numOutlier=30, numReplicate=1, clustszind=1, clustSizeEq=80, rangeVar=c(10, 20), covMethod="unifcorrmat", iniProjDirMethod="naive", projDirMethod="fixedpoint", fileName="chk3")} } \keyword{cluster} clusterGeneration/INDEX0000644000176000001440000000252410700717035014574 0ustar ripleyusersgenRandomClust Random cluster generation with specified degree of separation simClustDesign Design for random cluster generation with specified degree of separation rcorrmatrix Generate a random correlation matrix based on random partial correlations genPositiveDefMat Generate a positive definite matrix/covariance matrix sepIndexTheory Measure the magnitude of the gap or sparse area between a pair of cluster distributions along the specified projection direction (theoretical version) sepIndexData Measure the magnitude of the gap or sparse area between a pair of clusters along the specified projection direction (data version) getSepProjTheory Optimal projection direction and corresponding separation index for pairs of cluster distributions (theoretical version) getSepProjData Optimal projection direction and corresponding separation index for pairs of clusters (data version) nearestNeighborSepVal Separation information matrix containing nearest neighbor and farthest neighbor of each cluster plot1DProjection Plot a pair of clusters and their density estimates, which are projected along a specified 1-D projection direction plot2DProjection Plot a pair of clusters in a 2-D projection space viewClusters Plot all clusters in a 2-D projection space