fpc/0000755000176200001440000000000014537033142011024 5ustar liggesusersfpc/NAMESPACE0000644000176200001440000000327413505422254012251 0ustar liggesusers# Remove the previous line if you edit this file
# This is the default, just nicked.
# Export all names
exportPattern(".")
# Import all packages listed as Imports or Depends
import(
  MASS,
  cluster,
  mclust,
  flexmix,
  prabclus,
  class,
  diptest,
  robustbase
)
importFrom("kernlab",specc)
importFrom("grDevices", "colors", "colours", "grey", "xy.coords")
importFrom("graphics", "abline", "hist", "legend", "pairs", "par",
           "points", "polygon", "title","axis","text")
importFrom("methods", "new")
importFrom("stats", "BIC", "addmargins", "as.dist", "cmdscale", "coef",
           "coefficients", "cor", "cov", "cov.wt", "cutree", "density",
           "dist", "dnorm", "fitted.values", "hclust", "kmeans", "lm",
           "lsfit", "mahalanobis", "median", "pchisq", "pnorm",
           "qbinom", "qchisq", "qnorm", "quantile", "rbinom", "resid",
           "residuals", "rexp", "rgamma", "rnorm", "rt", "runif", "sd",
           "weighted.mean","ecdf","pgamma")
importFrom("utils", "data")
importFrom("parallel", "mclapply", "detectCores")
S3method(fpclusters, mfpc)
S3method(fpclusters, rfpc)
S3method(plot, clboot)
S3method(plot, dbscan)
S3method(plot, mfpc)
S3method(plot, rfpc)
S3method(plot, valstat)
S3method(predict, dbscan)
S3method(print, clboot)
S3method(print, dbscan)
S3method(print, mfpc)
S3method(print, predstr)
S3method(print, rfpc)
S3method(print, summary.mergenorm)
S3method(print, summary.mfpc)
S3method(print, summary.rfpc)
S3method(print, summary.cquality)
S3method(print, varwisetables)
S3method(print, clusterbenchstats)
S3method(print, valstat)
S3method(summary, mergenorm)
S3method(summary, mfpc)
S3method(summary, rfpc)
S3method(summary, cquality)
fpc/data/0000755000176200001440000000000014536674142011747 5ustar liggesusersfpc/data/tonedata.txt.gz0000644000176200001440000000103714536674142014727 0ustar liggesusersfpc/man/0000755000176200001440000000000014536674142011611 5ustar liggesusersfpc/man/jittervar.Rd0000644000176200001440000000177513470377106014110 0ustar liggesusers\name{jittervar}
\alias{jittervar}
%- Also NEED an `\alias' for EACH other topic documented here.
\title{Jitter variables in a data matrix}
\description{
  Jitters some variables in a data matrix.
}
\usage{
jittervar(x,jitterv=NULL,factor=1)
}
%- maybe also `usage' for other objects documented here.
\arguments{
  \item{x}{data matrix or data frame.}
  \item{jitterv}{vector of numbers of variables to be jittered.}
  \item{factor}{numeric. Passed on to \code{\link{jitter}}. See the
    documentation there. The higher, the more jittering.}
}
\value{
  data matrix or data frame with jittered variables.
}
\author{Christian Hennig
  \email{christian.hennig@unibo.it}
  \url{https://www.unibo.it/sitoweb/christian.hennig/en}}
\seealso{\code{\link{jitter}}}
\examples{
  set.seed(776655)
  v1 <- rnorm(20)
  v2 <- rnorm(20)
  d1 <- sample(1:5,20,replace=TRUE)
  d2 <- sample(1:4,20,replace=TRUE)
  ldata <- cbind(v1,v2,d1,d2)
  jv <- jittervar(ldata,jitterv=3:4)
}
\keyword{manip}% __ONLY ONE__ keyword per line
fpc/man/dridgeline.Rd0000644000176200001440000000267013467541512014207 0ustar liggesusers\name{dridgeline}
\alias{dridgeline}
%- Also NEED an `\alias' for EACH other topic documented here. 
\title{Density along the ridgeline} \description{ Computes the density of a two-component Gaussian mixture along the ridgeline (Ray and Lindsay, 2005), along which all its density extrema are located. } \usage{ dridgeline(alpha=seq(0,1,0.001), prop, mu1, mu2, Sigma1, Sigma2, showplot=FALSE, ...) } %- maybe also `usage' for other objects documented here. \arguments{ \item{alpha}{sequence of values between 0 and 1 for which the density is computed.} \item{prop}{mixture proportion of first component.} \item{mu1}{mean vector of component 1.} \item{mu2}{mean vector of component 2.} \item{Sigma1}{covariance matrix of component 1.} \item{Sigma2}{covariance matrix of component 2.} \item{showplot}{logical. If \code{TRUE}, the density is plotted against \code{alpha}.} \item{...}{further arguments to be passed on to plot.} } \value{ Vector of density values for values of \code{alpha}. } \references{ Ray, S. and Lindsay, B. G. (2005) The Topography of Multivariate Normal Mixtures, \emph{Annals of Statistics}, 33, 2042-2065. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \examples{ q <- dridgeline(seq(0,1,0.1),0.5,c(1,1),c(2,5),diag(2),diag(2)) } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/tdecomp.Rd0000644000176200001440000000156513470377430013536 0ustar liggesusers\name{tdecomp} \alias{tdecomp} %- Also NEED an `\alias' for EACH other topic documented here. \title{Root of singularity-corrected eigenvalue decomposition} \description{ Computes transposed eigenvectors of matrix \code{m} times diagonal of square root of eigenvalues so that eigenvalues smaller than 1e-6 are set to 1e-6. } \usage{ tdecomp(m) } %- maybe also `usage' for other objects documented here. \arguments{ \item{m}{a symmetric matrix of minimum format 2*2.} } \details{ Thought for use in \code{discrcoord} only.} \value{ a matrix. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \note{ Thought for use within \code{\link{discrcoord}} only. } \examples{ x <- rnorm(10) y <- rnorm(10) z <- cov(cbind(x,y)) round(tdecomp(z),digits=2) } \keyword{array}% at least one, from doc/KEYWORDS fpc/man/randcmatrix.Rd0000644000176200001440000000156613467541512014420 0ustar liggesusers\name{randcmatrix} \alias{randcmatrix} %- Also NEED an `\alias' for EACH other topic documented here. \title{Random partition matrix} \description{ For use within \code{regmix}. Generates a random 0-1-matrix with \code{n} rows and \code{cln} columns so that every row contains exactly one one and every columns contains at least \code{p+3} ones. } \usage{ randcmatrix(n,cln,p) } %- maybe also `usage' for other objects documented here. \arguments{ \item{n}{positive integer. Number of rows.} \item{cln}{positive integer. Number of columns.} \item{p}{positive integer. See above.} } \value{ An \code{n*cln}-matrix. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/}} \seealso{ \code{\link{regmix}} } \examples{ set.seed(111) randcmatrix(10,2,1) } \keyword{cluster}% at least one, from doc/KEYWORDS fpc/man/stupidkfn.Rd0000644000176200001440000000320313731135355014077 0ustar liggesusers\name{stupidkfn} \alias{stupidkfn} %- Also NEED an `\alias' for EACH other topic documented here. \title{Stupid farthest neighbour random clustering} \description{ Picks k random starting points from given dataset to initialise k clusters. 
Then, one by one, each point not yet assigned to any cluster is
assigned to one of these clusters, until all points are assigned. In
every step, the point/cluster pair is chosen for which the distance
between the point and the farthest point of the cluster is smallest,
as in complete linkage clustering, see Akhanli and Hennig (2020).
}
\usage{
stupidkfn(d,k)
}
%- maybe also `usage' for other objects documented here.
\arguments{
  \item{d}{\code{dist}-object or dissimilarity matrix.}
  \item{k}{integer. Number of clusters.}
}
% \details{
% }
\value{
  The clustering vector (values 1 to \code{k}, length number of objects
  behind \code{d}).
}
\references{
Akhanli, S. and Hennig, C. (2020) Calibrating and aggregating cluster
validity indexes for context-adapted comparison of clusterings.
\emph{Statistics and Computing}, 30, 1523-1544,
\url{https://link.springer.com/article/10.1007/s11222-020-09958-2},
\url{https://arxiv.org/abs/2002.01822}
}
\author{Christian Hennig
  \email{christian.hennig@unibo.it}
  \url{https://www.unibo.it/sitoweb/christian.hennig/en/}
}
\seealso{
  \code{\link{stupidkcentroids}}, \code{\link{stupidknn}}, \code{\link{stupidkaven}}
}
\examples{
  set.seed(20000)
  options(digits=3)
  face <- rFace(200,dMoNo=2,dNoEy=0,p=2)
  stupidkfn(dist(face),3)
}
\keyword{multivariate}% at least one, from doc/KEYWORDS
\keyword{cluster}% __ONLY ONE__ keyword per line
fpc/man/discrproj.Rd0000644000176200001440000001160613467541512014077 0ustar liggesusers\name{discrproj}
\alias{discrproj}
%- Also NEED an `\alias' for EACH other topic documented here.
\title{Linear dimension reduction for classification}
\description{
  An interface for ten methods of linear dimension reduction in order
  to separate the groups optimally in the projected data. Includes
  classical discriminant coordinates, methods to project differences in
  mean and covariance structure, asymmetric methods (separation of a
  homogeneous class from a heterogeneous one), local neighborhood-based
  methods and methods based on robust covariance matrices.
}
\usage{
discrproj(x, clvecd, method="dc", clnum=NULL, ignorepoints=FALSE,
          ignorenum=0, ...)
}
%- maybe also `usage' for other objects documented here.
\arguments{
  \item{x}{the data matrix; a numerical object which can be coerced
    to a matrix.}
  \item{clvecd}{vector of class numbers which can be coerced into
    integers; length must equal \code{nrow(xd)}.}
  \item{method}{one of
    \describe{
      \item{"dc"}{usual discriminant coordinates, see \code{\link{discrcoord}},}
      \item{"bc"}{Bhattacharyya coordinates, first coordinate showing
	mean differences, second showing covariance matrix differences,
	see \code{\link{batcoord}},}
      \item{"vbc"}{variance dominated Bhattacharyya coordinates,
	see \code{\link{batcoord}},}
      \item{"mvdc"}{added mean and variance differences optimizing
	coordinates, see \code{\link{mvdcoord}},}
      \item{"adc"}{asymmetric discriminant coordinates, see
	\code{\link{adcoord}},}
      \item{"awc"}{asymmetric discriminant coordinates with weighted
	observations, see \code{\link{awcoord}},}
      \item{"arc"}{asymmetric discriminant coordinates with weighted
	observations and robust MCD-covariance matrix, see \code{\link{awcoord}},}
      \item{"nc"}{neighborhood based coordinates, see \code{\link{ncoord}},}
      \item{"wnc"}{neighborhood based coordinates with weighted neighborhoods,
	see \code{\link{ncoord}},}
      \item{"anc"}{asymmetric neighborhood based coordinates, see \code{\link{ancoord}}.}
    }
    Note that "bc", "vbc", "adc", "awc", "arc" and "anc" assume that
    there are only two classes.}
  \item{clnum}{integer. 
Number of the class which is attempted to plot homogeneously by "asymmetric methods", which are the methods assuming that there are only two classes, as indicated above.} \item{ignorepoints}{logical. If \code{TRUE}, points with label \code{ignorenum} in \code{clvecd} are ignored in the computation for \code{method} and are only projected afterwards onto the resulting units. If \code{pch=NULL}, the plot symbol for these points is "N".} \item{ignorenum}{one of the potential values of the components of \code{clvecd}. Only has effect if \code{ignorepoints=TRUE}, see above.} \item{...}{additional parameters passed to the projection methods.} } % \details{ % } \value{ \code{discrproj} returns the output of the chosen projection method, which is a list with at least the components \code{ev, units, proj}. For detailed informations see the help pages of the projection methods. \item{ev}{eigenvalues in descending order, usually indicating portion of information in the corresponding direction.} \item{units}{columns are coordinates of projection basis vectors. New points \code{x} can be projected onto the projection basis vectors by \code{x \%*\% units}} \item{proj}{projections of \code{xd} onto \code{units}.} } \references{ Hennig, C. (2004) Asymmetric linear dimension reduction for classification. Journal of Computational and Graphical Statistics 13, 930-945 . Hennig, C. (2005) A method for visual cluster validation. In: Weihs, C. and Gaul, W. (eds.): Classification - The Ubiquitous Challenge. Springer, Heidelberg 2005, 153-160. Seber, G. A. F. (1984). \emph{Multivariate Observations}. New York: Wiley. Fukunaga (1990). \emph{Introduction to Statistical Pattern Recognition} (2nd ed.). Boston: Academic Press. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/}} \seealso{ \code{\link{discrcoord}}, \code{\link{batcoord}}, \code{\link{mvdcoord}}, \code{\link{adcoord}}, \code{\link{awcoord}}, \code{\link{ncoord}}, \code{\link{ancoord}}. \code{\link{rFace}} for generation of the example data used below. } \examples{ set.seed(4634) face <- rFace(300,dMoNo=2,dNoEy=0,p=3) grface <- as.integer(attr(face,"grouping")) # The abs in the following is there to unify the output, # because eigenvectors are defined only up to their sign. # Statistically it doesn't make sense to compute absolute values. round(abs(discrproj(face,grface, method="nc")$units),digits=2) round(abs(discrproj(face,grface, method="wnc")$units),digits=2) round(abs(discrproj(face,grface, clnum=1, method="arc")$units),digits=2) } \keyword{multivariate}% at least one, from doc/KEYWORDS \keyword{classif}% __ONLY ONE__ keyword per line fpc/man/piridge.Rd0000644000176200001440000000243613467541512013524 0ustar liggesusers\name{piridge} \alias{piridge} %- Also NEED an `\alias' for EACH other topic documented here. \title{Ridgeline Pi-function} \description{ The Pi-function is given in (6) in Ray and Lindsay, 2005. Equating it to the mixture proportion yields locations of two-component Gaussian mixture density extrema. } \usage{ piridge(alpha, mu1, mu2, Sigma1, Sigma2, showplot=FALSE) } %- maybe also `usage' for other objects documented here. \arguments{ \item{alpha}{sequence of values between 0 and 1 for which the Pi-function is computed.} \item{mu1}{mean vector of component 1.} \item{mu2}{mean vector of component 2.} \item{Sigma1}{covariance matrix of component 1.} \item{Sigma2}{covariance matrix of component 2.} \item{showplot}{logical. 
If \code{TRUE}, the Pi-function is plotted against \code{alpha}.}
}
\value{
  Vector of values of the Pi-function for values of \code{alpha}.
}
\references{
  Ray, S. and Lindsay, B. G. (2005) The Topography of Multivariate
  Normal Mixtures, \emph{Annals of Statistics}, 33, 2042-2065.
}
\author{Christian Hennig
  \email{christian.hennig@unibo.it}
  \url{https://www.unibo.it/sitoweb/christian.hennig/en/}
}
\examples{
  q <- piridge(seq(0,1,0.1),c(1,1),c(2,5),diag(2),diag(2))
}
\keyword{cluster}% at least one, from doc/KEYWORDS
\keyword{multivariate}
fpc/man/fpclusters.Rd0000644000176200001440000000177613467541512014275 0ustar liggesusers\name{fpclusters}
\alias{fpclusters}
%- Also NEED an `\alias' for EACH other topic documented here.
\title{Extracting clusters from fixed point cluster objects}
\description{
  \code{fpclusters} is a generic function which extracts the
  representative fixed point clusters (FPCs) from FPC objects
  generated by \code{\link{fixmahal}} and \code{\link{fixreg}}. For
  documentation and examples see \code{\link{fixmahal}} and
  \code{\link{fixreg}}.
}
\usage{
fpclusters(object, ...)
}
%- maybe also `usage' for other objects documented here.
\arguments{
  \item{object}{object of class \code{rfpc} or \code{mfpc}.}
  \item{...}{further arguments depending on the method.}
}
\value{
  a list of logical or numerical vectors indicating or giving the
  weights of the cluster memberships.
}
\author{Christian Hennig
  \email{christian.hennig@unibo.it}
  \url{https://www.unibo.it/sitoweb/christian.hennig/en/}
}
\seealso{\code{\link{fixmahal}}, \code{\link{fixreg}}}
\keyword{cluster}% at least one, from doc/KEYWORDS
fpc/man/simmatrix.Rd0000644000176200001440000000251613467541512014115 0ustar liggesusers\name{simmatrix}
\alias{simmatrix}
%- Also NEED an `\alias' for EACH other topic documented here.
\title{Extracting intersections between clusters from fpc-object}
\description{
  Extracts the information about the size of the intersections
  between representative Fixed Point Clusters (FPCs) of stable groups
  from the output of the FPC-functions \code{\link{fixreg}} and
  \code{\link{fixmahal}}.
}
\usage{
simmatrix(fpcobj)
}
%- maybe also `usage' for other objects documented here.
\arguments{
  \item{fpcobj}{an object of class \code{rfpc} or \code{mfpc}.}
}
\value{
  A non-negative real-valued vector giving the number of points in
  the intersections of the representative FPCs of stable groups.
}
\author{Christian Hennig
  \email{christian.hennig@unibo.it}
  \url{https://www.unibo.it/sitoweb/christian.hennig/en/}
}
\note{The intersection between representative FPCs no. \code{i}
  and \code{j} is at position \code{\link{sseg}(i,j)}.}
\seealso{
  \code{\link{fixmahal}},
  \code{\link{fixreg}},
  \code{\link{sseg}}
}
\examples{
  set.seed(190000)
  data(tonedata)
  # Note: If you do not use the installed package, replace this by
  # tonedata <- read.table("(path/)tonedata.txt", header=TRUE)
  attach(tonedata)
  tonefix <- fixreg(stretchratio,tuned,mtf=1,ir=20)
  simmatrix(tonefix)[sseg(2,3)]
}
\keyword{utilities}% at least one, from doc/KEYWORDS
fpc/man/unimodal.ind.Rd0000644000176200001440000000152113467541512014454 0ustar liggesusers\name{unimodal.ind}
\alias{unimodal.ind}
%- Also NEED an `\alias' for EACH other topic documented here.
\title{Is a fitted density unimodal or not?}
\description{
  Checks whether a series of fitted density values (such as given out
  as \code{y}-component of \code{\link{density}}) is unimodal.
}
\usage{
unimodal.ind(y)
}
%- maybe also `usage' for other objects documented here. 
\arguments{ \item{y}{numeric vector of fitted density values in order of increasing x-values such as given out as \code{y}-component of \code{\link{density}}.} } \value{ Logical. \code{TRUE} if unimodal. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \examples{ unimodal.ind(c(1,3,3,4,2,1,0,0)) } \keyword{univar}% at least one, from doc/KEYWORDS % \keyword{multivariate} fpc/man/classifdist.Rd0000644000176200001440000001006713674423412014406 0ustar liggesusers\name{classifdist} \alias{classifdist} \alias{classifnp} %- Also NEED an `\alias' for EACH other topic documented here. \title{Classification of unclustered points} \description{ Various methods for classification of unclustered points from clustered points for use within functions \code{nselectboot} and \code{prediction.strength}. } \usage{ classifdist(cdist,clustering, method="averagedist", centroids=NULL,nnk=1) classifnp(data,clustering, method="centroid",cdist=NULL, centroids=NULL,nnk=1) } %- maybe also `usage' for other objects documented here. \arguments{ \item{cdist}{dissimilarity matrix or \code{dist}-object. Necessary for \code{classifdist} but optional for \code{classifnp} and there only used if \code{method="averagedist"} (if not provided, \code{dist} is applied to \code{data}).} \item{data}{something that can be coerced into a an \code{n*p}-data matrix.} \item{clustering}{integer vector. Gives the cluster number (between 1 and k for k clusters) for clustered points and should be -1 for points to be classified.} \item{method}{one of \code{"averagedist", "centroid", "qda", "knn"}. See details.} \item{centroids}{for \code{classifnp} a k times p matrix of cluster centroids. For \code{classifdist} a vector of numbers of centroid objects as provided by \code{\link[cluster]{pam}}. Only used if \code{method="centroid"}; in that case mandatory for \code{classifdist} but optional for \code{classifnp}, where cluster mean vectors are computed if \code{centroids=NULL}.} \item{nnk}{number of nearest neighbours if \code{method="knn"}.} } \details{ \code{classifdist} is for data given as dissimilarity matrix, \code{classifnp} is for data given as n times p data matrix. The following methods are supported: \describe{ \item{"centroid"}{assigns observations to the cluster with closest cluster centroid as specified in argument \code{centroids} (this is associated to k-means and pam/clara-clustering).} \item{"qda"}{only in \code{classifnp}. Classifies by quadratic discriminant analysis (this is associated to Gaussian clusters with flexible covariance matrices), calling \code{\link[MASS]{qda}} with default settings. If \code{\link[MASS]{qda}} gives an error (usually because a class was too small), \code{\link[MASS]{lda}} is used.} \item{"lda"}{only in \code{classifnp}. Classifies by linear discriminant analysis (this is associated to Gaussian clusters with equal covariance matrices), calling \code{\link[MASS]{lda}} with default settings.} \item{"averagedist"}{assigns to the cluster to which an observation has the minimum average dissimilarity to all points in the cluster (this is associated with average linkage clustering).} \item{"knn"}{classifies by \code{nnk} nearest neighbours (for \code{nnk=1}, this is associated with single linkage clustering). Calls \code{\link[class]{knn}} in \code{classifnp}.} \item{"fn"}{classifies by the minimum distance to the farthest neighbour. 
This is associated with complete linkage clustering).} } } \value{ An integer vector giving cluster numbers for all observations; those for the observations already clustered in the input are the same as in the input. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{prediction.strength}}, \code{\link{nselectboot}} } \examples{ set.seed(20000) x1 <- rnorm(50) y <- rnorm(100) x2 <- rnorm(40,mean=20) x3 <- rnorm(10,mean=25,sd=100) x <-cbind(c(x1,x2,x3),y) truec <- c(rep(1,50),rep(2,40),rep(3,10)) topredict <- c(1,2,51,52,91) clumin <- truec clumin[topredict] <- -1 classifnp(x,clumin, method="averagedist") classifnp(x,clumin, method="qda") classifdist(dist(x),clumin, centroids=c(3,53,93),method="centroid") classifdist(dist(x),clumin,method="knn") } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/clusterboot.Rd0000644000176200001440000004703213616312276014446 0ustar liggesusers\name{clusterboot} \alias{clusterboot} \alias{print.clboot} \alias{plot.clboot} %- Also NEED an `\alias' for EACH other topic documented here. \title{Clusterwise cluster stability assessment by resampling} \description{ Assessment of the clusterwise stability of a clustering of data, which can be cases*variables or dissimilarity data. The data is resampled using several schemes (bootstrap, subsetting, jittering, replacement of points by noise) and the Jaccard similarities of the original clusters to the most similar clusters in the resampled data are computed. The mean over these similarities is used as an index of the stability of a cluster (other statistics can be computed as well). The methods are described in Hennig (2007). \code{clusterboot} is an integrated function that computes the clustering as well, using interface functions for various clustering methods implemented in R (several interface functions are provided, but you can implement further ones for your favourite clustering method). See the documentation of the input parameter \code{clustermethod} below. Quite general clustering methods are possible, i.e. methods estimating or fixing the number of clusters, methods producing overlapping clusters or not assigning all cases to clusters (but declaring them as "noise"). Fuzzy clusterings cannot be processed and have to be transformed to crisp clusterings by the interface function. } \usage{ clusterboot(data,B=100, distances=(inherits(data, "dist")), bootmethod="boot", bscompare=TRUE, multipleboot=FALSE, jittertuning=0.05, noisetuning=c(0.05,4), subtuning=floor(nrow(data)/2), clustermethod,noisemethod=FALSE,count=TRUE, showplots=FALSE,dissolution=0.5, recover=0.75,seed=NULL,datatomatrix=TRUE,...) \method{print}{clboot}(x,statistics=c("mean","dissolution","recovery"),...) \method{plot}{clboot}(x,xlim=c(0,1),breaks=seq(0,1,by=0.05),...) } %- maybe also `usage' for other objects documented here. \arguments{ \item{data}{by default something that can be coerced into a (numerical) matrix (data frames with non-numerical data are allowed when using \code{datatomatrix=FALSE}, see below). The data matrix - either an \code{n*p}-data matrix (or data frame) or an \code{n*n}-dissimilarity matrix (or \code{dist}-object).} \item{B}{integer. Number of resampling runs for each scheme, see \code{bootmethod}.} \item{distances}{logical. If \code{TRUE}, the data is interpreted as dissimilarity matrix. 
If \code{data} is a \code{dist}-object, \code{distances=TRUE} automatically, otherwise \code{distances=FALSE} by default. This means that you have to set it to \code{TRUE} manually if \code{data} is a dissimilarity matrix.} \item{bootmethod}{vector of strings, defining the methods used for resampling. Possible methods: \code{"boot"}: nonparametric bootstrap (precise behaviour is controlled by parameters \code{bscompare} and \code{multipleboot}). \code{"subset"}: selecting random subsets from the dataset. Size determined by \code{subtuning}. \code{"noise"}: replacing a certain percentage of the points by random noise, see \code{noisetuning}. \code{"jitter"} add random noise to all points, see \code{jittertuning}. (This didn't perform well in Hennig (2007), but you may want to get your own experience.) \code{"bojit"} nonparametric bootstrap first, and then adding noise to the points, see \code{jittertuning}. \strong{Important:} only the methods \code{"boot"} and \code{"subset"} work with dissimilarity data, or if \code{datatomatrix=FALSE}! The results in Hennig (2007) indicate that \code{"boot"} is generally informative and often quite similar to \code{"subset"} and \code{"bojit"}, while \code{"noise"} sometimes provides different information. Therefore the default (for \code{distances=FALSE}) is to use \code{"boot"} and \code{"noise"}. However, some clustering methods may have problems with multiple points, which can be solved by using \code{"bojit"} or \code{"subset"} instead of \code{"boot"} or by \code{multipleboot=FALSE} below.} \item{bscompare}{logical. If \code{TRUE}, multiple points in the bootstrap sample are taken into account to compute the Jaccard similarity to the original clusters (which are represented by their "bootstrap versions", i.e., the points of the original cluster which also occur in the bootstrap sample). If a point was drawn more than once, it is in the "bootstrap version" of the original cluster more than once, too, if \code{bscompare=TRUE}. Otherwise multiple points are ignored for the computation of the Jaccard similarities. If \code{multipleboot=FALSE}, it doesn't make a difference.} \item{multipleboot}{logical. If \code{FALSE}, all points drawn more than once in the bootstrap draw are only used once in the bootstrap samples.} \item{jittertuning}{positive numeric. Tuning for the \code{"jitter"}-method. The noise distribution for jittering is a normal distribution with zero mean. The covariance matrix has the same Eigenvectors as that of the original data set, but the standard deviation along the principal directions is determined by the \code{jittertuning}-quantile of the distances between neighboring points projected along these directions.} \item{noisetuning}{A vector of two positive numerics. Tuning for the \code{"noise"}-method. The first component determines the probability that a point is replaced by noise. Noise is generated by a uniform distribution on a hyperrectangle along the principal directions of the original data set, ranging from \code{-noisetuning[2]} to \code{noisetuning[2]} times the standard deviation of the data set along the respective direction. Note that only points not replaced by noise are considered for the computation of Jaccard similarities.} \item{subtuning}{integer. Size of subsets for \code{"subset"}.} \item{clustermethod}{an interface function (the function name, not a string containing the name, has to be provided!). This defines the clustering method. 
See the "Details"-section for a list of available interface functions and guidelines how to write your own ones. } \item{noisemethod}{logical. If \code{TRUE}, the last cluster is regarded as "noise cluster", which means that for computing the Jaccard similarity, it is not treated as a cluster. The noise cluster of the original clustering is only compared with the noise cluster of the clustering of the resampled data. This means that in the \code{clusterboot}-output (and plot), if points were assigned to the noise cluster, the last cluster number refers to it, and its Jaccard similarity values refer to comparisons with estimated noise components in resampled datasets only. (Some cluster methods such as \code{\link[tclust]{tclust}} and \code{\link[mclust]{mclustBIC}} produce such noise components.)} \item{count}{logical. If \code{TRUE}, the resampling runs are counted on the screen.} \item{showplots}{logical. If \code{TRUE}, a plot of the first two dimensions of the resampled data set (or the classical MDS solution for dissimilarity data) is shown for every resampling run. The last plot shows the original data set. Ignored if \code{datatomatrix=FALSE}.} \item{dissolution}{numeric between 0 and 1. If the Jaccard similarity between the resampling version of the original cluster and the most similar cluster on the resampled data is smaller or equal to this value, the cluster is considered as "dissolved". Numbers of dissolved clusters are recorded.} \item{recover}{numeric between 0 and 1. If the Jaccard similarity between the resampling version of the original cluster and the most similar cluster on the resampled data is larger than this value, the cluster is considered as "successfully recovered". Numbers of recovered clusters are recorded.} \item{seed}{integer. Seed for random generator (fed into \code{set.seed}) to make results reproducible. If \code{NULL}, results depend on chance.} \item{datatomatrix}{logical. If \code{TRUE}, \code{data} is coerced into a (numerical) matrix at the start of \code{clusterboot}. \code{FALSE} may be chosen for mixed type data including e.g. categorical factors (assuming that the chosen \code{clustermethod} allows for this). This disables some features of \code{clusterboot}, see parameters \code{bootmethod} and \code{showplots}.} \item{...}{additional parameters for the clustermethods called by \code{clusterboot}. No effect in \code{print.clboot} and \code{plot.clboot}.} \item{x}{object of class \code{clboot}.} \item{statistics}{specifies in \code{print.clboot}, which of the three clusterwise Jaccard similarity statistics \code{"mean"}, \code{"dissolution"} (number of times the cluster has been dissolved) and \code{"recovery"} (number of times a cluster has been successfully recovered) is printed.} \item{xlim}{transferred to \code{hist}.} \item{breaks}{transferred to \code{hist}.} } \details{ Here are some guidelines for interpretation. There is some theoretical justification to consider a Jaccard similarity value smaller or equal to 0.5 as an indication of a "dissolved cluster", see Hennig (2008). Generally, a valid, stable cluster should yield a mean Jaccard similarity value of 0.75 or more. Between 0.6 and 0.75, clusters may be considered as indicating patterns in the data, but which points exactly should belong to these clusters is highly doubtful. Below average Jaccard values of 0.6, clusters should not be trusted. "Highly stable" clusters should yield average Jaccard similarities of 0.85 and above. 
All of this refers to bootstrap; for the other resampling schemes it depends on the tuning constants, though their default values should grant similar interpretations in most cases. While \code{B=100} is recommended, smaller run numbers could give quite informative results as well, if computation times become too high. Note that the stability of a cluster is assessed, but stability is not the only important validity criterion - clusters obtained by very inflexible clustering methods may be stable but not valid, as discussed in Hennig (2007). See \code{\link{plotcluster}} for graphical cluster validation. Information about interface functions for clustering methods: The following interface functions are currently implemented (in the present package; note that almost all of these functions require the specification of some control parameters, so if you use one of them, look up their common help page \code{\link{kmeansCBI}}) first: \describe{ \item{kmeansCBI}{an interface to the function \code{\link{kmeans}} for k-means clustering. This assumes a cases*variables matrix as input.} \item{hclustCBI}{an interface to the function \code{hclust} for agglomerative hierarchical clustering with optional noise cluster. This function produces a partition and assumes a cases*variables matrix as input.} \item{hclusttreeCBI}{an interface to the function \code{hclust} for agglomerative hierarchical clustering. This function produces a tree (not only a partition; therefore the number of clusters can be huge!) and assumes a cases*variables matrix as input.} \item{disthclustCBI}{an interface to the function \code{hclust} for agglomerative hierarchical clustering with optional noise cluster. This function produces a partition and assumes a dissimilarity matrix as input.} \item{noisemclustCBI}{an interface to the function \code{\link[mclust]{mclustBIC}} for normal mixture model based clustering. This assumes a cases*variables matrix as input. Warning: \code{\link[mclust]{mclustBIC}} sometimes has problems with multiple points. It is recommended to use this only together with \code{multipleboot=FALSE}.} \item{distnoisemclustCBI}{an interface to the function \code{\link[mclust]{mclustBIC}} for normal mixture model based clustering. This assumes a dissimilarity matrix as input and generates a data matrix by multidimensional scaling first. Warning: \code{\link[mclust]{mclustBIC}} sometimes has problems with multiple points. It is recommended to use this only together with \code{multipleboot=FALSE}.} \item{claraCBI}{an interface to the functions \code{\link[cluster]{pam}} and \code{\link[cluster]{clara}} for partitioning around medoids. This can be used with cases*variables as well as dissimilarity matrices as input.} \item{pamkCBI}{an interface to the function \code{\link{pamk}} for partitioning around medoids. The number of cluster is estimated by the average silhouette width. This can be used with cases*variables as well as dissimilarity matrices as input.} % \item{trimkmeansCBI}{an interface to the function % \code{\link[trimcluster]{trimkmeans}} for trimmed k-means % clustering. This assumes a cases*variables matrix as input.} \item{tclustCBI}{an interface to the function \code{tclust} in the tclust library for trimmed Gaussian clustering. This assumes a cases*variables matrix as input. 
Note that this function is not currently provided because the tclust package is only available in the CRAN archives, but the code is in the Examples-section of the \code{\link{kmeansCBI}}-help page.} % \item{disttrimkmeansCBI}{an interface to the function % \code{\link[trimcluster]{trimkmeans}} for trimmed k-means % clustering. This assumes a dissimilarity matrix as input and % generates a data matrix by multidimensional scaling first.} \item{dbscanCBI}{an interface to the function \code{\link{dbscan}} for density based clustering. This can be used with cases*variables as well as dissimilarity matrices as input..} \item{mahalCBI}{an interface to the function \code{\link{fixmahal}} for fixed point clustering. This assumes a cases*variables matrix as input.} \item{mergenormCBI}{an interface to the function \code{\link{mergenormals}} for clustering by merging Gaussian mixture components.} \item{speccCBI}{an interface to the function \code{\link[kernlab]{specc}} for spectral clustering.} } You can write your own interface function. The first argument of an interface function should preferably be a data matrix (of class "matrix", but it may be a symmetrical dissimilarity matrix). It can be a data frame, but this restricts some of the functionality of \code{clusterboot}, see above. Further arguments can be tuning constants for the clustering method. The output of an interface function should be a list containing (at least) the following components: \describe{ \item{result}{clustering result, usually a list with the full output of the clustering method (the precise format doesn't matter); whatever you want to use later.} \item{nc}{number of clusters. If some points don't belong to any cluster but are declared as "noise", \code{nc} includes the noise cluster, and there should be another component \code{nccl}, being the number of clusters not including the noise cluster (note that it is not mandatory to define a noise component if not all points are assigned to clusters, but if you do it, the stability of the noise cluster is assessed as well.)} \item{clusterlist}{this is a list consisting of a logical vectors of length of the number of data points (\code{n}) for each cluster, indicating whether a point is a member of this cluster (\code{TRUE}) or not. If a noise cluster is included, it should always be the last vector in this list.} \item{partition}{an integer vector of length \code{n}, partitioning the data. If the method produces a partition, it should be the clustering. This component is only used for plots, so you could do something like \code{rep(1,n)} for non-partitioning methods. If a noise cluster is included, \code{nc=nccl+1} and the noise cluster is cluster no. \code{nc}.} \item{clustermethod}{a string indicating the clustering method.} } } \value{ \code{clusterboot} returns an object of class \code{"clboot"}, which is a list with components \code{result, partition, nc, clustermethod, B, noisemethod, bootmethod, multipleboot, dissolution, recover, bootresult, bootmean, bootbrd, bootrecover, jitterresult, jittermean, jitterbrd, jitterrecover, subsetresult, subsetmean, subsetbrd, subsetrecover, bojitresult, bojitmean, bojitbrd, bojitrecover, noiseresult, noisemean, noisebrd, noiserecover}. 
\item{result}{clustering result; full output of the selected \code{clustermethod} for the original data set.} \item{partition}{partition parameter of the selected \code{clustermethod} (note that this is only meaningful for partitioning clustering methods).} \item{nc}{number of clusters in original data (including noise component if \code{noisemethod=TRUE}).} \item{nccl}{number of clusters in original data (not including noise component if \code{noisemethod=TRUE}).} \item{clustermethod, B, noisemethod, bootmethod, multipleboot, dissolution, recover}{input parameters, see above.} \item{bootresult}{matrix of Jaccard similarities for \code{bootmethod="boot"}. Rows correspond to clusters in the original data set. Columns correspond to bootstrap runs.} \item{bootmean}{clusterwise means of the \code{bootresult}.} \item{bootbrd}{clusterwise number of times a cluster has been dissolved.} \item{bootrecover}{clusterwise number of times a cluster has been successfully recovered.} \item{subsetresult, subsetmean, etc.}{same as \code{bootresult, bootmean, etc.}, but for the other resampling methods.} } \references{ Hennig, C. (2007) Cluster-wise assessment of cluster stability. \emph{Computational Statistics and Data Analysis}, 52, 258-271. Hennig, C. (2008) Dissolution point and isolation robustness: robustness criteria for general cluster analysis methods. \emph{Journal of Multivariate Analysis} 99, 1154-1176. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{dist}}, interface functions: \code{\link{kmeansCBI}}, \code{\link{hclustCBI}}, \code{\link{hclusttreeCBI}}, \code{\link{disthclustCBI}}, \code{\link{noisemclustCBI}}, \code{\link{distnoisemclustCBI}}, \code{\link{claraCBI}}, \code{\link{pamkCBI}}, \code{\link{dbscanCBI}}, \code{\link{mahalCBI}} } \examples{ options(digits=3) set.seed(20000) face <- rFace(50,dMoNo=2,dNoEy=0,p=2) cf1 <- clusterboot(face,B=3,bootmethod= c("boot","noise","jitter"),clustermethod=kmeansCBI, krange=5,seed=15555) % For a serious application, choose a larger B! print(cf1) plot(cf1) % cf1$result$result is the k-means clustering output for the original % data; cf1$result is the output of bootkmeans on these data. cf2 <- clusterboot(dist(face),B=3,bootmethod= "subset",clustermethod=disthclustCBI, k=5, cut="number", method="average", showplots=TRUE, seed=15555) print(cf2) d1 <- c("a","b","a","c") d2 <- c("a","a","a","b") dx <- as.data.frame(cbind(d1,d2)) cpx <- clusterboot(dx,k=2,B=10,clustermethod=claraCBI, multipleboot=TRUE,usepam=TRUE,datatomatrix=FALSE) print(cpx) } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/cluster.magazine.Rd0000644000176200001440000001214013470372662015347 0ustar liggesusers\name{cluster.magazine} \alias{cluster.magazine} %- Also NEED an `\alias' for EACH other topic documented here. \title{Run many clustering methods on many numbers of clusters} \description{ Runs a user-specified set of clustering methods (CBI-functions, see \code{\link{kmeansCBI}} with several numbers of clusters on a dataset with unified output. } \usage{ cluster.magazine(data,G,diss = inherits(data, "dist"), scaling=TRUE, clustermethod, distmethod=rep(TRUE,length(clustermethod)), ncinput=rep(TRUE,length(clustermethod)), clustermethodpars, trace=TRUE) } %- maybe also `usage' for other objects documented here. \arguments{ \item{data}{data matrix or \code{dist}-object.} \item{G}{vector of integers. Numbers of clusters to consider.} \item{diss}{logical. 
If \code{TRUE}, the data matrix is assumed to be a
    distance/dissimilarity matrix, otherwise it's observations times
    variables.}
  \item{scaling}{either a logical or a numeric vector of length equal to
    the number of columns of \code{data}. If \code{FALSE}, data won't be
    scaled, otherwise \code{scaling} is passed on to \code{\link{scale}}
    as argument \code{scale}.}
  \item{clustermethod}{vector of strings specifying names of
    CBI-functions (see \code{\link{kmeansCBI}}). These are the
    clustering methods to be applied.}
  \item{distmethod}{vector of logicals, of the same length as
    \code{clustermethod}. \code{TRUE} means that the clustering method
    operates on distances. If \code{diss=TRUE}, all entries have to be
    \code{TRUE}. Otherwise, if an entry is true, the corresponding
    method will be applied on \code{dist(data)}.}
  \item{ncinput}{vector of logicals, of the same length as
    \code{clustermethod}. \code{TRUE} indicates that the corresponding
    clustering method requires the number of clusters as input and will
    not estimate the number of clusters itself.}
  \item{clustermethodpars}{list of the same length as
    \code{clustermethod}. Specifies parameters for all involved
    clustering methods. Its jth entry is passed to clustermethod number
    j. Can be an empty entry in case all defaults are used for a
    clustering method. The number of clusters does not need to be
    specified here.}
  \item{trace}{logical. If \code{TRUE}, some runtime information is
    printed.}
}
% \details{
% }
\value{
  List of lists comprising
  \item{output}{Two-dimensional list. The first list index i is the
    number of the clustering method (ordering as specified in
    \code{clustermethod}), the second list index j is the number of
    clusters. This stores the full output of clustermethod i run on
    number of clusters j.}
  \item{clustering}{Two-dimensional list. The first list index i is the
    number of the clustering method (ordering as specified in
    \code{clustermethod}), the second list index j is the number of
    clusters. This stores the clustering integer vector (i.e., the
    \code{partition}-component of the CBI-function, see
    \code{\link{kmeansCBI}}) of clustermethod i run on number of
    clusters j.}
  \item{noise}{Two-dimensional list. The first list index i is the
    number of the clustering method (ordering as specified in
    \code{clustermethod}), the second list index j is the number of
    clusters. List entries are single logicals. If \code{TRUE}, the
    clustering method estimated some noise, i.e., points not belonging
    to any cluster, which in the clustering vector are indicated by the
    highest number (number of clusters plus one in case that the number
    of clusters was fixed).}
  \item{othernc}{list of integer vectors of length 2. The first number
    is the number of the clustering method (the order is determined by
    argument \code{clustermethod}), the second number is the number of
    clusters for those methods that estimate the number of clusters
    themselves and estimate a number that is smaller than \code{min(G)}
    or larger than \code{max(G)}.}
}
\references{
Hennig, C. (2017) Cluster validation by measurement of clustering
characteristics relevant to the user. In C. H. Skiadas (ed.) 
\emph{Proceedings of ASMDA 2017}, 501-520, \url{https://arxiv.org/abs/1703.09282} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{clusterbenchstats}}, \code{\link{kmeansCBI}} } \examples{ set.seed(20000) options(digits=3) face <- rFace(10,dMoNo=2,dNoEy=0,p=2) clustermethod=c("kmeansCBI","hclustCBI","hclustCBI") # A clustering method can be used more than once, with different # parameters clustermethodpars <- list() clustermethodpars[[2]] <- clustermethodpars[[3]] <- list() clustermethodpars[[2]]$method <- "complete" clustermethodpars[[3]]$method <- "average" cmf <- cluster.magazine(face,G=2:3,clustermethod=clustermethod, distmethod=rep(FALSE,3),clustermethodpars=clustermethodpars) print(str(cmf)) } \keyword{multivariate}% at least one, from doc/KEYWORDS \keyword{cluster}% __ONLY ONE__ keyword per line fpc/man/extract.mixturepars.Rd0000644000176200001440000000270213606667536016143 0ustar liggesusers\name{extract.mixturepars} \alias{extract.mixturepars} %- Also NEED an `\alias' for EACH other topic documented here. \title{Extract parameters for certain components from mclust} \description{ Extracts parameters of certain mixture components from the output of \code{\link[mclust]{summary.mclustBIC}} and updates proportions so that they sum up to 1. } \usage{ extract.mixturepars(mclustsum,compnumbers,noise=FALSE) } %- maybe also `usage' for other objects documented here. \arguments{ \item{mclustsum}{output object of \code{\link[mclust]{summary.mclustBIC}}.} \item{compnumbers}{vector of integers. Numbers of mixture components.} \item{noise}{logical. Should be \code{TRUE} if a noise component was fitted by \code{\link[mclust]{mclustBIC}}.} } \value{ Object as component \code{parameters} of \code{\link[mclust]{summary.mclustBIC}}-output, but for specified components only. (Orientation information from all components is kept.) } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \examples{ set.seed(98765) require(mclust) iriss <- iris[sample(150,20),-5] irisBIC <- mclustBIC(iriss,G=5,modelNames="VEV") siris <- summary(irisBIC,iriss) emp <- extract.mixturepars(siris,2) emp$pro round(emp$mean,digits=1) emp$variance$modelName round(emp$variance$scale,digits=2) } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/clusterbenchstats.Rd0000644000176200001440000003046213731163267015642 0ustar liggesusers\name{clusterbenchstats} \alias{clusterbenchstats} \alias{print.clusterbenchstats} %- Also NEED an `\alias' for EACH other topic documented here. \title{Run and validate many clusterings} \description{ This runs the methodology explained in Hennig (2019), Akhanli and Hennig (2020). It runs a user-specified set of clustering methods (CBI-functions, see \code{\link{kmeansCBI}}) with several numbers of clusters on a dataset, and computes many cluster validation indexes. In order to explore the variation of these indexes, random clusterings on the data are generated, and validation indexes are standardised by use of the random clusterings in order to make them comparable and differences between values interpretable. The function \code{\link{print.valstat}} can be used to provide weights for the cluster validation statistics, and will then compute a weighted validation index that can be used to compare all clusterings. See the examples for how to get the indexes A1 and A2 from Akhanli and Hennig (2020). 
}
\usage{
clusterbenchstats(data,G,diss = inherits(data, "dist"),
                  scaling=TRUE, clustermethod,
                  methodnames=clustermethod,
                  distmethod=rep(TRUE,length(clustermethod)),
                  ncinput=rep(TRUE,length(clustermethod)),
                  clustermethodpars,
                  npstats=FALSE,
                  useboot=FALSE,
                  bootclassif=NULL,
                  bootmethod="nselectboot",
                  bootruns=25,
                  trace=TRUE, pamcrit=TRUE,snnk=2,
                  dnnk=2,
                  nnruns=100,kmruns=100,fnruns=100,avenruns=100,
                  multicore=FALSE,cores=detectCores()-1,
                  useallmethods=TRUE,
                  useallg=FALSE,...)
\method{print}{clusterbenchstats}(x,...)
}
%- maybe also `usage' for other objects documented here.
\arguments{
  \item{data}{data matrix or \code{dist}-object.}
  \item{G}{vector of integers. Numbers of clusters to consider.}
  \item{diss}{logical. If \code{TRUE}, the data matrix is assumed to be
    a distance/dissimilarity matrix, otherwise it's observations times
    variables.}
  \item{scaling}{either a logical or a numeric vector of length equal to
    the number of columns of \code{data}. If \code{FALSE}, data won't be
    scaled, otherwise \code{scaling} is passed on to \code{\link{scale}}
    as argument \code{scale}.}
  \item{clustermethod}{vector of strings specifying names of
    CBI-functions (see \code{\link{kmeansCBI}}). These are the
    clustering methods to be applied.}
  \item{methodnames}{vector of strings with user-chosen names for
    clustering methods, one for every method in
    \code{clustermethod}. These can be used to distinguish different
    methods run by the same CBI-function but with different parameter
    values such as complete and average linkage for
    \code{\link{hclustCBI}}.}
  \item{distmethod}{vector of logicals, of the same length as
    \code{clustermethod}. \code{TRUE} means that the clustering method
    operates on distances. If \code{diss=TRUE}, all entries have to be
    \code{TRUE}. Otherwise, if an entry is true, the corresponding
    method will be applied on \code{dist(data)}.}
  \item{ncinput}{vector of logicals, of the same length as
    \code{clustermethod}. \code{TRUE} indicates that the corresponding
    clustering method requires the number of clusters as input and will
    not estimate the number of clusters itself. Only methods for which
    this is \code{TRUE} can be used with \code{useboot=TRUE}.}
  \item{clustermethodpars}{list of the same length as
    \code{clustermethod}. Specifies parameters for all involved
    clustering methods. Its jth entry is passed to clustermethod number
    j. Can be an empty entry in case all defaults are used for a
    clustering method. However, the last entry is not allowed to be
    empty (you may just set a parameter of the last clustering method
    to its default value if you don't want to specify anything else)!
    The number of clusters does not need to be specified here.}
  \item{npstats}{logical. If \code{TRUE}, \code{\link{distrsimilarity}}
    is called and the two validity statistics computed there are
    added. These require \code{diss=FALSE}.}
  \item{useboot}{logical. If \code{TRUE}, a stability index (either
    \code{nselectboot} or \code{prediction.strength}) will be involved.}
  \item{bootclassif}{If \code{useboot=TRUE}, a vector of strings
    indicating the classification methods to be used with the stability
    index for the different methods indicated in \code{clustermethod},
    see the \code{classification} argument of \code{nselectboot} and
    \code{prediction.strength}.}
  \item{bootmethod}{either \code{"nselectboot"} or
    \code{"prediction.strength"}; stability index to be used if
    \code{useboot=TRUE}.}
  \item{bootruns}{integer. Number of resampling runs. 
If \code{useboot=TRUE}, passed on as \code{B} to \code{\link{nselectboot}} or \code{M} to \code{\link{prediction.strength}}. Note that these are applied to all \code{kmruns+nnruns+avenruns+fnruns} random clusterings on top of the regular ones, which may take a lot of time if \code{bootruns} and these values are chosen large.} \item{trace}{logical. If \code{TRUE}, some runtime information is printed.} \item{pamcrit}{logical. If \code{TRUE}, the average distance of points to their respective cluster centroids is computed (criterion of the PAM clustering method, validation criterion \code{pamc}); centroids are chosen so that they minimise this criterion for the given clustering. Passed on to \code{\link{cqcluster.stats}}.} \item{snnk}{integer. Number of neighbours used in coefficient of variation of distance to nearest within cluster neighbour, the \code{cvnnd}-statistic (clusters with \code{snnk} or fewer points are ignored for this). Passed on to \code{\link{cqcluster.stats}} as argument \code{nnk}.} \item{dnnk}{integer. Number of nearest neighbors to use for dissimilarity to the uniform in case that \code{npstats=TRUE}; \code{nnk}-argument to be passed on to \code{\link{distrsimilarity}}.} \item{nnruns}{integer. Number of runs of \code{\link{stupidknn}} (random clusterings). With \code{useboot=TRUE} one may want to choose this lower than the default for reasons of computation time.} \item{kmruns}{integer. Number of runs of \code{\link{stupidkcentroids}} (random clusterings). With \code{useboot=TRUE} one may want to choose this lower than the default for reasons of computation time.} \item{fnruns}{integer. Number of runs of \code{\link{stupidkfn}} (random clusterings). With \code{useboot=TRUE} one may want to choose this lower than the default for reasons of computation time.} \item{avenruns}{integer. Number of runs of \code{\link{stupidkaven}} (random clusterings). With \code{useboot=TRUE} one may want to choose this lower than the default for reasons of computation time.} \item{multicore}{logical. If \code{TRUE}, parallel computing is used through the function \code{\link{mclapply}} from package \code{parallel}; read warnings there if you intend to use this; it won't work on Windows.} \item{cores}{integer. Number of cores for parallelisation.} \item{useallmethods}{logical, to be passed on to \code{\link{cgrestandard}}. If \code{FALSE}, only random clustering results are used for standardisation. If \code{TRUE}, clustering results from all methods are used.} \item{useallg}{logical to be passed on to \code{\link{cgrestandard}}. If \code{TRUE}, standardisation uses results from all numbers of clusters in \code{G}. If \code{FALSE}, standardisation of results for a specific number of cluster only uses results from that number of clusters.} \item{...}{further arguments to be passed on to \code{\link{cqcluster.stats}} through \code{\link{clustatsum}} (no effect in \code{print.clusterbenchstats}).} \item{x}{object of class \code{"clusterbenchstats"}.} } \note{ This may require a lot of computing time and also memory for datasets that are not small, as most indexes require computation and storage of distances. } \value{ The output of \code{clusterbenchstats} is a big list of lists comprising lists \code{cm, stat, sim, qstat, sstat} \item{cm}{output object of \code{\link{cluster.magazine}}, see there for details. Clustering of all methods and numbers of clusters on the dataset \code{data}.}. \item{stat}{object of class \code{"valstat"}, see \code{\link{valstat.object}} for details. 
Unstandardised cluster validation statistics.} \item{sim}{output object of \code{\link{randomclustersim}}, see there. validity indexes from random clusterings used for standardisation of validation statistics on \code{data}.} \item{qstat}{object of class \code{"valstat"}, see \code{\link{valstat.object}} for details. Cluster validation statistics standardised by random clusterings, output of \code{\link{cgrestandard}} based on percentages, i.e., with \code{percentage=TRUE}.} \item{sstat}{object of class \code{"valstat"}, see \code{\link{valstat.object}} for details. Cluster validation statistics standardised by random clusterings, output of \code{\link{cgrestandard}} based on mean and standard deviation (called Z-score standardisation in Akhanli and Hennig (2020), i.e., with \code{percentage=FALSE}.} } \references{ Hennig, C. (2019) Cluster validation by measurement of clustering characteristics relevant to the user. In C. H. Skiadas (ed.) \emph{Data Analysis and Applications 1: Clustering and Regression, Modeling-estimating, Forecasting and Data Mining, Volume 2}, Wiley, New York 1-24, \url{https://arxiv.org/abs/1703.09282} Akhanli, S. and Hennig, C. (2020) Calibrating and aggregating cluster validity indexes for context-adapted comparison of clusterings. \emph{Statistics and Computing}, 30, 1523-1544, \url{https://link.springer.com/article/10.1007/s11222-020-09958-2}, \url{https://arxiv.org/abs/2002.01822} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{valstat.object}}, \code{\link{cluster.magazine}}, \code{\link{kmeansCBI}}, \code{\link{cqcluster.stats}}, \code{\link{clustatsum}}, \code{\link{cgrestandard}} } \examples{ set.seed(20000) options(digits=3) face <- rFace(10,dMoNo=2,dNoEy=0,p=2) clustermethod=c("kmeansCBI","hclustCBI") # A clustering method can be used more than once, with different # parameters clustermethodpars <- list() clustermethodpars[[2]] <- list() clustermethodpars[[2]]$method <- "average" # Last element of clustermethodpars needs to have an entry! methodname <- c("kmeans","average") cbs <- clusterbenchstats(face,G=2:3,clustermethod=clustermethod, methodname=methodname,distmethod=rep(FALSE,2), clustermethodpars=clustermethodpars,nnruns=1,kmruns=1,fnruns=1,avenruns=1) print(cbs) print(cbs$qstat,aggregate=TRUE,weights=c(1,0,0,0,0,1,0,1,0,1,0,1,0,0,1,1)) # The weights are weights for the validation statistics ordered as in # cbs$qstat$statistics for computation of an aggregated index, see # ?print.valstat. 
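# Illustrative check (uses the cbs object computed above): list the names of
# the validation statistics, in the same order as the weights vector above.
cbs$qstat$statistics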
# Now using bootstrap stability assessment as in Akhanli and Hennig (2020): bootclassif <- c("centroid","averagedist") cbsboot <- clusterbenchstats(face,G=2:3,clustermethod=clustermethod, methodname=methodname,distmethod=rep(FALSE,2), clustermethodpars=clustermethodpars, useboot=TRUE,bootclassif=bootclassif,bootmethod="nselectboot", bootruns=2,nnruns=1,kmruns=1,fnruns=1,avenruns=1,useallg=TRUE) print(cbsboot) \dontrun{ # Index A1 in Akhanli and Hennig (2020) (need these weights choices): print(cbsboot$sstat,aggregate=TRUE,weights=c(1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0)) # Index A2 in Akhanli and Hennig (2020) (need these weights choices): print(cbsboot$sstat,aggregate=TRUE,weights=c(0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0)) } # Results from nselectboot: plot(cbsboot$stat,cbsboot$sim,statistic="boot") } \keyword{multivariate}% at least one, from doc/KEYWORDS \keyword{cluster}% __ONLY ONE__ keyword per line fpc/man/dudahart2.Rd0000644000176200001440000000276213467541512013751 0ustar liggesusers\name{dudahart2} \alias{dudahart2} %- Also NEED an `\alias' for EACH other topic documented here. \title{Duda-Hart test for splitting} \description{ Duda-Hart test for whether a data set should be split into two clusters. } \usage{ dudahart2(x,clustering,alpha=0.001) } %- maybe also `usage' for other objects documented here. \arguments{ \item{x}{data matrix or data frame.} \item{clustering}{vector of integers. Clustering into two clusters.} \item{alpha}{numeric between 0 and 1. Significance level (recommended to be small if this is used for estimating the number of clusters).} } \value{ A list with components \item{p.value}{p-value against the null hypothesis of homogeneity.} \item{dh}{ratio of within-cluster sum of squares for two clusters and overall sum of squares.} \item{compare}{critical value for \code{dh} at level \code{alpha}.} \item{cluster1}{\code{FALSE} if the null hypothesis of homogeneity is rejected.} \item{alpha}{see above.} \item{z}{\code{1-alpha}-quantile of a standard Gaussian.} } \references{ Duda, R. O. and Hart, P. E. (1973) \emph{Pattern Classification and Scene Analysis}. Wiley, New York. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en}} \seealso{\code{\link{cluster.stats}}} \examples{ options(digits=2) set.seed(98765) iriss <- iris[sample(150,20),-5] km <- kmeans(iriss,2) dudahart2(iriss,km$cluster) } \keyword{cluster}% __ONLY ONE__ keyword per line fpc/man/solvecov.Rd0000644000176200001440000000211213467541512013730 0ustar liggesusers\name{solvecov} \alias{solvecov} %- Also NEED an `\alias' for EACH other topic documented here. \title{Inversion of (possibly singular) symmetric matrices} \description{ Tries to invert a matrix by \code{solve}. If this fails because of singularity, an eigenvector decomposition is computed, and eigenvalues below \code{1/cmax} are replaced by \code{1/cmax}, i.e., \code{cmax} will be the corresponding eigenvalue of the inverted matrix. } \usage{ solvecov(m, cmax = 1e+10) } %- maybe also `usage' for other objects documented here. 
\arguments{ \item{m}{a numeric symmetric matrix.} \item{cmax}{a positive value, see above.} } \value{ A list with the following components: \item{inv}{the inverted matrix} \item{coll}{\code{TRUE} if \code{solve} failed because of singularity.} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/}} \seealso{\code{\link{solve}}, \code{\link{eigen}}} \examples{ x <- c(1,0,0,1,0,1,0,0,1) dim(x) <- c(3,3) solvecov(x) } \keyword{array}% at least one, from doc/KEYWORDS fpc/man/mahalanodisc.Rd0000644000176200001440000000230013470377140014510 0ustar liggesusers\name{mahalanodisc} \alias{mahalanodisc} %- Also NEED an `\alias' for EACH other topic documented here. \title{Mahalanobis for AWC} \description{ Vector of Mahalanobis distances or their root. For use in \code{awcoord} only. } \usage{ mahalanodisc(x2, mg, covg, modus="square") } %- maybe also `usage' for other objects documented here. \arguments{ \item{x2}{numerical data matrix.} \item{mg}{mean vector.} \item{covg}{covariance matrix.} \item{modus}{"md" (roots of Mahalanobis distances) or "square" (original squared form of Mahalanobis distances).} } \details{ The covariance matrix is inverted by use of \code{\link{solvecov}}, which can be expected to give reasonable results for singular within-class covariance matrices. } % \details{ % } \value{ vector of (rooted) Mahalanobis distances. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{awcoord}}, \code{\link{solvecov}} } \examples{ options(digits=3) x <- cbind(rnorm(50),rnorm(50)) mahalanodisc(x,c(0,0),cov(x)) mahalanodisc(x,c(0,0),matrix(0,ncol=2,nrow=2)) } \keyword{multivariate}% at least one, from doc/KEYWORDS fpc/man/fpc-package.Rd0000644000176200001440000002026113731163337014235 0ustar liggesusers\name{fpc-package} \alias{fpc-package} %- Also NEED an `\alias' for EACH other topic documented here. \docType{package} \title{fpc package overview} \description{ Here is a list of the main functions in package fpc. Most other functions are auxiliary functions for these. } \section{Clustering methods}{ \describe{ \item{dbscan}{Computes DBSCAN density based clustering as introduced in Ester et al. (1996).} \item{fixmahal}{Mahalanobis Fixed Point Clustering, Hennig and Christlieb (2002), Hennig (2005).} \item{fixreg}{Regression Fixed Point Clustering, Hennig (2003).} \item{flexmixedruns}{This fits a latent class model to data with mixed type continuous/nominal variables. Actually it calls a method for \code{\link[flexmix]{flexmix}}.} \item{mergenormals}{Clustering by merging components of a Gaussian mixture, see Hennig (2010).} \item{regmix}{ML-fit of a mixture of linear regression models, see DeSarbo and Cron (1988).} }} \section{Cluster validity indexes and estimation of the number of clusters}{ \describe{ \item{cluster.stats}{This computes several cluster validity statistics from a clustering and a dissimilarity matrix including the Calinski-Harabasz index, the adjusted Rand index and other statistics explained in Gordon (1999) as well as several characterising measures such as average between cluster and within cluster dissimilarity and separation. See also \code{\link{calinhara}}, \code{\link{dudahart2}} for specific indexes, and a new version \code{\link{cqcluster.stats}} that computes some more indexes and statistics used for computing them. 
There's also \code{\link{distrsimilarity}}, which computes within-cluster dissimilarity to the Gaussian and uniform distribution.} \item{prediction.strength}{Estimates the number of clusters by computing the prediction strength of a clustering of a dataset into different numbers of components for various clustering methods, see Tibshirani and Walther (2005). In fact, this is more flexible than what is in the original paper, because it can use point classification schemes that work better with clustering methods other than k-means.} \item{nselectboot}{Estimates the number of clusters by bootstrap stability selection, see Fang and Wang (2012). This is quite flexible regarding clustering methods and point classification schemes and also allows for dissimilarity data.} \item{clusterbenchstats}{This runs many clustering methods (to be specifed by the user) with many numbers of clusters on a dataset and produces standardised and comparable versions of many cluster validity indexes (see Hennig 2019, Akhanli and Hennig 2020). This is done by means of producing random clusterings on the given data, see \code{\link{stupidkcentroids}} and \code{\link{stupidknn}}. It allows to compare many clusterings based on many different potential desirable features of a clustering. \code{\link{print.valstat}} allows to compute an aggregated index with user-specified weights.} }} \section{Cluster visualisation and validation}{ \describe{ \item{clucols}{Sets of colours and symbols useful for cluster plotting.} \item{clusterboot}{Cluster-wise stability assessment of a clustering. Clusterings are performed on resampled data to see for every cluster of the original dataset how well this is reproduced. See Hennig (2007) for details.} \item{cluster.varstats}{Extracts variable-wise information for every cluster in order to help with cluster interpretation.} \item{plotcluster}{Visualisation of a clustering or grouping in data by various linear projection methods that optimise the separation between clusters, or between a single cluster and the rest of the data according to Hennig (2004) including classical methods such as discriminant coordinates. This calls the function \code{\link{discrproj}}, which is a bit more flexible but doesn't produce a plot itself.} \item{ridgeline.diagnosis}{Plots and diagnostics for assessing modality of Gaussian mixtures, see Ray and Lindsay (2005).} \item{weightplots}{Plots to diagnose component separation in Gaussian mixtures, see Hennig (2010).} \item{localshape}{Local shape matrix, can be used for finding clusters in connection with function \code{ics} in package \code{ICS}, see Hennig's discussion and rejoinder of Tyler et al. (2009).} }} \section{Useful wrapper functions for clustering methods}{ \describe{ \item{kmeansCBI}{This and other "CBI"-functions (see the \code{\link{kmeansCBI}}-help page) are unified wrappers for various clustering methods in R that may be useful because they do in one step for what you normally may need to do a bit more in R (for example fitting a Gaussian mixture with noise component in package mclust).} \item{kmeansruns}{This calls \code{\link{kmeans}} for the k-means clustering method and includes estimation of the number of clusters and finding an optimal solution from several starting points.} \item{pamk}{This calls \code{\link[cluster]{pam}} and \code{\link[cluster]{clara}} for the partitioning around medoids clustering method (Kaufman and Rouseeuw, 1990) and includes two different ways of estimating the number of clusters.} }} \references{ Akhanli, S. 
and Hennig, C. (2020) Calibrating and aggregating cluster validity indexes for context-adapted comparison of clusterings. \emph{Statistics and Computing}, 30, 1523-1544, \url{https://link.springer.com/article/10.1007/s11222-020-09958-2}, \url{https://arxiv.org/abs/2002.01822} DeSarbo, W. S. and Cron, W. L. (1988) A maximum likelihood methodology for clusterwise linear regression, \emph{Journal of Classification} 5, 249-282. Ester, M., Kriegel, H.-P., Sander, J. and Xu, X. (1996). A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise. \emph{Proceedings of 2nd International Conference on Knowledge Discovery and Data Mining (KDD-96).} Fang, Y. and Wang, J. (2012) Selection of the number of clusters via the bootstrap method. \emph{Computational Statistics and Data Analysis}, 56, 468-477. Gordon, A. D. (1999) \emph{Classification}, 2nd ed. Chapman and Hall. Hennig, C. (2003) Clusters, outliers and regression: fixed point clusters, \emph{Journal of Multivariate Analysis} 86, 183-212. Hennig, C. (2004) Asymmetric linear dimension reduction for classification. \emph{Journal of Computational and Graphical Statistics}, 13, 930-945 . Hennig, C. (2005) Fuzzy and Crisp Mahalanobis Fixed Point Clusters, in Baier, D., Decker, R., and Schmidt-Thieme, L. (eds.): \emph{Data Analysis and Decision Support}. Springer, Heidelberg, 47-56. Hennig, C. (2007) Cluster-wise assessment of cluster stability. \emph{Computational Statistics and Data Analysis}, 52, 258-271. Hennig, C. (2010) Methods for merging Gaussian mixture components, \emph{Advances in Data Analysis and Classification}, 4, 3-34. Hennig, C. (2019) Cluster validation by measurement of clustering characteristics relevant to the user. In C. H. Skiadas (ed.) \emph{Data Analysis and Applications 1: Clustering and Regression, Modeling-estimating, Forecasting and Data Mining, Volume 2}, Wiley, New York 1-24, \url{https://arxiv.org/abs/1703.09282} Hennig, C. and Christlieb, N. (2002) Validating visual clusters in large datasets: Fixed point clusters of spectral features, \emph{Computational Statistics and Data Analysis} 40, 723-739. Kaufman, L. and Rousseeuw, P.J. (1990). "Finding Groups in Data: An Introduction to Cluster Analysis". Wiley, New York. Ray, S. and Lindsay, B. G. (2005) The Topography of Multivariate Normal Mixtures, \emph{Annals of Statistics}, 33, 2042-2065. Tibshirani, R. and Walther, G. (2005) Cluster Validation by Prediction Strength, \emph{Journal of Computational and Graphical Statistics}, 14, 511-528. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } fpc/man/clucols.Rd0000644000176200001440000000277213467541512013550 0ustar liggesusers\name{clucols} \alias{clucols} \alias{clugrey} \alias{clusym} %- Also NEED an `\alias' for EACH other topic documented here. \title{Sets of colours and symbols for cluster plotting} \description{ \code{clucols} gives out a vector of different random colours. \code{clugrey} gives out a vector of equidistant grey scales. \code{clusym} is a vector of different symbols starting from "1", "2",... } \usage{ clucols(i, seed=NULL) clugrey(i,max=0.9) clusym } %- maybe also `usage' for other objects documented here. \arguments{ \item{i}{integer. Length of output vector (number of clusters).} \item{seed}{integer. Random seed.} \item{max}{between 0 and 1. Maximum grey scale value, see \code{\link{grey}} (close to 1 is bright).} } \value{ \code{clucols} gives out a vector of different random colours. 
\code{clugrey} gives out a vector of equidistant grey scales. \code{clusym} is a vector of different characters starting from "1", "2",... } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en}} \examples{ set.seed(112233) require(MASS) require(flexmix) data(Cars93) Cars934 <- Cars93[,c(3,5,8,10)] cc <- discrete.recode(Cars934,xvarsorted=FALSE,continuous=c(2,3),discrete=c(1,4)) fcc <- flexmix(cc$data~1,k=3, model=lcmixed(continuous=2,discrete=2,ppdim=c(6,3),diagonal=TRUE)) plot(Cars934[,c(2,3)],col=clucols(3)[fcc@cluster],pch=clusym[fcc@cluster]) } \keyword{cluster}% __ONLY ONE__ keyword per line fpc/man/adcoord.Rd0000644000176200001440000000507713467541512013520 0ustar liggesusers\name{adcoord} \alias{adcoord} %- Also NEED an `\alias' for EACH other topic documented here. \title{Asymmetric discriminant coordinates} \description{ Asymmetric discriminant coordinates as defined in Hennig (2003). Asymmetric discriminant projection means that there are two classes, one of which is treated as the homogeneous class (i.e., it should appear homogeneous and separated in the resulting projection) while the other may be heterogeneous. The principle is to maximize the ratio between the projection of a between classes separation matrix and the projection of the covariance matrix within the homogeneous class. } \usage{ adcoord(xd, clvecd, clnum=1) } %- maybe also `usage' for other objects documented here. \arguments{ \item{xd}{the data matrix; a numerical object which can be coerced to a matrix.} \item{clvecd}{integer vector of class numbers; length must equal \code{nrow(xd)}.} \item{clnum}{integer. Number of the homogeneous class.} } \details{ The square root of the homogeneous classes covariance matrix is inverted by use of \code{\link{tdecomp}}, which can be expected to give reasonable results for singular within-class covariance matrices. } % \details{ % } \value{ List with the following components \item{ev}{eigenvalues in descending order.} \item{units}{columns are coordinates of projection basis vectors. New points \code{x} can be projected onto the projection basis vectors by \code{x \%*\% units}} \item{proj}{projections of \code{xd} onto \code{units}.} } \references{ Hennig, C. (2004) Asymmetric linear dimension reduction for classification. Journal of Computational and Graphical Statistics 13, 930-945 . Hennig, C. (2005) A method for visual cluster validation. In: Weihs, C. and Gaul, W. (eds.): Classification - The Ubiquitous Challenge. Springer, Heidelberg 2005, 153-160. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{plotcluster}} for straight forward discriminant plots. \code{\link{discrproj}} for alternatives. \code{\link{rFace}} for generation of the example data used below. } \examples{ set.seed(4634) face <- rFace(600,dMoNo=2,dNoEy=0) grface <- as.integer(attr(face,"grouping")) adcf <- adcoord(face,grface==2) adcf2 <- adcoord(face,grface==4) plot(adcf$proj,col=1+(grface==2)) plot(adcf2$proj,col=1+(grface==4)) # ...done in one step by function plotcluster. } \keyword{multivariate}% at least one, from doc/KEYWORDS \keyword{classif}% __ONLY ONE__ keyword per line fpc/man/lcmixed.Rd0000644000176200001440000001035413467541512013524 0ustar liggesusers\name{lcmixed} \alias{lcmixed} %- Also NEED an `\alias' for EACH other topic documented here. 
\title{flexmix method for mixed Gaussian/multinomial mixtures} \description{ \code{lcmixed} is a method for the \code{\link[flexmix]{flexmix}}-function in package \code{flexmix}. It provides the necessary information to run an EM-algorithm for maximum likelihood estimation for a latent class mixture (clustering) model where some variables are continuous and modelled within the mixture components by Gaussian distributions and some variables are categorical and modelled within components by independent multinomial distributions. \code{lcmixed} can be called within \code{flexmix}. The function \code{\link{flexmixedruns}} is a wrapper function that can be run to apply \code{lcmixed}. Note that at least one categorical variable is needed, but it is possible to use data without continuous variables. There are further format restrictions to the data (see below in the documentation of \code{continuous} and \code{discrete}), which can be ignored when running \code{lcmixed} through \code{\link{flexmixedruns}}. } \usage{ lcmixed( formula = .~. , continuous, discrete, ppdim, diagonal = TRUE, pred.ordinal=FALSE, printlik=FALSE ) } %- maybe also `usage' for other objects documented here. \arguments{ \item{formula}{a formula to specify response and explanatory variables. For \code{lcmixed} this always has the form \code{x~1}, where \code{x} is a matrix or data frame of all variables to be involved, because regression and explanatory variables are not implemented.} \item{continuous}{number of continuous variables. Note that the continuous variables always need to be the first variables in the matrix or data frame.} \item{discrete}{number of categorical variables. Always the last variables in the matrix or data frame. Note that categorical variables always must be coded as integers 1,2,3, etc. without interruption.} \item{ppdim}{vector of integers specifying the number of (in the data) existing categories for each categorical variable.} \item{diagonal}{logical. If \code{TRUE}, Gaussian models are fitted restricted to diagonal covariance matrices. Otherwise, covariance matrices are unrestricted. \code{TRUE} is consistent with the "within class independence" assumption for the multinomial variables.} \item{pred.ordinal}{logical. If \code{FALSE}, the within-component predicted value for categorical variables is the probability mode, otherwise it is the mean of the standard (1,2,3,...) scores, which may be better for ordinal variables.} \item{printlik}{logical. If \code{TRUE}, the loglikelihood is printed out whenever computed.} } \details{ The data need to be organised case-wise, i.e., if there are categorical variables only, and 15 cases with values c(1,1,2) on the 3 variables, the data matrix needs 15 rows with values 1 1 2. General documentation on flexmix methods can be found in Chapter 4 of Friedrich Leisch's "FlexMix: A General Framework for Finite Mixture Models and Latent Class Regression in R", \url{https://CRAN.R-project.org/package=flexmix} } \value{ An object of class \code{FLXMC} (not documented; only used internally by \code{flexmix}). } \references{ Hennig, C. and Liao, T. (2013) How to find an appropriate clustering for mixed-type variables with application to socio-economic stratification, \emph{Journal of the Royal Statistical Society, Series C Applied Statistics}, 62, 309-369. 
} \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en}} \seealso{ \code{\link{flexmixedruns}}, \code{\link[flexmix]{flexmix}}, \code{\link[flexmix]{flexmix-class}}, \code{\link{discrete.recode}}, which recodes a dataset into the format required by \code{lcmixed} } \examples{ set.seed(112233) options(digits=3) require(MASS) require(flexmix) data(Cars93) Cars934 <- Cars93[,c(3,5,8,10)] cc <- discrete.recode(Cars934,xvarsorted=FALSE,continuous=c(2,3),discrete=c(1,4)) fcc <- flexmix(cc$data~1,k=2, model=lcmixed(continuous=2,discrete=2,ppdim=c(6,3),diagonal=TRUE)) summary(fcc) } \keyword{cluster}% __ONLY ONE__ keyword per line fpc/man/localshape.Rd0000644000176200001440000000344113467541512014211 0ustar liggesusers\name{localshape} \alias{localshape} \title{Local shape matrix} \description{ This computes a matrix formalising 'local shape', i.e., aggregated standardised variance/covariance in a Mahalanobis neighbourhood of the data points. This can be used for finding clusters when used as one of the covariance matrices in Invariant Coordinate Selection (function \code{ics} in package \code{ICS}), see Hennig's discussion and rejoinder of Tyler et al. (2009). } \usage{ localshape(xdata,proportion=0.1,mscatter="mcd",mcdalpha=0.8, covstandard="det") } \arguments{ \item{xdata}{objects times variables data matrix.} \item{proportion}{proportion of points to be considered as neighbourhood.} \item{mscatter}{"mcd" or "cov"; specified minimum covariance determinant or classical covariance matrix to be used for Mahalanobis distance computation.} \item{mcdalpha}{if \code{mscatter="mcd"}, this is the alpha parameter to be used by the MCD covariance matrix, i.e. one minus the asymptotic breakdown point, see \code{\link[robustbase]{covMcd}}.} \item{covstandard}{one of "trace", "det" or "none", determining by what constant the pointwise neighbourhood covariance matrices are standardised. "det" makes the affine equivariant, as noted in the discussion rejoinder of Tyler et al. (2009).} } \value{ The local shape matrix. } \references{ Tyler, D. E., Critchley, F., Duembgen, L., Oja, H. (2009) Invariant coordinate selection (with discussion). \emph{Journal of the Royal Statistical Society, Series B}, 549-592. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en}} \examples{ options(digits=3) data(iris) localshape(iris[,-5],mscatter="cov") } \keyword{multivariate} fpc/man/clustatsum.Rd0000644000176200001440000001740714355660011014302 0ustar liggesusers\name{clustatsum} \alias{clustatsum} %- Also NEED an `\alias' for EACH other topic documented here. \title{Compute and format cluster validation statistics} \description{ \code{clustatsum} computes cluster validation statistics by running \code{\link{cqcluster.stats}}, and potentially \code{\link{distrsimilarity}}, and collecting some key statistics values with a somewhat different nomenclature. This was implemented as a helper function for use inside of \code{\link{clusterbenchstats}} and \code{\link{cgrestandard}}. } \usage{ clustatsum(datadist=NULL,clustering,noisecluster=FALSE, datanp=NULL,npstats=FALSE,useboot=FALSE, bootclassif=NULL, bootmethod="nselectboot", bootruns=25, cbmethod=NULL,methodpars=NULL, distmethod=NULL,dnnk=2, pamcrit=TRUE,...) } %- maybe also `usage' for other objects documented here. \arguments{ \item{datadist}{distances on which validation-measures are based, \code{dist} object or distance matrix. 
If \code{NULL}, this is computed from \code{datanp}; at least one of \code{datadist} and \code{datanp} must be specified.} \item{clustering}{an integer vector of length of the number of cases, which indicates a clustering. The clusters have to be numbered from 1 to the number of clusters.} \item{noisecluster}{logical. If \code{TRUE}, it is assumed that the largest cluster number in \code{clustering} denotes a 'noise class', i.e. points that do not belong to any cluster. These points are not taken into account for the computation of all functions of within and between cluster distances including the validation indexes.} \item{datanp}{optional observations times variables data matrix, see \code{npstats}.} \item{npstats}{logical. If \code{TRUE}, \code{\link{distrsimilarity}} is called and the two statistics computed there are added to the output. These are based on \code{datanp} and require \code{datanp} to be specified.} \item{useboot}{logical. If \code{TRUE}, a stability index (either \code{nselectboot} or \code{prediction.strength}) will be involved.} \item{bootclassif}{If \code{useboot=TRUE}, a string indicating the classification method to be used with the stability index, see the \code{classification} argument of \code{nselectboot} and \code{prediction.strength}.} \item{bootmethod}{either \code{"nselectboot"} or \code{"prediction.strength"}; stability index to be used if \code{useboot=TRUE}.} \item{bootruns}{integer. Number of resampling runs. If \code{useboot=TRUE}, passed on as \code{B} to \code{\link{nselectboot}} or \code{M} to \code{\link{prediction.strength}}.} \item{cbmethod}{CBI-function (see \code{\link{kmeansCBI}}); clustering method to be used for stability assessment if \code{useboot=TRUE}.} \item{methodpars}{parameters to be passed on to \code{cbmethod}.} \item{distmethod}{logical. In case of \code{useboot=TRUE} indicates whether \code{cbmethod} will interpret data as distances.} \item{dnnk}{\code{nnk}-argument to be passed on to \code{\link{distrsimilarity}}.} \item{pamcrit}{\code{pamcrit}-argument to be passed on to \code{\link{cqcluster.stats}}.} \item{...}{further arguments to be passed on to \code{\link{cqcluster.stats}}.} } \value{ \code{clustatsum} returns a list. The components, as listed below, are outputs of \code{\link{summary.cquality}} with default parameters, which means that they are partly transformed versions of those given out by \code{\link{cqcluster.stats}}, i.e., their range is between 0 and 1 and large values are good. Those from \code{\link{distrsimilarity}} are computed with \code{largeisgood=TRUE}, correspondingly. \item{avewithin}{average distance within clusters (reweighted so that every observation, rather than every distance, has the same weight).} \item{mnnd}{average distance to \code{nnk}th nearest neighbour within cluster.} \item{cvnnd}{coefficient of variation of dissimilarities to \code{nnk}th nearest within-cluster neighbour, measuring uniformity of within-cluster densities, weighted over all clusters, see Sec. 3.7 of Hennig (2019).} \item{maxdiameter}{maximum cluster diameter.} \item{widestgap}{widest within-cluster gap or average of cluster-wise widest within-cluster gap, depending on parameter \code{averagegap}.} \item{sindex}{separation index, see argument \code{sepindex}.} \item{minsep}{minimum cluster separation.} \item{asw}{average silhouette width. See \code{\link{silhouette}}.} \item{dindex}{this index measures to what extent the density decreases from the cluster mode to the outskirts; I-densdec in Sec. 
3.6 of Hennig (2019).} \item{denscut}{this index measures whether cluster boundaries run through density valleys; I-densbound in Sec. 3.6 of Hennig (2019).} \item{highdgap}{this measures whether there is a large within-cluster gap with high density on both sides; I-highdgap in Sec. 3.6 of Hennig (2019).} \item{pearsongamma}{correlation between distances and a 0-1-vector where 0 means same cluster, 1 means different clusters. "Normalized gamma" in Halkidi et al. (2001).} \item{withinss}{a generalisation of the within clusters sum of squares (k-means objective function), which is obtained if \code{d} is a Euclidean distance matrix. For general distance measures, this is half the sum of the within cluster squared dissimilarities divided by the cluster size.} \item{entropy}{entropy of the distribution of cluster memberships, see Meila (2007).} \item{pamc}{average distance to cluster centroid.} \item{kdnorm}{Kolmogorov distance between distribution of within-cluster Mahalanobis distances and appropriate chi-squared distribution, aggregated over clusters (I am grateful to Agustin Mayo-Iscar for the idea).} \item{kdunif}{Kolmogorov distance between distribution of distances to \code{nnk}th nearest within-cluster neighbour and appropriate Gamma-distribution, see Byers and Raftery (1998), aggregated over clusters.} \item{boot}{if \code{useboot=TRUE}, stability value; \code{stabk} for method \code{\link{nselectboot}}; \code{mean.pred} for method \code{\link{prediction.strength}}.} } \references{ Akhanli, S. and Hennig, C. (2020) Calibrating and aggregating cluster validity indexes for context-adapted comparison of clusterings. \emph{Statistics and Computing}, 30, 1523-1544, \url{https://link.springer.com/article/10.1007/s11222-020-09958-2}, \url{https://arxiv.org/abs/2002.01822} Byers, S. and Raftery, A. E. (1998) Nearest-Neighbor Clutter Removal for Estimating Features in Spatial Point Processes, \emph{Journal of the American Statistical Association}, 93, 577-584. Halkidi, M., Batistakis, Y., Vazirgiannis, M. (2001) On Clustering Validation Techniques, \emph{Journal of Intelligent Information Systems}, 17, 107-145. Hennig, C. (2019) Cluster validation by measurement of clustering characteristics relevant to the user. In C. H. Skiadas (ed.) \emph{Data Analysis and Applications 1: Clustering and Regression, Modeling-estimating, Forecasting and Data Mining, Volume 2}, Wiley, New York 1-24, \url{https://arxiv.org/abs/1703.09282} Kaufman, L. and Rousseeuw, P.J. (1990). "Finding Groups in Data: An Introduction to Cluster Analysis". Wiley, New York. Meila, M. (2007) Comparing clusterings - an information based distance, \emph{Journal of Multivariate Analysis}, 98, 873-895. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{cqcluster.stats}}, \code{\link{distrsimilarity}} } \examples{ set.seed(20000) options(digits=3) face <- rFace(20,dMoNo=2,dNoEy=0,p=2) dface <- dist(face) complete3 <- cutree(hclust(dface),3) clustatsum(dface,complete3) } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/cdbw.Rd0000644000176200001440000000413113467541512013012 0ustar liggesusers\name{cdbw} \alias{cdbw} %- Also NEED an `\alias' for EACH other topic documented here. \title{CDbw-index for cluster validation} \description{ CDbw-index for cluster validation, as defined in Halkidi and Vazirgiannis (2008), Halkidi et al. (2015). } \usage{ cdbw(x,clustering,r=10,s=seq(0.1,0.8,by=0.1), clusterstdev=TRUE,trace=FALSE) } %- maybe also `usage' for other objects documented here. \arguments{ \item{x}{something that can be coerced into a numerical matrix. 
Euclidean dataset.} \item{clustering}{vector of integers with length \code{=nrow(x)}; indicating the cluster for each observation.} \item{r}{integer. Number of cluster border representatives.} \item{s}{numerical vector of shrinking factors (between 0 and 1).} \item{clusterstdev}{logical. If \code{TRUE}, the neighborhood radius for intra-cluster density is the within-cluster estimated squared distance from the mean of the cluster; otherwise it is the average of these over all clusters.} \item{trace}{logical. If \code{TRUE}, results are printed for the steps to compute the index.} } \value{ List with components (see Halkidi and Vazirgiannis (2008), Halkidi et al. (2015) for details) \item{cdbw}{value of CDbw index (the higher the better).} \item{cohesion}{cohesion.} \item{compactness}{compactness.} \item{sep}{separation.} } \references{ Halkidi, M. and Vazirgiannis, M. (2008) A density-based cluster validity approach using multi-representatives. \emph{Pattern Recognition Letters} 29, 773-786. Halkidi, M., Vazirgiannis, M. and Hennig, C. (2015) Method-independent indices for cluster validation. In C. Hennig, M. Meila, F. Murtagh, R. Rocci (eds.) \emph{Handbook of Cluster Analysis}, CRC Press/Taylor \code{&} Francis, Boca Raton. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \examples{ options(digits=3) iriss <- as.matrix(iris[c(1:5,51:55,101:105),-5]) irisc <- as.numeric(iris[c(1:5,51:55,101:105),5]) cdbw(iriss,irisc) } \keyword{cluster}% at least one, from doc/KEYWORDS fpc/man/mixdens.Rd0000644000176200001440000000237713467541512013554 0ustar liggesusers\name{mixdens} \alias{mixdens} %- Also NEED an `\alias' for EACH other topic documented here. \title{Density of multivariate Gaussian mixture, mclust parameterisation} \description{ Computes density values for data from a mixture of multivariate Gaussian distributions with parameters based on the way models are specified and parameters are stored in package mclust. } \usage{ mixdens(modelName,data,parameters) } %- maybe also `usage' for other objects documented here. \arguments{ \item{modelName}{an mclust model name. See \code{\link[mclust]{mclustModelNames}}.} \item{data}{data matrix; density values are computed for every observation (row).} \item{parameters}{parameters of Gaussian mixture in the format used in the output of \code{\link[mclust]{summary.mclustBIC}}.} } \value{ Vector of density values for the observations. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \examples{ set.seed(98765) require(mclust) iriss <- iris[sample(150,20),-5] irisBIC <- mclustBIC(iriss) siris <- summary(irisBIC,iriss) round(mixdens(siris$modelName,iriss,siris$parameters),digits=2) } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/mahalconf.Rd0000644000176200001440000000351213467541512014025 0ustar liggesusers\name{mahalconf} \alias{mahalconf} %- Also NEED an `\alias' for EACH other topic documented here. \title{Mahalanobis fixed point clusters initial configuration} \description{ Generates an initial configuration of \code{startn} points from dataset \code{x} for the \code{\link{fixmahal}} fixed point iteration. Thought only for use within \code{\link{fixmahal}}. } \usage{ mahalconf(x, no, startn, covall, plot) } %- maybe also `usage' for other objects documented here. \arguments{ \item{x}{numerical matrix. 
Rows are points, columns are variables.} \item{no}{integer between 1 and \code{nrow(x)}. Number of the first point of the configuration.} \item{startn}{integer between 1 and \code{nrow(x)}.} \item{covall}{covariance matrix for the computation of the first Mahalanobis distances.} \item{plot}{a string. If equal to \code{"start"} or \code{"both"},the first two variables and the first \code{ncol(x)+1} points are plotted.} } \details{ \code{mahalconf} first chooses the \eqn{p} (number of variables) nearest points to point no. \code{no} in terms of the Mahalanobis distance w.r.t. \code{covall}, so that there are \eqn{p+1} points. In every further step, the covariance matrix of the current configuration is computed and the nearest point in terms of the new Mahalanobis distance is added. \code{\link{solvecov}} is used to invert singular covariance matrices. } \value{ A logical vector of length \code{nrow(x)}. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/}} \seealso{\code{\link{fixmahal}}, \code{\link{solvecov}}} \examples{ set.seed(4634) face <- rFace(600,dMoNo=2,dNoEy=0,p=2) mahalconf(face,no=200,startn=20,covall=cov(face),plot="start") } \keyword{multivariate}% at least one, from doc/KEYWORDS \keyword{cluster} fpc/man/bhattacharyya.dist.Rd0000644000176200001440000000233713467541512015667 0ustar liggesusers\name{bhattacharyya.dist} \alias{bhattacharyya.dist} %- Also NEED an `\alias' for EACH other topic documented here. \title{Bhattacharyya distance between Gaussian distributions} \description{ Computes Bhattacharyya distance between two multivariate Gaussian distributions. See Fukunaga (1990). } \usage{ bhattacharyya.dist(mu1, mu2, Sigma1, Sigma2) } %- maybe also `usage' for other objects documented here. \arguments{ \item{mu1}{mean vector of component 1.} \item{mu2}{mean vector of component 2.} \item{Sigma1}{covariance matrix of component 1.} \item{Sigma2}{covariance matrix of component 2.} } \value{ The Bhattacharyya distance between the two Gaussian distributions. } \references{ Fukunaga, K. (1990) \emph{Introduction to Statistical Pattern Recognition}, 2nd edition, Academic Press, New York. Hennig, C. (2010) Methods for merging Gaussian mixture components, \emph{Advances in Data Analysis and Classification}, 4, 3-34. } \note{ Thanks to David Pinto for improving this function. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \examples{ round(bhattacharyya.dist(c(1,1),c(2,5),diag(2),diag(2)),digits=2) } \keyword{multivariate} fpc/man/randconf.Rd0000644000176200001440000000121413467541512013664 0ustar liggesusers\name{randconf} \alias{randconf} %- Also NEED an `\alias' for EACH other topic documented here. \title{Generate a sample indicator vector} \description{ Generates a logical vector of length \code{n} with \code{p TRUE}s. } \usage{ randconf(n, p) } %- maybe also `usage' for other objects documented here. \arguments{ \item{n}{positive integer.} \item{p}{positive integer.} } \value{ A logical vector. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/}} \seealso{\code{\link{sample}}} \examples{ randconf(10,3) } \keyword{distribution}% at least one, from doc/KEYWORDS fpc/man/weightplots.Rd0000644000176200001440000000552013467541512014447 0ustar liggesusers\name{weightplots} \alias{weightplots} %- Also NEED an `\alias' for EACH other topic documented here. 
\title{Ordered posterior plots} \description{ Ordered posterior plots for Gaussian mixture components, see Hennig (2010). } \usage{ weightplots(z, clusternumbers="all", clustercol=2, allcol=grey(0.2+((1:ncol(z))-1)* 0.6/(ncol(z)-1)), lty=rep(1,ncol(z)),clusterlwd=3, legendposition="none", weightcutoff=0.01,ask=TRUE, ...) } %- maybe also `usage' for other objects documented here. \arguments{ \item{z}{matrix with rows corresponding to observations and columns corresponding to mixture components. Entries are probabilities that an observation has been generated by a mixture component. These will normally be estimated a posteriori probabilities, as generated as component \code{z} of the output object from \code{\link[mclust]{summary.mclustBIC}}.} \item{clusternumbers}{\code{"all"} or vector of integers. Numbers of components for which plots are drawn.} \item{clustercol}{colour used for the main components for which a plot is drawn.} \item{allcol}{colours used for respective other components in plots in which they are not main components.} \item{lty}{line types for components.} \item{clusterlwd}{numeric. Line width for main component.} \item{legendposition}{\code{"none"} or vector with two coordinates in the plot, where a legend should be printed.} \item{weightcutoff}{numeric between 0 and 1. Observations are only taken into account for which the posterior probability for the main component is larger than this.} \item{ask}{logical. If \code{TRUE}, it sets \code{par(ask=TRUE)} in the beginning and \code{par(ask=FALSE)} after all plots were showed.} \item{...}{further parameters to be passed on to \code{\link{legend}}.} } \value{ Invisible matrix of posterior probabilities \code{z} from \code{mclustsummary}. } \details{ Shows posterior probabilities for observations belonging to all mixture components on the y-axis, with points ordered by posterior probability for main component. } \references{ Hennig, C. (2010) Methods for merging Gaussian mixture components, \emph{Advances in Data Analysis and Classification}, 4, 3-34. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \examples{ require(mclust) require(MASS) data(crabs) dc <- crabs[,4:8] cm <- mclustBIC(crabs[,4:8],G=9,modelNames="EEE") scm <- summary(cm,crabs[,4:8]) weightplots(scm$z,clusternumbers=1:3,ask=FALSE) weightplots(scm$z,clusternumbers=1:3,allcol=1:9, ask=FALSE, legendposition=c(5,0.7)) # Remove ask=FALSE to have time to watch the plots. } \keyword{multivariate} \keyword{cluster} fpc/man/cweight.Rd0000644000176200001440000000116413467541512013530 0ustar liggesusers\name{cweight} \alias{cweight} %- Also NEED an `\alias' for EACH other topic documented here. \title{Weight function for AWC} \description{ For use in \code{awcoord} only. } \usage{ cweight(x, ca) } %- maybe also `usage' for other objects documented here. \arguments{ \item{x}{numerical.} \item{ca}{numerical.} } % \details{ % } \value{ \code{ca/x} if smaller than 1, else 1. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{awcoord}} } \examples{ cweight(4,1) } \keyword{arith}% at least one, from doc/KEYWORDS fpc/man/stupidkcentroids.Rd0000644000176200001440000000415514355665343015505 0ustar liggesusers\name{stupidkcentroids} \alias{stupidkcentroids} %- Also NEED an `\alias' for EACH other topic documented here. 
\title{Stupid k-centroids random clustering} \description{ Picks k random centroids from given dataset and assigns every point to closest centroid. This is called stupid k-centroids in Hennig (2019). } \usage{ stupidkcentroids(xdata, k, distances = inherits(xdata, "dist")) } %- maybe also `usage' for other objects documented here. \arguments{ \item{xdata}{cases*variables data, \code{dist}-object or dissimilarity matrix, see \code{distances}.} \item{k}{integer. Number of clusters.} \item{distances}{logical. If \code{TRUE}, \code{xdata} is interpreted as distances.} } % \details{ % } \value{ A list with components \item{partition}{vector of integers 1 to \code{k}, of length equal to number of objects, indicates to which cluster an object belongs.} \item{centroids}{vector of integers of length \code{k}, indicating the centroids of the clusters (observation number).} \item{distances}{as argument \code{distances}.} } \references{ Hennig, C. (2019) Cluster validation by measurement of clustering characteristics relevant to the user. In C. H. Skiadas (ed.) \emph{Data Analysis and Applications 1: Clustering and Regression, Modeling-estimating, Forecasting and Data Mining, Volume 2}, Wiley, New York 1-24, \url{https://arxiv.org/abs/1703.09282} Akhanli, S. and Hennig, C. (2020) Calibrating and aggregating cluster validity indexes for context-adapted comparison of clusterings. \emph{Statistics and Computing}, 30, 1523-1544, \url{https://link.springer.com/article/10.1007/s11222-020-09958-2}, \url{https://arxiv.org/abs/2002.01822} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{stupidknn}}, \code{\link{stupidkfn}}, \code{\link{stupidkaven}} } \examples{ set.seed(20000) options(digits=3) face <- rFace(200,dMoNo=2,dNoEy=0,p=2) stupidkcentroids(dist(face),3) } \keyword{multivariate}% at least one, from doc/KEYWORDS \keyword{cluster}% __ONLY ONE__ keyword per line fpc/man/stupidkaven.Rd0000644000176200001440000000312713731135244014427 0ustar liggesusers\name{stupidkaven} \alias{stupidkaven} %- Also NEED an `\alias' for EACH other topic documented here. \title{Stupid average dissimilarity random clustering} \description{ Picks k random starting points from given dataset to initialise k clusters. Then, one by one, the point not yet assigned to any cluster with smallest average dissimilarity to the points of any already existing cluster is assigned to that cluster, until all points are assigned. This is a random version of average linkage clustering, see Akhanli and Hennig (2020). } \usage{ stupidkaven(d,k) } %- maybe also `usage' for other objects documented here. \arguments{ \item{d}{\code{dist}-object or dissimilarity matrix.} \item{k}{integer. Number of clusters.} } % \details{ % } \value{ The clustering vector (values 1 to \code{k}, length number of objects behind \code{d}). } \references{ Akhanli, S. and Hennig, C. (2020) Calibrating and aggregating cluster validity indexes for context-adapted comparison of clusterings. 
\emph{Statistics and Computing}, 30, 1523-1544, \url{https://link.springer.com/article/10.1007/s11222-020-09958-2}, \url{https://arxiv.org/abs/2002.01822} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{stupidkcentroids}}, \code{\link{stupidknn}}, \code{\link{stupidkfn}} } \examples{ set.seed(20000) options(digits=3) face <- rFace(200,dMoNo=2,dNoEy=0,p=2) stupidkaven(dist(face),3) } \keyword{multivariate}% at least one, from doc/KEYWORDS \keyword{cluster}% __ONLY ONE__ keyword per line fpc/man/cov.wml.Rd0000644000176200001440000000432113470376676013474 0ustar liggesusers\name{cov.wml} \alias{cov.wml} %- Also NEED an `\alias' for EACH other topic documented here. \title{Weighted Covariance Matrices (Maximum Likelihood)} \description{ Returns a list containing estimates of the weighted covariance matrix and the mean of the data, and optionally of the (weighted) correlation matrix. The covariance matrix is divided by the sum of the weights, corresponding to \code{n} and the ML-estimator in the case of equal weights, as opposed to \code{n-1} for \code{\link{cov.wt}}. } \usage{ cov.wml(x, wt = rep(1/nrow(x), nrow(x)), cor = FALSE, center = TRUE) } %- maybe also `usage' for other objects documented here. \arguments{ \item{x}{a matrix or data frame. As usual, rows are observations and columns are variables.} \item{wt}{a non-negative and non-zero vector of weights for each observation. Its length must equal the number of rows of \code{x}.} \item{cor}{A logical indicating whether the estimated correlation weighted matrix will be returned as well.} \item{center}{Either a logical or a numeric vector specifying the centers to be used when computing covariances. If \code{TRUE}, the (weighted) mean of each variable is used, if `\code{FALSE}, zero is used. If \code{center} is numeric, its length must equal the number of columns of \code{x}.} } \value{ A list containing the following named components: \item{cov}{the estimated (weighted) covariance matrix.} \item{center}{an estimate for the center (mean) of the data.} \item{n.obs}{the number of observations (rows) in \code{x}.} \item{wt}{the weights used in the estimation. Only returned if given as an argument.} \item{cor}{the estimated correlation matrix. Only returned if `cor' is `TRUE'.} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/}} \seealso{\code{\link{cov.wt}}, \code{\link{cov}}, \code{\link{var}}} \examples{ x <- c(1,2,3,4,5,6,7,8,9,10) y <- c(1,2,3,8,7,6,5,8,9,10) cov.wml(cbind(x,y),wt=c(0,0,0,1,1,1,1,1,0,0)) cov.wt(cbind(x,y),wt=c(0,0,0,1,1,1,1,1,0,0)) } \keyword{multivariate}% at least one, from doc/KEYWORDS fpc/man/discrete.recode.Rd0000644000176200001440000000511313470376752015144 0ustar liggesusers\name{discrete.recode} \alias{discrete.recode} %- Also NEED an `\alias' for EACH other topic documented here. \title{Recodes mixed variables dataset} \description{ Recodes a dataset with mixed continuous and categorical variables so that the continuous variables come first and the categorical variables have standard coding 1, 2, 3,... (in lexicographical ordering of values coerced to strings). } \usage{ discrete.recode(x,xvarsorted=TRUE,continuous=0,discrete) } %- maybe also `usage' for other objects documented here. \arguments{ \item{x}{data matrix or data frame. 
The data need to be organised case-wise, i.e., if there are categorical variables only, and 15 cases with values c(1,1,2) on the 3 variables, the data matrix needs 15 rows with values 1 1 2. (Categorical variables could take numbers or strings or anything that can be coerced to factor levels as values.)} \item{xvarsorted}{logical. If \code{TRUE}, the continuous variables are assumed to be the first ones, and the categorical variables to be behind them.} \item{continuous}{vector of integers giving positions of the continuous variables. If \code{xvarsorted=TRUE}, a single integer, number of continuous variables.} \item{discrete}{vector of integers giving positions of the categorical variables (the variables need to be coded in such a way that \code{\link{data.matrix}} converts them to something numeric). If \code{xvarsorted=TRUE}, a single integer, number of categorical variables.} } \value{ A list with components \item{data}{data matrix with continuous variables first and categorical variables in standard coding behind them.} \item{ppdim}{vector of categorical variable-wise numbers of categories.} \item{discretelevels}{list of levels of the categorical variables belonging to what is treated by \code{flexmixedruns} as category 1, 2, 3 etc.} \item{continuous}{number of continuous variables.} \item{discrete}{number of categorical variables.} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en}} \seealso{\code{\link{lcmixed}}} \examples{ set.seed(776655) v1 <- rnorm(20) v2 <- rnorm(20) d1 <- sample(c(2,4,6,8),20,replace=TRUE) d2 <- sample(1:4,20,replace=TRUE) ldata <- cbind(v1,d1,v2,d2) lc <- discrete.recode(ldata,xvarsorted=FALSE,continuous=c(1,3),discrete=c(2,4)) require(MASS) data(Cars93) Cars934 <- Cars93[,c(3,5,8,10)] cc <- discrete.recode(Cars934,xvarsorted=FALSE,continuous=c(2,3),discrete=c(1,4)) } \keyword{manip}% __ONLY ONE__ keyword per line fpc/man/ridgeline.Rd0000644000176200001440000000256714532232632014042 0ustar liggesusers\name{ridgeline} \alias{ridgeline} %- Also NEED an `\alias' for EACH other topic documented here. \title{Ridgeline computation} \description{ Computes \eqn{(\alpha*\Sigma_1^{-1}+(1-\alpha)*\Sigma_2^{-1})^{-1}* \alpha*(\Sigma_1^{-1}*\mu_1)+(1-\alpha)*(\Sigma_2^{-1}*\mu_2)}{% (alpha*Sigma1^{-1}+(1-alpha)*Sigma2^{-1})^{-1}* alpha*(Sigma_1^{-1}*mu_1)+(1-alpha)*(Sigma_2^{-1}*mu_2)} as required for the computation of the ridgeline (Ray and Lindsay, 2005) to find all density extrema of a two-component Gaussian mixture with mean vectors mu1 and mu2 and covariance matrices Sigma1, Sigma2. } \usage{ ridgeline(alpha, mu1, mu2, Sigma1, Sigma2) } %- maybe also `usage' for other objects documented here. \arguments{ \item{alpha}{numeric between 0 and 1.} \item{mu1}{mean vector of component 1.} \item{mu2}{mean vector of component 2.} \item{Sigma1}{covariance matrix of component 1.} \item{Sigma2}{covariance matrix of component 2.} } \value{ A vector. See above. } \references{ Ray, S. and Lindsay, B. G. (2005) The Topography of Multivariate Normal Mixtures, \emph{Annals of Statistics}, 33, 2042-2065. 
} \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \examples{ ridgeline(0.5,c(1,1),c(2,5),diag(2),diag(2)) } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/mergenormals.Rd0000644000176200001440000002336213467541512014575 0ustar liggesusers\name{mergenormals} \alias{mergenormals} \alias{summary.mergenorm} \alias{print.summary.mergenorm} %- Also NEED an `\alias' for EACH other topic documented here. \title{Clustering by merging Gaussian mixture components} \description{ Clustering by merging Gaussian mixture components; computes all methods introduced in Hennig (2010) from an initial mclust clustering. See details section for details. } \usage{ mergenormals(xdata, mclustsummary=NULL, clustering, probs, muarray, Sigmaarray, z, method=NULL, cutoff=NULL, by=0.005, numberstop=NULL, renumber=TRUE, M=50, ...) \method{summary}{mergenorm}(object, ...) \method{print}{summary.mergenorm}(x, ...) } %- maybe also `usage' for other objects documented here. \arguments{ \item{xdata}{data (something that can be coerced into a matrix).} \item{mclustsummary}{output object from \code{\link[mclust]{summary.mclustBIC}} for \code{xdata}. Either \code{mclustsummary} or all of \code{clustering}, \code{probs}, \code{muarray}, \code{Sigmaarray} and \code{z} need to be specified (the latter are obtained from \code{mclustsummary} if they are not provided). I am not aware of restrictions of the usage of \code{\link[mclust]{mclustBIC}} to produce an initial clustering; covariance matrix models can be restricted and a noise component can be included if desired, although I have probably not tested all possibilities. } \item{clustering}{vector of integers. Initial assignment of data to mixture components.} \item{probs}{vector of component proportions (for all components; should sum up to one).} \item{muarray}{matrix of component means (rows).} \item{Sigmaarray}{array of component covariance matrices (third dimension refers to component number).} \item{z}{matrix of observation- (row-)wise posterior probabilities of belonging to the components (columns).} \item{method}{one of \code{"bhat"}, \code{"ridge.uni"}, \code{"ridge.ratio"}, \code{"demp"}, \code{"dipuni"}, \code{"diptantrum"}, \code{"predictive"}. See details.} \item{cutoff}{numeric between 0 and 1. Tuning constant, see details and Hennig (2010). If not specified, the default values given in (9) in Hennig (2010) are used.} \item{by}{real between 0 and 1. Interval width for density computation along the ridgeline, used for methods \code{"ridge.uni"} and \code{"ridge.ratio"}. Methods \code{"dipuni"} and \code{"diptantrum"} require ridgeline computations and use it as well.} \item{numberstop}{integer. If specified, \code{cutoff} is ignored and components are merged until the number of clusters specified here is reached.} \item{renumber}{logical. If \code{TRUE} merged clusters are renumbered from 1 to their number. If not, numbers of the original clustering are used (numbers of components that were merged into others then will not appear).} \item{M}{integer. Number of times the dataset is divided into two halves. 
Used if \code{method="predictive"}.} \item{...}{additional optional parameters to pass on to \code{ridgeline.diagnosis} or \code{mixpredictive} (in \code{mergenormals}).} \item{object}{object of class \code{mergenorm}, output of \code{mergenormals}.} \item{x}{object of class \code{summary.mergenorm}, output of \code{summary.mergenorm}.} } \value{ \code{mergenormals} gives out an object of class \code{mergenorm}, which is a list with components \item{clustering}{integer vector. Final clustering.} \item{clusternumbers}{vector of numbers of remaining clusters. These are given in terms of the original clusters even if \code{renumber=TRUE}, in which case they may be needed to understand the numbering of some further components, see below.} \item{defunct.components}{vector of numbers of components that were "merged away".} \item{valuemerged}{vector of values of the merging criterion (see details) at which components were merged.} \item{mergedtonumbers}{vector of numbers of clusters to which the original components were merged.} \item{parameters}{a list, if \code{mclustsummary} was provided. Entry no. i refers to number i in \code{clusternumbers}. The list entry i contains the parameters of the original mixture components that make up cluster i, as extracted by \code{\link{extract.mixturepars}}.} \item{predvalues}{vector of prediction strength values for clusternumbers from 1 to the number of components in the original mixture, if \code{method=="predictive"}. See \code{\link{mixpredictive}}.} \item{orig.decisionmatrix}{square matrix with entries giving the original values of the merging criterion (see details) for every pair of original mixture components.} \item{new.decisionmatrix}{square matrix as \code{orig.decisionmatrix}, but with final entries; numbering of rows and columns corresponds to \code{clusternumbers}; all entries corresponding to other rows and columns can be ignored.} \item{probs}{final cluster values of \code{probs} (see arguments) for merged components, generated by (potentially repeated) execution of \code{\link{mergeparameters}} out of the original ones. Numbered according to \code{clusternumbers}.} \item{muarray}{final cluster means, analogous to \code{probs}.} \item{Sigmaarray}{final cluster covariance matrices, analogous to \code{probs}.} \item{z}{final matrix of posterior probabilities of observations belonging to the clusters, analogous to \code{probs}.} \item{noise}{logical. If \code{TRUE}, there was a noise component fitted in the initial mclust clustering (see help for \code{initialization} in \code{\link[mclust]{mclustBIC}}). In this case, a cluster number 0 indicates noise. \code{noise} is ignored by the merging methods and kept as it was originally.} \item{method}{as above.} \item{cutoff}{as above.} \code{summary.mergenorm} gives out a list with components \code{clustering, clusternumbers, defunct.components, valuemerged, mergedtonumbers, predvalues, probs, muarray, Sigmaarray, z, noise, method, cutoff} as above, plus \code{onc} (original number of components) and \code{mnc} (number of clusters after merging). } \details{ Mixture components are merged in a hierarchical fashion. The merging criterion is computed for all pairs of current clusters and the two clusters with the highest criterion value (lowest, respectively, for \code{method="predictive"}) are merged. Then criterion values are recomputed for the merged cluster. Merging is continued until the criterion value to merge is below (or above, for \code{method="predictive"}) the cutoff value. 
Details are given in Hennig (2010). The following criteria are offered, specified by the \code{method}-argument. \describe{ \item{"ridge.uni"}{components are only merged if their mixture is unimodal according to Ray and Lindsay's (2005) ridgeline theory, see \code{\link{ridgeline.diagnosis}}. This ignores argument \code{cutoff}.} \item{"ridge.ratio"}{ratio between density minimum between components and minimum of density maxima according to Ray and Lindsay's (2005) ridgeline theory, see \code{\link{ridgeline.diagnosis}}. } \item{"bhat"}{Bhattacharyya upper bound on misclassification probability between two components, see \code{\link{bhattacharyya.matrix}}.} \item{"demp"}{direct estimation of misclassification probability between components, see Hennig (2010).} \item{"dipuni"}{this uses \code{method="ridge.ratio"} to decide which clusters to merge but stops merging according to the p-value of the dip test computed as in Hartigan and Hartigan (1985), see \code{\link[diptest]{dip.test}}.} \item{"diptantrum"}{as \code{"dipuni"}, but p-value of dip test computed as in Tantrum, Murua and Stuetzle (2003), see \code{\link{dipp.tantrum}}.} \item{"predictive"}{this uses \code{method="demp"} to decide which clusters to merge but stops merging according to the value of prediction strength (Tibshirani and Walther, 2005) as computed in \code{\link{mixpredictive}}.} } } \references{ J. A. Hartigan and P. M. Hartigan (1985) The Dip Test of Unimodality, \emph{Annals of Statistics}, 13, 70-84. Hennig, C. (2010) Methods for merging Gaussian mixture components, \emph{Advances in Data Analysis and Classification}, 4, 3-34. Ray, S. and Lindsay, B. G. (2005) The Topography of Multivariate Normal Mixtures, \emph{Annals of Statistics}, 33, 2042-2065. Tantrum, J., Murua, A. and Stuetzle, W. (2003) Assessment and Pruning of Hierarchical Model Based Clustering, \emph{Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining}, Washington, D.C., 197-205. Tibshirani, R. and Walther, G. (2005) Cluster Validation by Prediction Strength, \emph{Journal of Computational and Graphical Statistics}, 14, 511-528. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \examples{ require(mclust) require(MASS) options(digits=3) data(crabs) dc <- crabs[,4:8] cm <- mclustBIC(crabs[,4:8],G=9,modelNames="EEE") scm <- summary(cm,crabs[,4:8]) cmnbhat <- mergenormals(crabs[,4:8],scm,method="bhat") summary(cmnbhat) cmndemp <- mergenormals(crabs[,4:8],scm,method="demp") summary(cmndemp) # Other methods take a bit longer, but try them! # The values of by and M below are still chosen for reasonably fast execution. # cmnrr <- mergenormals(crabs[,4:8],scm,method="ridge.ratio",by=0.05) # cmd <- mergenormals(crabs[,4:8],scm,method="dip.tantrum",by=0.05) # cmp <- mergenormals(crabs[,4:8],scm,method="predictive",M=3) } \keyword{multivariate} \keyword{cluster} fpc/man/cat2bin.Rd0000644000176200001440000000351113470376356013424 0ustar liggesusers\name{cat2bin} \alias{cat2bin} %- Also NEED an `\alias' for EACH other topic documented here. \title{Recode nominal variables to binary variables} \description{ Recodes a dataset with nominal variables so that the nominal variables are replaced by binary variables for the categories. } \usage{ cat2bin(x,categorical=NULL) } %- maybe also `usage' for other objects documented here. \arguments{ \item{x}{data matrix or data frame. 
The data need to be organised case-wise, i.e., if there are categorical variables only, and 15 cases with values c(1,1,2) on the 3 variables, the data matrix needs 15 rows with values 1 1 2. (Categorical variables could take numbers or strings or anything that can be coerced to factor levels as values.)} \item{categorical}{vector of numbers of variables to be recoded.} } \value{ A list with components \item{data}{data matrix with variables specified in \code{categorical} replaced by 0-1 variables, one for each category.} \item{variableinfo}{list of lists. One list for every variable in the original dataset, with four components each, namely \code{type} (\code{"categorical"} or \code{"not recoded"}), \code{levels} (levels of nominal recoded variables in order of binary variable in output dataset), \code{ncat} (number of categories for recoded variables), \code{varnum} (number of variables in output dataset belonging to this original variable).} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en}} \seealso{\code{\link{discrete.recode}}} \examples{ set.seed(776655) v1 <- rnorm(20) v2 <- rnorm(20) d1 <- sample(1:5,20,replace=TRUE) d2 <- sample(1:4,20,replace=TRUE) ldata <-cbind(v1,v2,d1,d2) lc <- cat2bin(ldata,categorical=3:4) } \keyword{manip}% __ONLY ONE__ keyword per line fpc/man/minsize.Rd0000644000176200001440000000266213467541512013560 0ustar liggesusers\name{minsize} \alias{minsize} %- Also NEED an `\alias' for EACH other topic documented here. \title{Minimum size of regression fixed point cluster} \description{ Computes the minimum size of a fixed point cluster (FPC) which is found at least \code{mtf} times with approximated probability \code{prob} by \code{ir} fixed point iterations of \code{\link{fixreg}}. Thought for use within \code{\link{fixreg}}. } \usage{ minsize(n, p, ir, mtf, prob = 0.5) } %- maybe also `usage' for other objects documented here. \arguments{ \item{n}{positive integer. Total number of points.} \item{p}{positive integer. Number of independent variables.} \item{ir}{positive integer. Number of fixed point iterations.} \item{mtf}{positive integer.} \item{prob}{numerical between 0 and 1.} } \details{ The computation is based on the binomial distribution with probability given by \code{\link{clusexpect}} with \code{ir=1}. } \value{ An integer. } \references{ Hennig, C. (2002) Fixed point clusters for linear regression: computation and comparison, \emph{Journal of Classification} 19, 249-276. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/}} \seealso{\code{\link{fixreg}}, \code{\link{clusexpect}}, \code{\link{itnumber}}} \examples{ minsize(500,4,7000,2) } \keyword{univar}% at least one, from doc/KEYWORDS \keyword{cluster}% __ONLY ONE__ keyword per line fpc/man/confusion.Rd0000644000176200001440000000323513467541512014102 0ustar liggesusers\name{confusion} \alias{confusion} %- Also NEED an `\alias' for EACH other topic documented here. \title{Misclassification probabilities in mixtures} \description{ Estimates a misclassification probability in a mixture distribution between two mixture components from estimated posterior probabilities regardless of component parameters, see Hennig (2010). } \usage{ confusion(z,pro,i,j,adjustprobs=FALSE) } %- maybe also `usage' for other objects documented here. 
\arguments{ \item{z}{matrix of posterior probabilities for observations (rows) to belong to mixture components (columns), so entries need to sum up to 1 for each row.} \item{pro}{vector of component proportions, need to sum up to 1.} \item{i}{integer. Component number.} \item{j}{integer. Component number.} \item{adjustprobs}{logical. If \code{TRUE}, probabilities are initially standardised so that those for components \code{i} and \code{j} add up to one (i.e., if they were the only components).} } \value{ Estimated probability that an observation generated by component \code{j} is classified to component \code{i} by maximum a posteriori rule. } \references{ Hennig, C. (2010) Methods for merging Gaussian mixture components, \emph{Advances in Data Analysis and Classification}, 4, 3-34. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \examples{ set.seed(12345) m <- rpois(20,lambda=5) dim(m) <- c(5,4) pro <- apply(m,2,sum) pro <- pro/sum(pro) m <- m/apply(m,1,sum) round(confusion(m,pro,1,2),digits=2) } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/nselectboot.Rd0000644000176200001440000001247013674422630014420 0ustar liggesusers\name{nselectboot} \alias{nselectboot} %- Also NEED an `\alias' for EACH other topic documented here. \title{Selection of the number of clusters via bootstrap} \description{ Selection of the number of clusters via bootstrap as explained in Fang and Wang (2012). Several times 2 bootstrap samples are drawn from the data and the number of clusters is chosen by optimising an instability estimation from these pairs. In principle all clustering methods can be used that have a CBI-wrapper, see \code{\link{clusterboot}}, \code{\link{kmeansCBI}}. However, the currently implemented classification methods are not necessarily suitable for all of them, see argument \code{classification}. } \usage{ nselectboot(data,B=50,distances=inherits(data,"dist"), clustermethod=NULL, classification="averagedist",centroidname = NULL, krange=2:10, count=FALSE,nnk=1, largeisgood=FALSE,...) } %- maybe also `usage' for other objects documented here. \arguments{ \item{data}{something that can be coerced into a matrix. The data matrix - either an \code{n*p}-data matrix (or data frame) or an \code{n*n}-dissimilarity matrix (or \code{dist}-object).} \item{B}{integer. Number of resampling runs.} \item{distances}{logical. If \code{TRUE}, the data is interpreted as dissimilarity matrix. If \code{data} is a \code{dist}-object, \code{distances=TRUE} automatically, otherwise \code{distances=FALSE} by default. This means that you have to set it to \code{TRUE} manually if \code{data} is a dissimilarity matrix.} \item{clustermethod}{an interface function (the function name, not a string containing the name, has to be provided!). This defines the clustering method. See the "Details"-section of \code{\link{clusterboot}} and \code{\link{kmeansCBI}} for the format. Clustering methods for \code{nselectboot} must have a \code{k}-argument for the number of clusters and must otherwise follow the specifications in \code{\link{clusterboot}}. Note that \code{nselectboot} won't work with CBI-functions that implicitly already estimate the number of clusters such as \code{\link{pamkCBI}}; use \code{\link{claraCBI}} if you want to run it for pam/clara clustering. } \item{classification}{string. This determines how non-clustered points are classified to given clusters. 
Options are explained in \code{\link{classifdist}} (if \code{distances=TRUE}) and \code{\link{classifnp}} (otherwise). Certain classification methods are connected to certain clustering methods. \code{classification="averagedist"} is recommended for average linkage, \code{classification="centroid"} is recommended for k-means, clara and pam (with distances it will work with \code{\link{claraCBI}} only), \code{classification="knn"} with \code{nnk=1} is recommended for single linkage and \code{classification="qda"} is recommended for Gaussian mixtures with flexible covariance matrices. } \item{centroidname}{string. Indicates the name of the component of \code{CBIoutput$result} that contains the cluster centroids in case of \code{classification="centroid"}, where \code{CBIoutput} is the output object of \code{clustermethod}. If \code{clustermethod} is \code{kmeansCBI} or \code{claraCBI}, centroids are recognised automatically if \code{centroidname=NULL}. If \code{centroidname=NULL} and \code{distances=FALSE}, cluster means are computed as the cluster centroids.} \item{krange}{integer vector; numbers of clusters to be tried.} \item{count}{logical. If \code{TRUE}, numbers of clusters and bootstrap runs are printed.} \item{nnk}{number of nearest neighbours if \code{classification="knn"}, see \code{\link{classifdist}} (if \code{distances=TRUE}) and \code{\link{classifnp}} (otherwise).} \item{largeisgood}{logical. If \code{TRUE}, output component \code{stabk} is taken as one minus the original instability value so that larger values of \code{stabk} are better.} \item{...}{arguments to be passed on to the clustering method.} } \value{ \code{nselectboot} returns a list with components \code{kopt,stabk,stab}. \item{kopt}{optimal number of clusters.} \item{stabk}{mean instability values for numbers of clusters (or one minus this if \code{largeisgood=TRUE}).} \item{stab}{matrix of instability values for all bootstrap runs and numbers of clusters.} } \references{ Fang, Y. and Wang, J. (2012) Selection of the number of clusters via the bootstrap method. \emph{Computational Statistics and Data Analysis}, 56, 468-477. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{classifdist}}, \code{\link{classifnp}}, \code{\link{clusterboot}},\code{\link{kmeansCBI}} } \examples{ set.seed(20000) face <- rFace(50,dMoNo=2,dNoEy=0,p=2) nselectboot(dist(face),B=2,clustermethod=disthclustCBI, method="average",krange=5:7) nselectboot(dist(face),B=2,clustermethod=claraCBI, classification="centroid",krange=5:7) nselectboot(face,B=2,clustermethod=kmeansCBI, classification="centroid",krange=5:7) # Of course use larger B in a real application. } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/cluster.stats.Rd0000644000176200001440000002376313470320730014714 0ustar liggesusers\name{cluster.stats} \alias{cluster.stats} %- Also NEED an `\alias' for EACH other topic documented here. 
\title{Cluster validation statistics} \description{ Computes a number of distance based statistics, which can be used for cluster validation, comparison between clusterings and decision about the number of clusters: cluster sizes, cluster diameters, average distances within and between clusters, cluster separation, biggest within cluster gap, average silhouette widths, the Calinski and Harabasz index, a Pearson version of Hubert's gamma coefficient, the Dunn index and two indexes to assess the similarity of two clusterings, namely the corrected Rand index and Meila's VI. } \usage{ cluster.stats(d = NULL, clustering, alt.clustering = NULL, noisecluster=FALSE, silhouette = TRUE, G2 = FALSE, G3 = FALSE, wgap=TRUE, sepindex=TRUE, sepprob=0.1, sepwithnoise=TRUE, compareonly = FALSE, aggregateonly = FALSE) } %- maybe also `usage' for other objects documented here. \arguments{ \item{d}{a distance object (as generated by \code{dist}) or a distance matrix between cases.} \item{clustering}{an integer vector of length of the number of cases, which indicates a clustering. The clusters have to be numbered from 1 to the number of clusters.} \item{alt.clustering}{an integer vector such as for \code{clustering}, indicating an alternative clustering. If provided, the corrected Rand index and Meila's VI for \code{clustering} vs. \code{alt.clustering} are computed.} \item{noisecluster}{logical. If \code{TRUE}, it is assumed that the largest cluster number in \code{clustering} denotes a 'noise class', i.e. points that do not belong to any cluster. These points are not taken into account for the computation of all functions of within and between cluster distances including the validation indexes.} \item{silhouette}{logical. If \code{TRUE}, the silhouette statistics are computed, which requires package \code{cluster}.} \item{G2}{logical. If \code{TRUE}, Goodman and Kruskal's index G2 (cf. Gordon (1999), p. 62) is computed. This executes lots of sorting algorithms and can be very slow (it has been improved by R. Francois - thanks!)} \item{G3}{logical. If \code{TRUE}, the index G3 (cf. Gordon (1999), p. 62) is computed. This executes \code{sort} on all distances and can be extremely slow.} \item{wgap}{logical. If \code{TRUE}, the widest within-cluster gaps (largest link in within-cluster minimum spanning tree) are computed. This is used for finding a good number of clusters in Hennig (2013).} \item{sepindex}{logical. If \code{TRUE}, a separation index is computed, defined based on the distances for every point to the closest point not in the same cluster. The separation index is then the mean of the smallest proportion \code{sepprob} of these. This allows to formalise separation less sensitive to a single or a few ambiguous points. The output component corresponding to this is \code{sindex}, not \code{separation}! This is used for finding a good number of clusters in Hennig (2013). } \item{sepprob}{numerical between 0 and 1, see \code{sepindex}.} \item{sepwithnoise}{logical. If \code{TRUE} and \code{sepindex} and \code{noisecluster} are both \code{TRUE}, the noise points are incorporated as cluster in the separation index (\code{sepindex}) computation. Also they are taken into account for the computation for the minimum cluster separation.} \item{compareonly}{logical. If \code{TRUE}, only the corrected Rand index and Meila's VI are computed and given out (this requires \code{alt.clustering} to be specified).} \item{aggregateonly}{logical. 
If \code{TRUE} (and not \code{compareonly}), no clusterwise but only aggregated information is given out (this cuts the size of the output down a bit).}
}
\note{
  Because \code{cluster.stats} processes a full dissimilarity matrix, it isn't suitable for large data sets. You may consider \code{\link{distcritmulti}} in that case.
}
\value{
  \code{cluster.stats} returns a list containing the components
  \code{n, cluster.number, cluster.size, min.cluster.size, noisen, diameter, average.distance, median.distance, separation, average.toother, separation.matrix, ave.between.matrix, average.between, average.within, n.between, n.within, max.diameter, min.separation, within.cluster.ss, clus.avg.silwidths, avg.silwidth, g2, g3, pearsongamma, dunn, dunn2, entropy, wb.ratio, ch, cwidegap, widestgap, sindex, corrected.rand, vi}
  except if \code{compareonly=TRUE}, in which case only the last two components are computed.
\item{n}{number of cases.}
\item{cluster.number}{number of clusters.}
\item{cluster.size}{vector of cluster sizes (number of points).}
\item{min.cluster.size}{size of smallest cluster.}
\item{noisen}{number of noise points, see argument \code{noisecluster} (\code{noisen=0} if \code{noisecluster=FALSE}).}
\item{diameter}{vector of cluster diameters (maximum within cluster distances).}
\item{average.distance}{vector of clusterwise within cluster average distances.}
\item{median.distance}{vector of clusterwise within cluster distance medians.}
\item{separation}{vector of clusterwise minimum distances of a point in the cluster to a point of another cluster.}
\item{average.toother}{vector of clusterwise average distances of a point in the cluster to the points of other clusters.}
\item{separation.matrix}{matrix of separation values between all pairs of clusters.}
\item{ave.between.matrix}{matrix of mean dissimilarities between points of every pair of clusters.}
\item{average.between}{average distance between clusters.}
\item{average.within}{average distance within clusters (reweighted so that every observation, rather than every distance, has the same weight).}
\item{n.between}{number of distances between clusters.}
\item{n.within}{number of distances within clusters.}
\item{max.diameter}{maximum cluster diameter.}
\item{min.separation}{minimum cluster separation.}
\item{within.cluster.ss}{a generalisation of the within clusters sum of squares (k-means objective function), which is obtained if \code{d} is a Euclidean distance matrix. For general distance measures, this is half the sum of the within cluster squared dissimilarities divided by the cluster size.}
\item{clus.avg.silwidths}{vector of cluster average silhouette widths. See \code{\link{silhouette}}.}
\item{avg.silwidth}{average silhouette width. See \code{\link{silhouette}}.}
\item{g2}{Goodman and Kruskal's Gamma coefficient. See Milligan and Cooper (1985), Gordon (1999, p. 62).}
\item{g3}{G3 coefficient. See Gordon (1999, p. 62).}
\item{pearsongamma}{correlation between distances and a 0-1-vector where 0 means same cluster, 1 means different clusters. "Normalized gamma" in Halkidi et al. (2001).}
\item{dunn}{minimum separation / maximum diameter. Dunn index, see Halkidi et al.
(2002).}
\item{dunn2}{minimum average dissimilarity between two clusters / maximum average within cluster dissimilarity, another version of the family of Dunn indexes.}
\item{entropy}{entropy of the distribution of cluster memberships, see Meila (2007).}
\item{wb.ratio}{\code{average.within/average.between}.}
\item{ch}{Calinski and Harabasz index (Calinski and Harabasz 1974, optimal in Milligan and Cooper 1985; generalised for dissimilarities in Hennig and Liao 2013).}
\item{cwidegap}{vector of widest within-cluster gaps.}
\item{widestgap}{widest within-cluster gap.}
\item{sindex}{separation index, see argument \code{sepindex}.}
\item{corrected.rand}{corrected Rand index (if \code{alt.clustering} has been specified), see Gordon (1999, p. 198).}
\item{vi}{variation of information (VI) index (if \code{alt.clustering} has been specified), see Meila (2007).}
}
\references{
Calinski, T., and Harabasz, J. (1974) A Dendrite Method for Cluster Analysis, \emph{Communications in Statistics}, 3, 1-27.

Gordon, A. D. (1999) \emph{Classification}, 2nd ed. Chapman and Hall.

Halkidi, M., Batistakis, Y., Vazirgiannis, M. (2001) On Clustering Validation Techniques, \emph{Journal of Intelligent Information Systems}, 17, 107-145.

Hennig, C. and Liao, T. (2013) How to find an appropriate clustering for mixed-type variables with application to socio-economic stratification, \emph{Journal of the Royal Statistical Society, Series C Applied Statistics}, 62, 309-369.

Hennig, C. (2013) How many bee species? A case study in determining the number of clusters. In: Spiliopoulou, L. Schmidt-Thieme, R. Janning (eds.): "Data Analysis, Machine Learning and Knowledge Discovery", Springer, Berlin, 41-49.

Kaufman, L. and Rousseeuw, P.J. (1990). "Finding Groups in Data: An Introduction to Cluster Analysis". Wiley, New York.

Meila, M. (2007) Comparing clusterings - an information based distance, \emph{Journal of Multivariate Analysis}, 98, 873-895.

Milligan, G. W. and Cooper, M. C. (1985) An examination of procedures for determining the number of clusters. \emph{Psychometrika}, 50, 159-179.
}
\author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} }
\seealso{
\code{\link{cqcluster.stats}} is a more sophisticated version of \code{cluster.stats} with more options. \code{\link{silhouette}}, \code{\link{dist}}, \code{\link{calinhara}}, \code{\link{distcritmulti}}. \code{\link{clusterboot}} computes clusterwise stability statistics by resampling. }
\examples{
set.seed(20000)
options(digits=3)
face <- rFace(200,dMoNo=2,dNoEy=0,p=2)
dface <- dist(face)
complete3 <- cutree(hclust(dface),3)
cluster.stats(dface,complete3, alt.clustering=as.integer(attr(face,"grouping")))
}
\keyword{cluster}% at least one, from doc/KEYWORDS
\keyword{multivariate}
fpc/man/tonedata.Rd0000644000176200001440000000240013467541512013667 0ustar liggesusers\name{tonedata}
\alias{tonedata}
\docType{data}
\title{Tone perception data}
\description{
The tone perception data stem from an experiment of Cohen (1980) and have been analyzed in de Veaux (1989). A pure fundamental tone was played to a trained musician. Electronically generated overtones were added, determined by a stretching ratio of \code{stretchratio}. \code{stretchratio=2.0} corresponds to the harmonic pattern usually heard in traditional definite pitched instruments. The musician was asked to tune an adjustable tone to the octave above the fundamental tone. \code{tuned} gives the ratio of the adjusted tone to the fundamental, i.e.
\code{tuned=2.0} would be the correct tuning for all \code{stretchratio}-values. The data analyzed here belong to 150 trials with the same musician. In the original study, there were four further musicians. } \usage{data(tonedata)} \format{A data frame with 2 variables \code{stretchratio} and \code{tuned} and 150 cases.} \source{Cohen, E. A. (1980) \emph{Inharmonic tone perception}. Unpublished Ph.D. dissertation, Stanford University} \references{ de Veaux, R. D. (1989) Mixtures of Linear Regressions, \emph{Computational Statistics and Data Analysis} 8, 227-245. } \keyword{datasets} fpc/man/batcoord.Rd0000644000176200001440000001001713467541512013670 0ustar liggesusers\name{batcoord} \alias{batcoord} \alias{batvarcoord} %- Also NEED an `\alias' for EACH other topic documented here. \title{Bhattacharyya discriminant projection} \description{ Computes Bhattacharyya discriminant projection coordinates as described in Fukunaga (1990), p. 455 ff. } \usage{ batcoord(xd, clvecd, clnum=1, dom="mean") batvarcoord(xd, clvecd, clnum=1) } %- maybe also `usage' for other objects documented here. \arguments{ \item{xd}{the data matrix; a numerical object which can be coerced to a matrix.} \item{clvecd}{integer or logical vector of class numbers; length must equal \code{nrow(xd)}.} \item{clnum}{integer, one of the values of \code{clvecd}, if this is an integer vector. Bhattacharyya projections can only be computed if there are only two classes in the dataset. \code{clnum} is the number of one of the two classes. All the points indicated by other values of \code{clvecd} are interpreted as the second class.} \item{dom}{string. \code{dom="mean"} means that the discriminant coordinate for the group means is computed as the first projection direction by \code{\link{discrcoord}} (option \code{pool="equal"}; both classes have the same weight for computing the within-class covariance matrix). Then the data is projected into a subspace orthogonal (w.r.t. the within-class covariance) to the discriminant coordinate, and the projection coordinates to maximize the differences in variance are computed. \cr \code{dom="variance"} means that the projection coordinates maximizing the difference in variances are computed. Then they are ordered with respect to the Bhattacharyya distance, which takes also the mean differences into account. Both procedures are implemented as described in Fukunaga (1990).} } \details{ \code{batvarcoord} computes the optimal projection coordinates with respect to the difference in variances. \code{batcoord} combines the differences in mean and variance as explained for the argument \code{dom}. } \value{ \code{batcoord} returns a list with the components \code{ev, rev, units, proj}. \code{batvarcoord} returns a list with the components \code{ev, rev, units, proj, W, S1, S2}. \item{ev}{vector of eigenvalues. If \code{dom="mean"}, then first eigenvalue from \code{\link{discrcoord}}. Further eigenvalues are of \eqn{S_1^{-1}S_2}, where \eqn{S_i} is the covariance matrix of class i. For \code{batvarcoord} or if \code{dom="variance"}, all eigenvalues come from \eqn{S_1^{-1}S_2} and are ordered by \code{rev}.} \item{rev}{for \code{batcoord}: vector of projected Bhattacharyya distances (Fukunaga (1990), p. 99). Determine quality of the projection coordinates. For \code{batvarcoord}: vector of amount of projected difference in variances.} \item{units}{columns are coordinates of projection basis vectors. 
New points \code{x} can be projected onto the projection basis vectors by \code{x \%*\% units}.} \item{proj}{projections of \code{xd} onto \code{units}.} \item{W}{matrix \eqn{S_1^{-1}S_2}.} \item{S1}{covariance matrix of the first class.} \item{S2}{covariance matrix of the second class.} } \references{ Fukunaga, K. (1990). \emph{Introduction to Statistical Pattern Recognition} (2nd ed.). Boston: Academic Press. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/}} \seealso{ \code{\link{plotcluster}} for straight forward discriminant plots. \code{\link{discrcoord}} for discriminant coordinates. \code{\link{rFace}} for generation of the example data used below. } \examples{ set.seed(4634) face <- rFace(600,dMoNo=2,dNoEy=0) grface <- as.integer(attr(face,"grouping")) bcf2 <- batcoord(face,grface==2) plot(bcf2$proj,col=1+(grface==2)) bcfv2 <- batcoord(face,grface==2,dom="variance") plot(bcfv2$proj,col=1+(grface==2)) bcfvv2 <- batvarcoord(face,grface==2) plot(bcfvv2$proj,col=1+(grface==2)) } \keyword{multivariate}% at least one, from doc/KEYWORDS \keyword{classif}% __ONLY ONE__ keyword per line fpc/man/pamk.Rd0000644000176200001440000001253013467541512013025 0ustar liggesusers\name{pamk} \alias{pamk} %- Also NEED an `\alias' for EACH other topic documented here. \title{Partitioning around medoids with estimation of number of clusters} \description{ This calls the function \code{\link[cluster]{pam}} or \code{\link[cluster]{clara}} to perform a partitioning around medoids clustering with the number of clusters estimated by optimum average silhouette width (see \code{\link[cluster]{pam.object}}) or Calinski-Harabasz index (\code{\link{calinhara}}). The Duda-Hart test (\code{\link{dudahart2}}) is applied to decide whether there should be more than one cluster (unless 1 is excluded as number of clusters or data are dissimilarities). } \usage{ pamk(data,krange=2:10,criterion="asw", usepam=TRUE, scaling=FALSE, alpha=0.001, diss=inherits(data, "dist"), critout=FALSE, ns=10, seed=NULL, ...) } \arguments{ \item{data}{a data matrix or data frame or something that can be coerced into a matrix, or dissimilarity matrix or object. See \code{\link[cluster]{pam}} for more information.} \item{krange}{integer vector. Numbers of clusters which are to be compared by the average silhouette width criterion. Note: average silhouette width and Calinski-Harabasz can't estimate number of clusters \code{nc=1}. If 1 is included, a Duda-Hart test is applied and 1 is estimated if this is not significant.} \item{criterion}{one of \code{"asw"}, \code{"multiasw"} or \code{"ch"}. Determines whether average silhouette width (as given out by \code{\link[cluster]{pam}}/\code{\link[cluster]{clara}}, or as computed by \code{\link{distcritmulti}} if \code{"multiasw"} is specified; recommended for large data sets with \code{usepam=FALSE}) or Calinski-Harabasz is applied. Note that the original Calinski-Harabasz index is not defined for dissimilarities; if dissimilarity data is run with \code{criterion="ch"}, the dissimilarity-based generalisation in Hennig and Liao (2013) is used.} \item{usepam}{logical. If \code{TRUE}, \code{\link[cluster]{pam}} is used, otherwise \code{\link[cluster]{clara}} (recommended for large datasets with 2,000 or more observations; dissimilarity matrices can not be used with \code{\link[cluster]{clara}}).} \item{scaling}{either a logical value or a numeric vector of length equal to the number of variables. 
If \code{scaling} is a numeric vector with length equal to the number of variables, then each variable is divided by the corresponding value from \code{scaling}. If \code{scaling} is \code{TRUE} then scaling is done by dividing the (centered) variables by their root-mean-square, and if \code{scaling} is \code{FALSE}, no scaling is done.} \item{alpha}{numeric between 0 and 1, tuning constant for \code{\link{dudahart2}} (only used for 1-cluster test).} \item{diss}{logical flag: if \code{TRUE} (default for \code{dist} or \code{dissimilarity}-objects), then \code{data} will be considered as a dissimilarity matrix (and the potential number of clusters 1 will be ignored). If \code{FALSE}, then \code{data} will be considered as a matrix of observations by variables.} \item{critout}{logical. If \code{TRUE}, the criterion value is printed out for every number of clusters.} \item{ns}{passed on to \code{\link{distcritmulti}} if \code{criterion="multiasw"}.} \item{seed}{passed on to \code{\link{distcritmulti}} if \code{criterion="multiasw"}.} \item{...}{further arguments to be transferred to \code{\link[cluster]{pam}} or \code{\link[cluster]{clara}}.} } \note{ \code{\link[cluster]{clara}} and \code{\link[cluster]{pam}} can handle \code{NA}-entries (see their documentation) but \code{\link{dudahart2}} cannot. Therefore \code{NA} should not occur if 1 is in \code{krange}. } \value{ A list with components \item{pamobject}{The output of the optimal run of the \code{\link[cluster]{pam}}-function.} \item{nc}{the optimal number of clusters.} \item{crit}{vector of criterion values for numbers of clusters. \code{crit[1]} is the p-value of the Duda-Hart test if 1 is in \code{krange} and \code{diss=FALSE}.} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \references{ Calinski, R. B., and Harabasz, J. (1974) A Dendrite Method for Cluster Analysis, \emph{Communications in Statistics}, 3, 1-27. Duda, R. O. and Hart, P. E. (1973) \emph{Pattern Classification and Scene Analysis}. Wiley, New York. Hennig, C. and Liao, T. (2013) How to find an appropriate clustering for mixed-type variables with application to socio-economic stratification, \emph{Journal of the Royal Statistical Society, Series C Applied Statistics}, 62, 309-369. Kaufman, L. and Rousseeuw, P.J. (1990). "Finding Groups in Data: An Introduction to Cluster Analysis". Wiley, New York. } \seealso{ \code{\link[cluster]{pam}}, \code{\link[cluster]{clara}} \code{\link{distcritmulti}} } \examples{ options(digits=3) set.seed(20000) face <- rFace(50,dMoNo=2,dNoEy=0,p=2) pk1 <- pamk(face,krange=1:5,criterion="asw",critout=TRUE) pk2 <- pamk(face,krange=1:5,criterion="multiasw",ns=2,critout=TRUE) # "multiasw" is better for larger data sets, use larger ns then. pk3 <- pamk(face,krange=1:5,criterion="ch",critout=TRUE) } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/cluster.varstats.Rd0000644000176200001440000001117513467541512015430 0ustar liggesusers\name{cluster.varstats} \alias{cluster.varstats} \alias{print.varwisetables} %- Also NEED an `\alias' for EACH other topic documented here. \title{Variablewise statistics for clusters} \description{ This function gives some helpful variable-wise information for cluster interpretation, given a clustering and a data set. The output object contains some tables. For categorical variables, tables compare clusterwise distributions with overall distributions. Continuous variables are categorised for this. 
If desired, tables, histograms, some standard statistics of continuous variables and validation plots as available through \code{\link{discrproj}} (Hennig 2004) are given out on the fly. } \usage{ cluster.varstats(clustering,vardata,contdata=vardata, clusterwise=TRUE, tablevar=NULL,catvar=NULL, quantvar=NULL, catvarcats=10, proportions=FALSE, projmethod="none",minsize=ncol(contdata)+2, ask=TRUE,rangefactor=1) \method{print}{varwisetables}(x,digits=3,...) } %- maybe also `usage' for other objects documented here. \arguments{ \item{clustering}{vector of integers. Clustering (needs to be in standard coding, 1,2,...).} \item{vardata}{data matrix or data frame of which variables are summarised.} \item{contdata}{variable matrix or data frame, normally all or some variables from \code{vardata}, on which cluster visualisation by projection methods is performed unless \code{projmethod="none"}. It should make sense to interpret these variables in a quantitative (interval-scaled) way.} \item{clusterwise}{logical. If \code{FALSE}, only the output tables are computed but no more detail and graphs are given on the fly.} \item{tablevar}{vector of integers. Numbers of variables treated as categorical (i.e., no histograms and statistics, just tables) if \code{clusterwise=TRUE}. Note that an error will be produced by factor type variables unless they are declared as categorical here.} \item{catvar}{vector of integers. Numbers of variables to be categorised by proportional quantiles for table computation. Recommended for all continuous variables.} \item{quantvar}{vector of integers. Variables for which means, standard deviations and quantiles should be given out if \code{clusterwise=TRUE}.} \item{catvarcats}{integer. Number of categories used for categorisation of variables specified in \code{quantvar}.} \item{proportions}{logical. If \code{TRUE}, output tables contain proportions, otherwise numbers of observations.} \item{projmethod}{one of \code{"none"}, \code{"dc"}, \code{"bc"}, \code{"vbc"}, \code{"mvdc"}, \code{"adc"}, \code{"awc"} (recommended if not \code{"none"}), \code{"arc"}, \code{"nc"}, \code{"wnc"}, \code{"anc"}. Cluster validation projection method introduced in Hennig (2004), passed on as \code{method} argument in \code{\link{discrproj}}.} \item{minsize}{integer. Projection is not carried out for clusters with fewer points than this. (If this is chosen smaller, it may lead to errors with some projection methods.)} \item{ask}{logical. If \code{TRUE}, \code{par(ask=TRUE)} is set in the beginning to prompt the user before plots and \code{par(ask=FALSE)} in the end.} \item{rangefactor}{numeric. Factor by which to multiply the range for projection plot ranges.} \item{x}{an object of class \code{"varwisetables"}, output object of \code{cluster.varstats}.} \item{digits}{integer. Number of digits after the decimal point to print out.} \item{...}{not used.} } \value{ An object of class \code{"varwisetables"}, which is a list with a table for each variable, giving (categorised) marginal distributions by cluster. } \references{ Hennig, C. (2004) Asymmetric linear dimension reduction for classification. Journal of Computational and Graphical Statistics 13, 930-945 . 
} \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en}} \examples{ set.seed(112233) options(digits=3) require(MASS) require(flexmix) data(Cars93) Cars934 <- Cars93[,c(3,5,8,10)] cc <- discrete.recode(Cars934,xvarsorted=FALSE,continuous=c(2,3),discrete=c(1,4)) fcc <- flexmix(cc$data~1,k=2, model=lcmixed(continuous=2,discrete=2,ppdim=c(6,3),diagonal=TRUE)) cv <- cluster.varstats(fcc@cluster,Cars934, contdata=Cars934[,c(2,3)], tablevar=c(1,4),catvar=c(2,3),quantvar=c(2,3),projmethod="awc", ask=FALSE) print(cv) } \keyword{cluster}% __ONLY ONE__ keyword per line fpc/man/neginc.Rd0000644000176200001440000000233613467541512013343 0ustar liggesusers\name{neginc} \alias{neginc} %- Also NEED an `\alias' for EACH other topic documented here. \title{Neg-entropy normality index for cluster validation} \description{ Cluster validity index based on the neg-entropy distances of within-cluster distributions to normal distribution, see Lago-Fernandez and Corbacho (2010). } \usage{ neginc(x,clustering) } %- maybe also `usage' for other objects documented here. \arguments{ \item{x}{something that can be coerced into a numerical matrix. Euclidean dataset.} \item{clustering}{vector of integers with length \code{=nrow(x)}; indicating the cluster for each observation.} } \value{ Index value, see Lago-Fernandez and Corbacho (2010). The lower (i.e., the more negative) the better. } \references{ Lago-Fernandez, L. F. and Corbacho, F. (2010) Normality-based validation for crisp clustering. \emph{Pattern Recognition} 43, 782-795. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \examples{ options(digits=3) iriss <- as.matrix(iris[c(1:10,51:55,101:105),-5]) irisc <- as.numeric(iris[c(1:10,51:55,101:105),5]) neginc(iriss,irisc) } \keyword{cluster}% at least one, from doc/KEYWORDS fpc/man/distrsimilarity.Rd0000644000176200001440000001014413467541512015330 0ustar liggesusers\name{distrsimilarity} \alias{distrsimilarity} %- Also NEED an `\alias' for EACH other topic documented here. \title{Similarity of within-cluster distributions to normal and uniform} \description{ Two measures of dissimilarity between the within-cluster distributions of a dataset and normal or uniform distribution. For the normal it's the Kolmogorov dissimilarity between the Mahalanobis distances to the center and a chi-squared distribution. For the uniform it is the Kolmogorov distance between the distance to the kth nearest neighbour and a Gamma distribution (this is based on Byers and Raftery (1998)). The clusterwise values are aggregated by weighting with the cluster sizes. } \usage{ distrsimilarity(x,clustering,noisecluster = FALSE, distribution=c("normal","uniform"),nnk=2, largeisgood=FALSE,messages=FALSE) } %- maybe also `usage' for other objects documented here. \arguments{ \item{x}{the data matrix; a numerical object which can be coerced to a matrix.} \item{clustering}{integer vector of class numbers; length must equal \code{nrow(x)}, numbers must go from 1 to the number of clusters.} \item{noisecluster}{logical. If \code{TRUE}, the cluster with the largest number is ignored for the computations.} \item{distribution}{vector of \code{"normal", "uniform"} or both. Indicates which of the two dissimilarities is/are computed.} \item{nnk}{integer. Number of nearest neighbors to use for dissimilarity to the uniform.} \item{largeisgood}{logical. 
If \code{TRUE}, dissimilarities are transformed to \code{1-d} (this means that larger values indicate a better fit).} \item{messages}{logical. If \code{TRUE}, warnings are given if within-cluster covariance matrices are not invertible (in which case all within-cluster Mahalanobis distances are set to zero).} } \note{ It is very hard to capture similarity to a multivariate normal or uniform in a single value, and both used here have their shortcomings. Particularly, the dissimilarity to the uniform can still indicate a good fit if there are holes or it's a uniform distribution concentrated on several not connected sets. } % \details{ % } \value{ List with the following components \item{kdnorm}{Kolmogorov distance between distribution of within-cluster Mahalanobis distances and appropriate chi-squared distribution, aggregated over clusters (I am grateful to Agustin Mayo-Iscar for the idea).} \item{kdunif}{Kolmogorov distance between distribution of distances to \code{nnk}th nearest within-cluster neighbor and appropriate Gamma-distribution, see Byers and Raftery (1998), aggregated over clusters.} \item{kdnormc}{vector of cluster-wise Kolmogorov distances between distribution of within-cluster Mahalanobis distances and appropriate chi-squared distribution.} \item{kdunifc}{vector of cluster-wise Kolmogorov distances between distribution of distances to \code{nnk}th nearest within-cluster neighbor and appropriate Gamma-distribution.} \item{xmahal}{vector of Mahalanobs distances to the respective cluster center.} \item{xdknn}{vector of distance to \code{nnk}th nearest within-cluster neighbor.} } \references{ Byers, S. and Raftery, A. E. (1998) Nearest-Neighbor Clutter Removal for Estimating Features in Spatial Point Processes, \emph{Journal of the American Statistical Association}, 93, 577-584. Hennig, C. (2017) Cluster validation by measurement of clustering characteristics relevant to the user. In C. H. Skiadas (ed.) \emph{Proceedings of ASMDA 2017}, 501-520, \url{https://arxiv.org/abs/1703.09282} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{cqcluster.stats}},\code{\link{cluster.stats}} for more cluster validity statistics. } \examples{ set.seed(20000) options(digits=3) face <- rFace(200,dMoNo=2,dNoEy=0,p=2) km3 <- kmeans(face,3) distrsimilarity(face,km3$cluster) } \keyword{multivariate}% at least one, from doc/KEYWORDS \keyword{classif}% __ONLY ONE__ keyword per line \keyword{cluster}% __ONLY ONE__ keyword per line fpc/man/sseg.Rd0000644000176200001440000000142313467541512013035 0ustar liggesusers\name{sseg} \alias{sseg} %- Also NEED an `\alias' for EACH other topic documented here. \title{Position in a similarity vector} \description{ \code{sseg(i,j)} gives the position of the similarity of objects \code{i} and \code{j} in the similarity vectors produced by \code{fixreg} and \code{fixmahal}. \code{sseg} should only be used as an auxiliary function in \code{fixreg} and \code{fixmahal}. } \usage{ sseg(i, j) } %- maybe also `usage' for other objects documented here. \arguments{ \item{i}{positive integer.} \item{j}{positive integer.} } \value{A positive integer. 
} \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/}} \examples{ sseg(3,4) } \keyword{utilities}% at least one, from doc/KEYWORDS fpc/man/findrep.Rd0000644000176200001440000000437613467541512013535 0ustar liggesusers\name{findrep} \alias{findrep} %- Also NEED an `\alias' for EACH other topic documented here. \title{Finding representatives for cluster border} \description{ Finds representative objects for the border of a cluster and the within-cluster variance as defined in the framework of the \code{\link{cdbw}} cluster validation index (and meant to be used in that context). } \usage{ findrep(x,xcen,clustering,cluster,r,p=ncol(x),n=nrow(x), nc=sum(clustering==cluster)) } %- maybe also `usage' for other objects documented here. \arguments{ \item{x}{matrix. Euclidean dataset.} \item{xcen}{mean vector of cluster.} \item{clustering}{vector of integers with length \code{=nrow(x)}; indicating the cluster for each observation.} \item{cluster}{integer. Number of cluster to be treated.} \item{r}{integer. Number of representatives.} \item{p}{integer. Number of dimensions.} \item{n}{integer. Number of observations.} \item{nc}{integer. Number of observations in \code{cluster}.} } \value{ List with components \item{repc}{vector of index of representatives (out of all observations).} \item{repx}{vector of index of representatives (out of only the observations in \code{cluster}).} \item{maxr}{number of representatives (this can be smaller than \code{r} if fewer pairwise different observations are in \code{cluster}.} \item{wvar}{estimated average within-cluster squared distance to mean.} } \references{ Halkidi, M. and Vazirgiannis, M. (2008) A density-based cluster validity approach using multi-representatives. \emph{Pattern Recognition Letters} 29, 773-786. Halkidi, M., Vazirgiannis, M. and Hennig, C. (2015) Method-independent indices for cluster validation. In C. Hennig, M. Meila, F. Murtagh, R. Rocci (eds.) \emph{Handbook of Cluster Analysis}, CRC Press/Taylor \code{&} Francis, Boca Raton. } \seealso{ \code{\link{cdbw}} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \examples{ options(digits=3) iriss <- as.matrix(iris[c(1:5,51:55,101:105),-5]) irisc <- as.numeric(iris[c(1:5,51:55,101:105),5]) findrep(iriss,colMeans(iriss),irisc,cluster=1,r=2) } \keyword{cluster}% at least one, from doc/KEYWORDS fpc/man/cmahal.Rd0000644000176200001440000000456113467541512013327 0ustar liggesusers\name{cmahal} \alias{cmahal} %- Also NEED an `\alias' for EACH other topic documented here. \title{Generation of tuning constant for Mahalanobis fixed point clusters.} \description{ Generates tuning constants \code{ca} for \code{\link{fixmahal}} dependent on the number of points and variables of the current fixed point cluster (FPC). This is experimental and only thought for use in \code{\link{fixmahal}}. } \usage{ cmahal(n, p, nmin, cmin, nc1, c1 = cmin, q = 1) } %- maybe also `usage' for other objects documented here. \arguments{ \item{n}{positive integer. Number of points.} \item{p}{positive integer. Number of variables.} \item{nmin}{integer larger than 1. Smallest number of points for which \code{ca} is computed. For smaller FPC sizes, \code{ca} is set to the value for \code{nmin}.} \item{cmin}{positive number. Minimum value for \code{ca}.} \item{nc1}{positive integer. Number of points at which \code{ca=c1}.} \item{c1}{positive numeric. Tuning constant for \code{cmahal}. 
Value for \code{ca} for FPC size equal to \code{nc1}.} \item{q}{numeric between 0 and 1. 1 for steepest possible descent of \code{ca} as function of the FPC size. Should presumably always be 1.} } \details{ Some experiments suggest that the tuning constant \code{ca} should decrease with increasing FPC size and increase with increasing \code{p} in \code{\link{fixmahal}}. This is to prevent too small meaningless FPCs while maintaining the significant larger ones. \code{cmahal} with \code{q=1} computes \code{ca} in such a way that as long as \code{ca>cmin}, the decrease in \code{n} is as steep as possible in order to maintain the validity of the convergence theorem in Hennig and Christlieb (2002). } \value{ A numeric vector of length \code{n}, giving the values for \code{ca} for all FPC sizes smaller or equal to \code{n}. } \references{ Hennig, C. and Christlieb, N. (2002) Validating visual clusters in large datasets: Fixed point clusters of spectral features, \emph{Computational Statistics and Data Analysis} 40, 723-739. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/}} \seealso{\code{\link{fixmahal}}} \examples{ plot(1:100,cmahal(100,3,nmin=5,cmin=qchisq(0.99,3),nc1=90), xlab="FPC size", ylab="cmahal") } \keyword{cluster}% at least one, from doc/KEYWORDS fpc/man/fixmahal.Rd0000644000176200001440000004625313470367752013705 0ustar liggesusers\name{fixmahal} \alias{fixmahal} \alias{summary.mfpc} \alias{plot.mfpc} \alias{fpclusters.mfpc} \alias{print.summary.mfpc} \alias{print.mfpc} \alias{fpmi} %- Also NEED an `\alias' for EACH other topic documented here. \title{Mahalanobis Fixed Point Clusters} \description{ Computes Mahalanobis fixed point clusters (FPCs), i.e., subsets of the data, which consist exactly of the non-outliers w.r.t. themselves, and may be interpreted as generated from a homogeneous normal population. FPCs may overlap, are not necessarily exhausting and do not need a specification of the number of clusters. Note that while \code{fixmahal} has lots of parameters, only one (or few) of them have usually to be specified, cf. the examples. The philosophy is to allow much flexibility, but to always provide sensible defaults. } \usage{ fixmahal(dat, n = nrow(as.matrix(dat)), p = ncol(as.matrix(dat)), method = "fuzzy", cgen = "fixed", ca = NA, ca2 = NA, calpha = ifelse(method=="fuzzy",0.95,0.99), calpha2 = 0.995, pointit = TRUE, subset = n, nc1 = 100+20*p, startn = 18+p, mnc = floor(startn/2), mer = ifelse(pointit,0.1,0), distcut = 0.85, maxit = 5*n, iter = n*1e-5, init.group = list(), ind.storage = TRUE, countmode = 100, plot = "none") \method{summary}{mfpc}(object, ...) \method{print}{summary.mfpc}(x, maxnc=30, ...) \method{plot}{mfpc}(x, dat, no, bw=FALSE, main=c("Representative FPC No. ",no), xlab=NULL, ylab=NULL, pch=NULL, col=NULL, ...) \method{fpclusters}{mfpc}(object, dat=NA, ca=object$ca, p=object$p, ...) fpmi(dat, n = nrow(as.matrix(dat)), p = ncol(as.matrix(dat)), gv, ca, ca2, method = "ml", plot, maxit = 5*n, iter = n*1e-6) } %- maybe also `usage' for other objects documented here. \arguments{ \item{dat}{something that can be coerced to a numerical matrix or vector. Data matrix, rows are points, columns are variables. \code{fpclusters.rfpc} does not need specification of \code{dat} if \code{fixmahal} has been run with \code{ind.storage=TRUE}.} \item{n}{optional positive integer. Number of cases.} \item{p}{optional positive integer. Number of independent variables.} \item{method}{a string. 
\code{method="classical"} means 0-1 weighting of observations by Mahalanobis distances and use of the classical normal covariance estimator. \code{method="ml"} uses the ML-covariance estimator (division by \code{n} instead of \code{n-1}) This is used in Hennig and Christlieb (2002). \code{method} can also be \code{"mcd"} or \code{"mve"}, to enforce the use of robust centers and covariance matrices, see \code{\link{cov.rob}}. This is experimental, not recommended at the moment, may be very slowly and requires library \code{lqs}. The default is \code{method="fuzzy"}, where weighted means and covariance matrices are used (Hennig, 2005). The weights are computed by \code{\link{wfu}}, i.e., a function that is constant 1 for arguments smaller than \code{ca}, 0 for arguments larger than \code{ca2} and continuously linear in between. Convergence is only proven for \code{method="ml"} up to now.} \item{cgen}{optional string. \code{"fixed"} means that the same tuning constant \code{ca} is used for all iterations. \code{"auto"} means that \code{ca} is generated dependently on the size of the current data subset in each iteration by \code{\link{cmahal}}. This is experimental.} \item{ca}{optional positive number. Tuning constant, specifying required cluster separation. By default determined as \code{calpha}-quantile of the chisquared distribution with \code{p} degrees of freedom.} \item{ca2}{optional positive number. Second tuning constant needed if \code{method="fuzzy"}. By default determined as \code{calpha2}-quantile of the chisquared distribution with \code{p} degrees of freedom.} \item{calpha}{number between 0 and 1. See \code{ca}.} \item{calpha2}{number between 0 and 1, larger than \code{calpha}. See \code{ca2}.} \item{pointit}{optional logical. If \code{TRUE}, \code{subset} fixed point algorithms are started from initial configurations, which are built around single points of the dataset, cf. \code{\link{mahalconf}}. Otherwise, initial configurations are only specified by \code{init.group}.} \item{subset}{optional positive integer smaller or equal than \code{n}. Initial configurations for the fixed point algorithm (cf. \code{\link{mahalconf}}) are built from a subset of \code{subset} points from the data. No effect if \code{pointit=FALSE}. Default: all points.} \item{nc1}{optional positive integer. Tuning constant needed by \code{\link{cmahal}} to generate \code{ca} automatically. Only needed for \code{cgen="auto"}.} \item{startn}{optional positive integer. Size of the initial configurations. The default value is chosen to prevent that small meaningless FPCs are found, but it should be decreased if clusters of size smaller than the default value are of interest.} \item{mnc}{optional positive integer. Minimum size of clusters to be reported.} \item{mer}{optional nonnegative number. FPCs (groups of them, respectively, see details) are only reported as stable if the ratio of the number of their findings to their number of points exceeds \code{mer}. This holds under \code{pointit=TRUE} and \code{subset=n}. For \code{subset=3}, the optimal separating projection computed by \code{\link{batcoord}} is shown. \code{fpclusters.mfpc} produces a list of indicator vectors for the representative FPCs of stable groups. \code{fpmi} is called by \code{fixmahal} for a single fixed point algorithm and will usually not be executed alone. } \value{ \code{fixmahal} returns an object of class \code{mfpc}. 
This is a list containing the components \code{nc, g, means, covs, nfound, er, tsc, ncoll, skc, grto, imatrix, smatrix, stn, stfound, ser, sfpc, ssig, sto, struc, n, p, method, cgen, ca, ca2, cvec, calpha, pointit, subset, mnc, startn, mer, distcut}. \code{summary.mfpc} returns an object of class \code{summary.mfpc}. This is a list containing the components \code{means, covs, stn, stfound, sn, ser, tskip, skc, tsc, sim, ca, ca2, calpha, mer, method, cgen, pointit}. \code{fpclusters.mfpc} returns a list of indicator vectors for the representative FPCs of stable groups. \code{fpmi} returns a list with the components \code{mg, covg, md, gv, coll, method, ca}. \item{nc}{integer. Number of FPCs.} \item{g}{list of logical vectors. Indicator vectors of FPCs. \code{FALSE} if \code{ind.storage=FALSE}.} \item{means}{list of numerical vectors. Means of FPCs. In \code{summary.mfpc}, only for representative FPCs of stable groups and sorted according to \code{ser}.} \item{covs}{list of numerical matrices. Covariance matrices of FPCs. In \code{summary.mfpc}, only for representative FPCs of stable groups and sorted according to \code{ser}.} \item{nfound}{vector of integers. Number of findings for the FPCs.} \item{er}{numerical vector. Ratio of number of findings of FPCs to their size. Under \code{pointit=TRUE}, this can be taken as a measure of stability of FPCs.} \item{tsc}{integer. Number of algorithm runs leading to too small or too seldom found FPCs.} \item{ncoll}{integer. Number of algorithm runs where collinear covariance matrices occurred.} \item{skc}{integer. Number of skipped clusters.} \item{grto}{vector of integers. Numbers of FPCs to which algorithm runs led, which were started by \code{init.group}.} \item{imatrix}{vector of integers. Size of intersection between FPCs. See \code{\link{sseg}}.} \item{smatrix}{numerical vector. Similarities between FPCs. See \code{\link{sseg}}.} \item{stn}{integer. Number of representative FPCs of stable groups. In \code{summary.mfpc}, sorted according to \code{ser}.} \item{stfound}{vector of integers. Number of findings of members of all groups of FPCs. In \code{summary.mfpc}, sorted according to \code{ser}.} \item{ser}{numerical vector. Ratio of number of findings of groups of FPCs to their size. Under \code{pointit=TRUE}, this can be taken as a measure of stability of FPCs. In \code{summary.mfpc}, sorted from largest to smallest.} \item{sfpc}{vector of integers. Numbers of representative FPCs of all groups.} \item{ssig}{vector of integers of length \code{stn}. Numbers of representative FPCs of the stable groups.} \item{sto}{vector of integers. Numbers of groups ordered according to largest \code{ser}.} \item{struc}{vector of integers. Number of group an FPC belongs to.} \item{n}{see arguments.} \item{p}{see arguments.} \item{method}{see arguments.} \item{cgen}{see arguments.} \item{ca}{see arguments, if \code{cgen} has been \code{"fixed"}. Else numerical vector of length \code{nc} (see below), giving the final values of \code{ca} for all FPC. In \code{fpmi}, tuning constant for the iterated FPC.} \item{ca2}{see arguments.} \item{cvec}{numerical vector of length \code{n} for \code{cgen="auto"}. The values for the tuning constant \code{ca} corresponding to the cluster sizes from \code{1} to \code{n}.} \item{calpha}{see arguments.} \item{pointit}{see arguments.} \item{subset}{see arguments.} \item{mnc}{see arguments.} \item{startn}{see arguments.} \item{mer}{see arguments.} \item{distcut}{see arguments.} \item{sn}{vector of integers. 
Number of points of representative FPCs.} \item{tskip}{integer. Number of algorithm runs leading to skipped FPCs.} \item{sim}{vector of integers. Size of intersections between representative FPCs of stable groups. See \code{\link{sseg}}.} \item{mg}{mean vector.} \item{covg}{covariance matrix.} \item{md}{Mahalanobis distances.} \item{gv}{logical (numerical, respectively, if \code{method="fuzzy"}) indicator vector of iterated FPC.} \item{coll}{logical. \code{TRUE} means that singular covariance matrices occurred during the iterations.} } \references{ Hennig, C. (2002) Fixed point clusters for linear regression: computation and comparison, \emph{Journal of Classification} 19, 249-276. Hennig, C. (2005) Fuzzy and Crisp Mahalanobis Fixed Point Clusters, in Baier, D., Decker, R., and Schmidt-Thieme, L. (eds.): \emph{Data Analysis and Decision Support}. Springer, Heidelberg, 47-56. Hennig, C. and Christlieb, N. (2002) Validating visual clusters in large datasets: Fixed point clusters of spectral features, \emph{Computational Statistics and Data Analysis} 40, 723-739. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{fixreg}} for linear regression fixed point clusters. \code{\link{mahalconf}}, \code{\link{wfu}}, \code{\link{cmahal}} for computation of initial configurations, weights, tuning constants. \code{\link{sseg}} for indexing the similarity/intersection vectors computed by \code{fixmahal}. \code{\link{batcoord}}, \code{\link{cov.rob}}, \code{\link{solvecov}}, \code{\link{cov.wml}}, \code{\link{plotcluster}} for computation of projections, (inverted) covariance matrices, plotting. \code{\link{rFace}} for generation of example data, see below. } \examples{ options(digits=2) set.seed(20000) face <- rFace(400,dMoNo=2,dNoEy=0, p=3) # The first example uses grouping information via init.group. initg <- list() grface <- as.integer(attr(face,"grouping")) for (i in 1:5) initg[[i]] <- (grface==i) ff0 <- fixmahal(face, pointit=FALSE, init.group=initg) summary(ff0) cff0 <- fpclusters(ff0) plot(face, col=1+cff0[[1]]) plot(face, col=1+cff0[[4]]) # Why does this come out as a cluster? plot(ff0, face, 4) # A bit clearer... # Without grouping information, examples need more time: # ff1 <- fixmahal(face) # summary(ff1) # cff1 <- fpclusters(ff1) # plot(face, col=1+cff1[[1]]) # plot(face, col=1+cff1[[6]]) # Why does this come out as a cluster? # plot(ff1, face, 6) # A bit clearer... # ff2 <- fixmahal(face,method="ml") # summary(ff2) # ff3 <- fixmahal(face,method="ml",calpha=0.95,subset=50) # summary(ff3) ## ...fast, but lots of clusters. mer=0.3 may be useful here. # set.seed(3000) # face2 <- rFace(400,dMoNo=2,dNoEy=0) # ff5 <- fixmahal(face2) # summary(ff5) ## misses right eye of face data; with p=6, ## initial configurations are too large for 40 point clusters # ff6 <- fixmahal(face2, startn=30) # summary(ff6) # cff6 <- fpclusters(ff6) # plot(face2, col=1+cff6[[3]]) # plot(ff6, face2, 3) # x <- c(1,2,3,6,6,7,8,120) # ff8 <- fixmahal(x) # summary(ff8) # ...dataset a bit too small for the defaults... # ff9 <- fixmahal(x, mnc=3, startn=3) # summary(ff9) } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} \keyword{robust}% __ONLY ONE__ keyword per line fpc/man/distcritmulti.Rd0000644000176200001440000000602013467541512014772 0ustar liggesusers\name{distcritmulti} \alias{distcritmulti} %- Also NEED an `\alias' for EACH other topic documented here. 
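% Added illustrative sketch (not part of the original fpc documentation), assuming
% the object ff0 from the fixmahal example above: how some of the value components
% listed in the fixmahal help page might be inspected. Reading sfpc/sto as giving
% the representative FPC of the most stable group follows the component
% descriptions above and is an assumption of this sketch.
%   ff0$nc                          # number of fixed point clusters found
%   ff0$ser                         # stability ratios (findings/size) of FPC groups
%   repr <- ff0$sfpc[ff0$sto[1]]    # representative FPC of the group with largest ser
%   ff0$means[[repr]]               # its mean vector
%   ff0$covs[[repr]]                # its covariance matrix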
\title{Distance based validity criteria for large data sets} \description{ Approximates average silhouette width or the Pearson version of Hubert's gamma criterion by hacking the dataset into pieces and averaging the subset-wise values, see Hennig and Liao (2013). } \usage{ distcritmulti(x,clustering,part=NULL,ns=10,criterion="asw", fun="dist",metric="euclidean", count=FALSE,seed=NULL,...) } %- maybe also `usage' for other objects documented here. \arguments{ \item{x}{cases times variables data matrix.} \item{clustering}{vector of integers indicating the clustering.} \item{part}{vector of integer subset sizes; sum should be smaller or equal to the number of cases of \code{x}. If \code{NULL}, subset sizes are chosen approximately equal.} \item{ns}{integer. Number of subsets, only used if \code{part==NULL}.} \item{criterion}{\code{"asw"} or \code{"pearsongamma"}, specifies whether the average silhouette width or the Pearson version of Hubert's gamma is computed.} \item{fun}{\code{"dist"} or \code{"daisy"}, specifies which function is used for computing dissimilarities.} \item{metric}{passed on to \code{\link{dist}} (as argument \code{method}) or \code{\link[cluster]{daisy}} to determine which dissimilarity is used.} \item{count}{logical. if \code{TRUE}, the subset number just processed is printed.} \item{seed}{integer, random seed. (If \code{NULL}, result depends on random numbers.)} \item{...}{further arguments to be passed on to \code{\link{dist}} or \code{\link[cluster]{daisy}}.} } \value{ A list with components \code{crit.overall,crit.sub,crit.sd,part}. \item{crit.overall}{value of criterion.} \item{crit.sub}{vector of subset-wise criterion values.} \item{crit.sd}{standard deviation of \code{crit.sub}, can be used to assess stability.} \item{subsets}{list of case indexes in subsets.} } \references{ Halkidi, M., Batistakis, Y., Vazirgiannis, M. (2001) On Clustering Validation Techniques, \emph{Journal of Intelligent Information Systems}, 17, 107-145. Hennig, C. and Liao, T. (2013) How to find an appropriate clustering for mixed-type variables with application to socio-economic stratification, \emph{Journal of the Royal Statistical Society, Series C Applied Statistics}, 62, 309-369. Kaufman, L. and Rousseeuw, P.J. (1990). "Finding Groups in Data: An Introduction to Cluster Analysis". Wiley, New York. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en}} \seealso{\code{\link{cluster.stats}}, \code{\link[cluster]{silhouette}}} \examples{ set.seed(20000) options(digits=3) face <- rFace(50,dMoNo=2,dNoEy=0,p=2) clustering <- as.integer(attr(face,"grouping")) distcritmulti(face,clustering,ns=3,seed=100000,criterion="pearsongamma") } \keyword{cluster}% __ONLY ONE__ keyword per line fpc/man/mergeparameters.Rd0000644000176200001440000000502213467541512015256 0ustar liggesusers\name{mergeparameters} \alias{mergeparameters} %- Also NEED an `\alias' for EACH other topic documented here. \title{New parameters from merging two Gaussian mixture components} \description{ Re-computes pointwise posterior probabilities, mean and covariance matrix for a mixture component obtained by merging two mixture components in a Gaussian mixture. } \usage{ mergeparameters(xdata, j1, j2, probs, muarray,Sigmaarray, z) } %- maybe also `usage' for other objects documented here. \arguments{ \item{xdata}{data (something that can be coerced into a matrix).} \item{j1}{integer. Number of first mixture component to be merged.} \item{j2}{integer. 
Number of second mixture component to be merged.} \item{probs}{vector of component proportions (for all components; should sum up to one).} \item{muarray}{matrix of component means (rows).} \item{Sigmaarray}{array of component covariance matrices (third dimension refers to component number).} \item{z}{matrix of observation- (row-)wise posterior probabilities of belonging to the components (columns).} } \value{ List with components \item{probs}{see above; sum of probabilities for original components \code{j1} and \code{j2} is now \code{probs[j1]}. Note that generally, also for the further components, values for the merged component are in place \code{j1} and values in place \code{j2} are not changed. This means that in order to have only the information for the new mixture after merging, the entries in places \code{j2} need to be suppressed.} \item{muarray}{see above; weighted mean of means of component \code{j1} and \code{j2} is now in place \code{j1}.} \item{Sigmaarray}{see above; weighted covariance matrix handled as above.} \item{z}{see above; original entries for columns \code{j1} and \code{j2} are summed up and now in column \code{j1}.} } \references{ Hennig, C. (2010) Methods for merging Gaussian mixture components, \emph{Advances in Data Analysis and Classification}, 4, 3-34. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \examples{ options(digits=3) set.seed(98765) require(mclust) iriss <- iris[sample(150,20),-5] irisBIC <- mclustBIC(iriss) siris <- summary(irisBIC,iriss) probs <- siris$parameters$pro muarray <- siris$parameters$mean Sigmaarray <- siris$parameters$variance$sigma z <- siris$z mpi <- mergeparameters(iriss,1,2,probs,muarray,Sigmaarray,z) mpi$probs mpi$muarray } \keyword{multivariate} \keyword{cluster} fpc/man/ridgeline.diagnosis.Rd0000644000176200001440000000775113470400604016015 0ustar liggesusers\name{ridgeline.diagnosis} \alias{ridgeline.diagnosis} %- Also NEED an `\alias' for EACH other topic documented here. \title{Ridgeline plots, ratios and unimodality} \description{ Computes ridgeline ratios and unimodality checks for pairs of components given the parameters of a Gaussian mixture. Produces ridgeline plots. } \usage{ ridgeline.diagnosis (propvector,muarray,Sigmaarray, k=length(propvector), ipairs="all", compute.ratio=TRUE,by=0.001, ratiocutoff=NULL,ridgelineplot="matrix") } %- maybe also `usage' for other objects documented here. \arguments{ \item{propvector}{vector of component proportions. Length must be number of components, and must sum up to 1.} \item{muarray}{matrix of component means (different components are in different columns).} \item{Sigmaarray}{three dimensional array with component covariance matrices (the third dimension refers to components).} \item{k}{integer. Number of components.} \item{ipairs}{\code{"all"} or list of vectors of two integers. If \code{ipairs="all"}, computations are carried out for all pairs of components. Otherwise, ipairs gives the pairs of components for which computations are carried out.} \item{compute.ratio}{logical. If \code{TRUE}, a matrix of ridgeline ratios is computed, see Hennig (2010a).} \item{by}{real between 0 and 1. Interval width for density computation along the ridgeline.} \item{ratiocutoff}{real between 0 and 1. 
If not \code{NULL}, the \code{connection.matrix} (see below) is computed by checking whether ridgeline ratios between components are below \code{ratiocutoff}.} \item{ridgelineplot}{one of \code{"none"}, \code{"matrix"}, \code{"pairwise"}. If \code{"matrix"}, a matrix of pairwise ridgeline plots (see Hennig 2010b) will be plotted. If \code{"pairwise"}, pairwise ridgeline plots are plotted (you may want to set \code{par(ask=TRUE)} to see them all). No plotting if \code{"none"}.} } \value{ A list with components \item{merged.clusters}{vector of integers, stating for every mixture component the number of the cluster of components that would be merged by merging connectivity components of the graph specified by \code{connection.matrix}.} \item{connection.matrix}{zero-one matrix, in which a one means that the mixture of the corresponding pair of components of the original mixture is unimodal (if \code{ratiocutoff=NULL}), or that their ridgeline ratio is above \code{ratiocutoff}. If \code{ipairs!="all"}, ignored pairs always have 0 in this matrix, same for \code{ratio.matrix}.} \item{ratio.matrix}{matrix with entries between 0 and 1, giving the ridgeline ratio, which is the density minimum of the mixture of the corresponding pair of components along the ridgeline divided by the minimum of the two maxima closest to the beginning and the end of the ridgeline.} } \references{ Hennig, C. (2010a) Methods for merging Gaussian mixture components, \emph{Advances in Data Analysis and Classification}, 4, 3-34. Hennig, C. (2010b) Ridgeline plot and clusterwise stability as tools for merging Gaussian mixture components. To appear in \emph{Classification as a Tool for Research}, Proceedings of IFCS 2009. Ray, S. and Lindsay, B. G. (2005) The Topography of Multivariate Normal Mixtures, \emph{Annals of Statistics}, 33, 2042-2065. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{ridgeline}}, \code{\link{dridgeline}}, \code{\link{piridge}}, \code{\link{piridge.zeroes}} } \examples{ muarray <- cbind(c(0,0),c(0,0.1),c(10,10)) sigmaarray <- array(c(diag(2),diag(2),diag(2)),dim=c(2,2,3)) rd <- ridgeline.diagnosis(c(0.5,0.3,0.2),muarray,sigmaarray,ridgelineplot="matrix",by=0.1) # Much slower but more precise with default by=0.001. } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/clujaccard.Rd0000644000176200001440000000135513467541512014173 0ustar liggesusers\name{clujaccard} \alias{clujaccard} %- Also NEED an `\alias' for EACH other topic documented here. \title{Jaccard similarity between logical vectors} \description{ Jaccard similarity between logical or 0-1 vectors: \code{sum(c1 & c2)/sum(c1 | c2)}. } \usage{ clujaccard(c1,c2,zerobyzero=NA) } \arguments{ \item{c1}{logical or 0-1-vector.} \item{c2}{logical or 0-1-vector (same length).} \item{zerobyzero}{result if \code{sum(c1 | c2)=0}.} } \value{ Numeric between 0 and 1. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \examples{ c1 <- rep(TRUE,10) c2 <- c(FALSE,rep(TRUE,9)) clujaccard(c1,c2) } \keyword{cluster}% at least one, from doc/KEYWORDS fpc/man/calinhara.Rd0000644000176200001440000000237513467541512014025 0ustar liggesusers\name{calinhara} \alias{calinhara} %- Also NEED an `\alias' for EACH other topic documented here. 
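% Added illustrative sketch (not part of the original fpc documentation): clujaccard
% compares two indicator vectors, so pairwise Jaccard similarities between the
% clusters of two partitions can be obtained by applying it to cluster indicator
% vectors. p1 and p2 below are hypothetical integer label vectors.
%   p1 <- rep(1:2, each=10)
%   p2 <- c(rep(1,8), rep(2,12))
%   jmat <- outer(1:2, 1:2,
%                 Vectorize(function(i, j) clujaccard(p1==i, p2==j)))
%   jmat   # rows: clusters of p1, columns: clusters of p2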
\title{Calinski-Harabasz index} \description{ Calinski-Harabasz index for estimating the number of clusters, based on an observations/variables-matrix here. A distance based version is available through \code{cluster.stats}. } \usage{ calinhara(x,clustering,cn=max(clustering)) } %- maybe also `usage' for other objects documented here. \arguments{ \item{x}{data matrix or data frame.} \item{clustering}{vector of integers. Clustering.} \item{cn}{integer. Number of clusters.} } \value{ Calinski-Harabasz statistic, which is \code{(n-cn)*sum(diag(B))/((cn-1)*sum(diag(W)))}. B being the between-cluster means, and W being the within-clusters covariance matrix. } \references{ Calinski, T., and Harabasz, J. (1974) A Dendrite Method for Cluster Analysis, \emph{Communications in Statistics}, 3, 1-27. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en}} \seealso{\code{\link{cluster.stats}}} \examples{ set.seed(98765) iriss <- iris[sample(150,20),-5] km <- kmeans(iriss,3) round(calinhara(iriss,km$cluster),digits=2) } \keyword{cluster}% __ONLY ONE__ keyword per line fpc/man/kmeansruns.Rd0000644000176200001440000000760513467541512014272 0ustar liggesusers\name{kmeansruns} \alias{kmeansruns} %- Also NEED an `\alias' for EACH other topic documented here. \title{k-means with estimating k and initialisations} \description{ This calls the function \code{\link{kmeans}} to perform a k-means clustering, but initializes the k-means algorithm several times with random points from the data set as means. Furthermore, it is more robust against the occurrence of empty clusters in the algorithm and it estimates the number of clusters by either the Calinski Harabasz index (\code{\link{calinhara}}) or average silhouette width (see \code{\link[cluster]{pam.object}}). The Duda-Hart test (\code{\link{dudahart2}}) is applied to decide whether there should be more than one cluster (unless 1 is excluded as number of clusters). } \usage{ kmeansruns(data,krange=2:10,criterion="ch", iter.max=100,runs=100, scaledata=FALSE,alpha=0.001, critout=FALSE,plot=FALSE,...) } \arguments{ \item{data}{A numeric matrix of data, or an object that can be coerced to such a matrix (such as a numeric vector or a data frame with all numeric columns). } \item{krange}{integer vector. Numbers of clusters which are to be compared by the average silhouette width criterion. Note: average silhouette width and Calinski-Harabasz can't estimate number of clusters \code{nc=1}. If 1 is included, a Duda-Hart test is applied and 1 is estimated if this is not significant.} \item{criterion}{one of \code{"asw"} or \code{"ch"}. Determines whether average silhouette width or Calinski-Harabasz is applied.} \item{iter.max}{integer. The maximum number of iterations allowed.} \item{runs}{integer. Number of starts of the k-means algorithm.} \item{scaledata}{logical. If \code{TRUE}, the variables are centered and scaled to unit variance before execution.} \item{alpha}{numeric between 0 and 1, tuning constant for \code{\link{dudahart2}} (only used for 1-cluster test).} \item{critout}{logical. If \code{TRUE}, the criterion value is printed out for every number of clusters.} \item{plot}{logical. If \code{TRUE}, every clustering resulting from a run of the algorithm is plotted.} \item{...}{further arguments to be passed on to \code{\link{kmeans}}.} } \value{ The output of the optimal run of the \code{\link{kmeans}}-function with added components \code{bestk} and \code{crit}. 
A list with components \item{cluster}{A vector of integers indicating the cluster to which each point is allocated.} \item{centers}{A matrix of cluster centers.} \item{withinss}{The within-cluster sum of squares for each cluster.} \item{size}{The number of points in each cluster.} \item{bestk}{The optimal number of clusters.} \item{crit}{Vector with values of the \code{criterion} for all used numbers of clusters (0 if number not tried).} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \references{ Calinski, T., and Harabasz, J. (1974) A Dendrite Method for Cluster Analysis, \emph{Communications in Statistics}, 3, 1-27. Duda, R. O. and Hart, P. E. (1973) \emph{Pattern Classification and Scene Analysis}. Wiley, New York. Hartigan, J. A. and Wong, M. A. (1979). A K-means clustering algorithm. \emph{Applied Statistics}, 28, 100-108. Kaufman, L. and Rousseeuw, P.J. (1990). "Finding Groups in Data: An Introduction to Cluster Analysis". Wiley, New York. } \seealso{ \code{\link{kmeans}}, \code{\link{pamk}}, \code{\link{calinhara}}, \code{\link{dudahart2}}) } \examples{ options(digits=3) set.seed(20000) face <- rFace(50,dMoNo=2,dNoEy=0,p=2) pka <- kmeansruns(face,krange=1:5,critout=TRUE,runs=2,criterion="asw") pkc <- kmeansruns(face,krange=1:5,critout=TRUE,runs=2,criterion="ch") } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/awcoord.Rd0000644000176200001440000001061713467541512013537 0ustar liggesusers\name{awcoord} \alias{awcoord} %- Also NEED an `\alias' for EACH other topic documented here. \title{Asymmetric weighted discriminant coordinates} \description{ Asymmetric weighted discriminant coordinates as defined in Hennig (2003). Asymmetric discriminant projection means that there are two classes, one of which is treated as the homogeneous class (i.e., it should appear homogeneous and separated in the resulting projection) while the other may be heterogeneous. The principle is to maximize the ratio between the projection of a between classes separation matrix and the projection of the covariance matrix within the homogeneous class. Points are weighted according to their (robust) Mahalanobis distance to the homogeneous class. } \usage{ awcoord(xd, clvecd, clnum=1, mahal="square", method="classical", clweight=switch(method,classical=FALSE,TRUE), alpha=0.99, subsample=0, countmode=1000, ...) } %- maybe also `usage' for other objects documented here. \arguments{ \item{xd}{the data matrix; a numerical object which can be coerced to a matrix.} \item{clvecd}{integer vector of class numbers; length must equal \code{nrow(xd)}.} \item{clnum}{integer. Number of the homogeneous class.} \item{mahal}{"md" or "square". If "md", the points are weighted by the square root of the \code{alpha}-quantile of the corresponding chi squared distribution over the roots of their Mahalanobis distance to the homogeneous class, unless this is smaller than 1. If "square" (which is recommended), the (originally squared) Mahalanobis distance and the unrooted quantile is used.} \item{method}{one of "mve", "mcd" or "classical". Covariance matrix used within the homogeneous class and for the computation of the Mahalanobis distances. "mcd" and "mve" are robust covariance matrices as implemented in \code{\link{cov.rob}}. "classical" refers to the classical covariance matrix.} \item{clweight}{logical. If \code{FALSE}, only the points of the heterogeneous class are weighted. 
This, together with \code{method="classical"}, computes AWC as defined in Hennig (2003). If \code{TRUE}, all points are weighted. This, together with \code{method="mcd"}, computes ARC as defined in Hennig (2003).} \item{alpha}{numeric between 0 and 1. The corresponding quantile of the chi squared distribution is used for the downweighting of points. Points with a smaller Mahalanobis distance to the homogeneous class get full weight.} \item{subsample}{integer. If 0, all points are used. Else, only a subsample of \code{subsample} of the points is used.} \item{countmode}{optional positive integer. Every \code{countmode} algorithm runs \code{awcoord} shows a message.} \item{...}{no effect} } \details{ The square root of the homogeneous classes covariance matrix is inverted by use of \code{\link{tdecomp}}, which can be expected to give reasonable results for singular within-class covariance matrices. } % \details{ % } \value{ List with the following components \item{ev}{eigenvalues in descending order.} \item{units}{columns are coordinates of projection basis vectors. New points \code{x} can be projected onto the projection basis vectors by \code{x \%*\% units}} \item{proj}{projections of \code{xd} onto \code{units}.} } \references{ Hennig, C. (2004) Asymmetric linear dimension reduction for classification. Journal of Computational and Graphical Statistics 13, 930-945 . Hennig, C. (2005) A method for visual cluster validation. In: Weihs, C. and Gaul, W. (eds.): Classification - The Ubiquitous Challenge. Springer, Heidelberg 2005, 153-160. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{plotcluster}} for straight forward discriminant plots. \code{\link{discrproj}} for alternatives. \code{\link{rFace}} for generation of the example data used below. } \examples{ set.seed(4634) face <- rFace(600,dMoNo=2,dNoEy=0) grface <- as.integer(attr(face,"grouping")) awcf <- awcoord(face,grface==1) # awcf2 <- ancoord(face,grface==1, method="mcd") plot(awcf$proj,col=1+(grface==1)) # plot(awcf2$proj,col=1+(grface==1)) # ...done in one step by function plotcluster. } \keyword{multivariate}% at least one, from doc/KEYWORDS \keyword{classif}% __ONLY ONE__ keyword per line fpc/man/valstat.object.Rd0000644000176200001440000001440214355615570015023 0ustar liggesusers\name{valstat.object} \alias{valstat.object} %- Also NEED an `\alias' for EACH other topic documented here. \title{Cluster validation statistics - object} \description{ The objects of class \code{"valstat"} store cluster validation statistics from various clustering methods run with various numbers of clusters. } \section{GENERATION}{ These objects are generated as part of the \code{\link{clusterbenchstats}}-output. } \section{METHODS}{ The \code{valstat} class has methods for the following generic functions: \code{print}, \code{plot}, see \code{\link{plot.valstat}}. } \value{ A legitimate \code{valstat} object is a list. The format of the list relies on the number of involved clustering methods, \code{nmethods}, say, i.e., the length of the \code{method}-component explained below. The first \code{nmethods} elements of the \code{valstat}-list are just numbered. These are themselves lists that are numbered between 1 and the \code{maxG}-component defined below. Element \code{[[i]][[j]]} refers to the clustering from clustering method number i with number of clusters j. 
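% Added illustration (not part of the original fpc documentation): for a valstat
% object vs produced by clusterbenchstats, the indexing just described means that
%   vs[[2]][[3]]$asw
% would be the average silhouette width of the clustering obtained from the second
% method in vs$method with 3 clusters (assuming vs$minG <= 3 <= vs$maxG).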
Every such element is a list with components \code{avewithin, mnnd, cvnnd, maxdiameter, widestgap, sindex, minsep, asw, dindex, denscut, highdgap, pearsongamma, withinss, entropy}: Further optional components are \code{pamc, kdnorm, kdunif, dmode, aggregated}. All these are cluster validation indexes, as follows. \item{avewithin}{average distance within clusters (reweighted so that every observation, rather than every distance, has the same weight).} \item{mnnd}{average distance to \code{nnk}th nearest neighbour within cluster. (\code{nnk} is a parameter of \code{\link{cqcluster.stats}}, default 2.)} \item{cvnnd}{coefficient of variation of dissimilarities to \code{nnk}th nearest wthin-cluster neighbour, measuring uniformity of within-cluster densities, weighted over all clusters, see Sec. 3.7 of Hennig (2019). (\code{nnk} is a parameter of \code{\link{cqcluster.stats}}, default 2.)} \item{maxdiameter}{maximum cluster diameter.} \item{widestgap}{widest within-cluster gap or average of cluster-wise widest within-cluster gap, depending on parameter \code{averagegap} of \code{\link{cqcluster.stats}}, default \code{FALSE}.} \item{sindex}{separation index. Defined based on the distances for every point to the closest point not in the same cluster. The separation index is then the mean of the smallest proportion \code{sepprob} (parameter of \code{\link{cqcluster.stats}}, default 0.1) of these. See Hennig (2019).} \item{minsep}{minimum cluster separation.} \item{asw}{average silhouette width. See \code{\link{silhouette}}.} \item{dindex}{this index measures to what extent the density decreases from the cluster mode to the outskirts; I-densdec in Sec. 3.6 of Hennig (2019); low values are good.} \item{denscut}{this index measures whether cluster boundaries run through density valleys; I-densbound in Sec. 3.6 of Hennig (2019); low values are good.} \item{highdgap}{this measures whether there is a large within-cluster gap with high density on both sides; I-highdgap in Sec. 3.6 of Hennig (2019); low values are good.} \item{pearsongamma}{correlation between distances and a 0-1-vector where 0 means same cluster, 1 means different clusters. "Normalized gamma" in Halkidi et al. (2001).} \item{withinss}{a generalisation of the within clusters sum of squares (k-means objective function), which is obtained if \code{d} is a Euclidean distance matrix. For general distance measures, this is half the sum of the within cluster squared dissimilarities divided by the cluster size.} \item{entropy}{entropy of the distribution of cluster memberships, see Meila(2007).} \item{pamc}{average distance to cluster centroid, which is the observation that minimises this average distance.} \item{kdnorm}{Kolmogorov distance between distribution of within-cluster Mahalanobis distances and appropriate chi-squared distribution, aggregated over clusters (I am grateful to Agustin Mayo-Iscar for the idea).} \item{kdunif}{Kolmogorov distance between distribution of distances to \code{dnnk}th nearest within-cluster neighbor and appropriate Gamma-distribution, see Byers and Raftery (1998), aggregated over clusters. 
\code{dnnk} is parameter \code{nnk} of \code{\link{distrsimilarity}}, corresponding to \code{dnnk} of \code{\link{clusterbenchstats}}.} \item{dmode}{aggregated density mode index equal to \code{0.75*dindex+0.25*highdgap} after standardisation by \code{\link{cgrestandard}}.} Furthermore, a \code{valstat} object has the following list components: \item{maxG}{maximum number of clusters.} \item{minG}{minimum number of clusters (list entries below that number are empty lists).} \item{method}{vector of names (character strings) of clustering CBI-functions, see \code{\link{kmeansCBI}}.} \item{name}{vector of names (character strings) of clustering methods. These can be user-chosen names (see argument \code{methodsnames} in \code{\link{clusterbenchstats}}) and may distinguish different methods run by the same CBI-function but with different parameter values such as complete and average linkage for \code{\link{hclustCBI}}.} \item{statistics}{vector of names (character strings) of cluster validation indexes.} } \references{ Hennig, C. (2019) Cluster validation by measurement of clustering characteristics relevant to the user. In C. H. Skiadas (ed.) \emph{Data Analysis and Applications 1: Clustering and Regression, Modeling-estimating, Forecasting and Data Mining, Volume 2}, Wiley, New York 1-24, \url{https://arxiv.org/abs/1703.09282} Akhanli, S. and Hennig, C. (2020) Calibrating and aggregating cluster validity indexes for context-adapted comparison of clusterings. \emph{Statistics and Computing}, 30, 1523-1544, \url{https://link.springer.com/article/10.1007/s11222-020-09958-2}, \url{https://arxiv.org/abs/2002.01822} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{clusterbenchstats}}, \code{\link{plot.valstat}}. } \keyword{cluster}% at least one, from doc/KEYWORDS fpc/man/dbscan.Rd0000644000176200001440000001026613470376516013337 0ustar liggesusers\name{dbscan} \alias{dbscan} \alias{print.dbscan} \alias{plot.dbscan} \alias{predict.dbscan} \title{DBSCAN density reachability and connectivity clustering} \description{ Generates a density based clustering of arbitrary shape as introduced in Ester et al. (1996). } \usage{ dbscan(data, eps, MinPts = 5, scale = FALSE, method = c("hybrid", "raw", "dist"), seeds = TRUE, showplot = FALSE, countmode = NULL) \method{print}{dbscan}(x, ...) \method{plot}{dbscan}(x, data, ...) \method{predict}{dbscan}(object, data, newdata = NULL, predict.max=1000, ...) } \arguments{ \item{data}{data matrix, data.frame, dissimilarity matrix or \code{dist}-object. Specify \code{method="dist"} if the data should be interpreted as dissimilarity matrix or object. Otherwise Euclidean distances will be used.} \item{eps}{ Reachability distance, see Ester et al. (1996). } \item{MinPts}{ Reachability minimum no. of points, see Ester et al. (1996). } \item{scale}{ scale the data if \code{TRUE}. } \item{method}{ "dist" treats data as distance matrix (relatively fast but memory expensive), "raw" treats data as raw data and avoids calculating a distance matrix (saves memory but may be slow), "hybrid" expects also raw data, but calculates partial distance matrices (very fast with moderate memory requirements).} \item{seeds}{FALSE to not include the \code{isseed}-vector in the \code{dbscan}-object.} \item{showplot}{ 0 = no plot, 1 = plot per iteration, 2 = plot per subiteration. } \item{countmode}{ NULL or vector of point numbers at which to report progress. 
} \item{x}{object of class \code{dbscan}.} \item{object}{object of class \code{dbscan}.} \item{newdata}{ matrix or data.frame with raw data to predict. } \item{predict.max}{ max. batch size for predictions. } \item{...}{Further arguments transferred to plot methods.} } \details{ Clusters require a minimum no of points (MinPts) within a maximum distance (eps) around one of its members (the seed). Any point within eps around any point which satisfies the seed condition is a cluster member (recursively). Some points may not belong to any clusters (noise). We have clustered a 100.000 x 2 dataset in 40 minutes on a Pentium M 1600 MHz. \code{print.dbscan} shows a statistic of the number of points belonging to the clusters that are seeds and border points. \code{plot.dbscan} distinguishes between seed and border points by plot symbol. } \value{ \code{predict.dbscan} gives out a vector of predicted clusters for the points in \code{newdata}. \code{dbscan} gives out an object of class 'dbscan' which is a LIST with components \item{cluster}{integer vector coding cluster membership with noise observations (singletons) coded as 0 } \item{isseed}{logical vector indicating whether a point is a seed (not border, not noise)} \item{eps}{parameter eps} \item{MinPts}{parameter MinPts} } \references{ Martin Ester, Hans-Peter Kriegel, Joerg Sander, Xiaowei Xu (1996). A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise. Institute for Computer Science, University of Munich. Proceedings of 2nd International Conference on Knowledge Discovery and Data Mining (KDD-96). } \author{Jens Oehlschlaegel, based on a draft by Christian Hennig.} \note{ this is a simplified version of the original algorithm (no K-D-trees used), thus we have \eqn{o(n^2)} instead of \eqn{o(n*log(n))} } \examples{ set.seed(665544) n <- 600 x <- cbind(runif(10, 0, 10)+rnorm(n, sd=0.2), runif(10, 0, 10)+rnorm(n, sd=0.2)) par(bg="grey40") ds <- dbscan(x, 0.2) # run with showplot=1 to see how dbscan works. ds plot(ds, x) x2 <- matrix(0,nrow=4,ncol=2) x2[1,] <- c(5,2) x2[2,] <- c(8,3) x2[3,] <- c(4,4) x2[4,] <- c(9,9) predict(ds, x, x2) n <- 600 x <- cbind((1:3)+rnorm(n, sd=0.2), (1:3)+rnorm(n, sd=0.2)) # Not run, but results from my machine are 0.105 - 0.068 - 0.255: # system.time(ds <- dbscan(x, 0.3, countmode=NULL, method="raw"))[3] # system.time(dsb <- dbscan(x, 0.3, countmode=NULL, method="hybrid"))[3] # system.time(dsc <- dbscan(dist(x), 0.3, countmode=NULL, # method="dist"))[3] } \keyword{multivariate} \keyword{cluster} fpc/man/piridge.zeroes.Rd0000644000176200001440000000301613467541512015025 0ustar liggesusers\name{piridge.zeroes} \alias{piridge.zeroes} %- Also NEED an `\alias' for EACH other topic documented here. \title{Extrema of two-component Gaussian mixture} \description{ By use of the Pi-function in Ray and Lindsay, 2005, locations of two-component Gaussian mixture density extrema or saddlepoints are computed. } \usage{ piridge.zeroes(prop, mu1, mu2, Sigma1, Sigma2, alphamin=0, alphamax=1,by=0.001) } %- maybe also `usage' for other objects documented here. 
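% Added sketch (not part of the original fpc documentation), assuming the objects x
% and ds from the dbscan example above: cluster sizes, noise points and the
% seed/border split can be read off the components documented for dbscan.
%   table(ds$cluster)               # cluster sizes; the entry for 0 counts noise points
%   sum(ds$cluster == 0)            # number of noise points
%   table(ds$cluster, ds$isseed)    # seed vs. border points per cluster (seeds=TRUE)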
\arguments{ \item{prop}{proportion of mixture component 1.} \item{mu1}{mean vector of component 1.} \item{mu2}{mean vector of component 2.} \item{Sigma1}{covariance matrix of component 1.} \item{Sigma2}{covariance matrix of component 2.} \item{alphamin}{minimum alpha value.} \item{alphamax}{maximum alpha value.} \item{by}{interval between alpha-values where to look for extrema.} } \value{ list with components \item{number.zeroes}{number of zeroes of Pi-function, i.e., extrema or saddlepoints of density.} \item{estimated.roots}{estimated \code{alpha}-values at which extrema or saddlepoints occur.} } \references{ Ray, S. and Lindsay, B. G. (2005) The Topography of Multivariate Normal Mixtures, \emph{Annals of Statistics}, 33, 2042-2065. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \examples{ q <- piridge.zeroes(0.2,c(1,1),c(2,5),diag(2),diag(2),by=0.1) } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/plotcluster.Rd0000644000176200001440000001375113467541512014463 0ustar liggesusers\name{plotcluster} \alias{plotcluster} %- Also NEED an `\alias' for EACH other topic documented here. \title{Discriminant projection plot.} \description{ Plots to distinguish given classes by ten available projection methods. Includes classical discriminant coordinates, methods to project differences in mean and covariance structure, asymmetric methods (separation of a homogeneous class from a heterogeneous one), local neighborhood-based methods and methods based on robust covariance matrices. One-dimensional data is plotted against the cluster number. } \usage{ plotcluster(x, clvecd, clnum=NULL, method=ifelse(is.null(clnum),"dc","awc"), bw=FALSE, ignorepoints=FALSE, ignorenum=0, pointsbyclvecd=TRUE, xlab=NULL, ylab=NULL, pch=NULL, col=NULL, ...) } %- maybe also `usage' for other objects documented here. \arguments{ \item{x}{the data matrix; a numerical object which can be coerced to a matrix.} \item{clvecd}{vector of class numbers which can be coerced into integers; length must equal \code{nrow(xd)}.} \item{method}{one of \describe{ \item{"dc"}{usual discriminant coordinates, see \code{\link{discrcoord}},} \item{"bc"}{Bhattacharyya coordinates, first coordinate showing mean differences, second showing covariance matrix differences, see \code{\link{batcoord}},} \item{"vbc"}{variance dominated Bhattacharyya coordinates, see \code{\link{batcoord}},} \item{"mvdc"}{added mean and variance differences optimizing coordinates, see \code{\link{mvdcoord}},} \item{"adc"}{asymmetric discriminant coordinates, see \code{\link{adcoord}},} \item{"awc"}{asymmetric discriminant coordinates with weighted observations, see \code{\link{awcoord}},} \item{"arc"}{asymmetric discriminant coordinates with weighted observations and robust MCD-covariance matrix, see \code{\link{awcoord}},} \item{"nc"}{neighborhood based coordinates, see \code{\link{ncoord}},} \item{"wnc"}{neighborhood based coordinates with weighted neighborhoods, see \code{\link{ncoord}},} \item{"anc"}{asymmetric neighborhood based coordinates, see \code{\link{ancoord}}.} } Note that "bc", "vbc", "adc", "awc", "arc" and "anc" assume that there are only two classes.} \item{clnum}{integer. Number of the class which is attempted to plot homogeneously by "asymmetric methods", which are the methods assuming that there are only two classes, as indicated above. \code{clnum} is ignored for methods "dc" and "nc".} \item{bw}{logical. 
If \code{TRUE}, the classes are distinguished by symbols, and the default color is black/white. If \code{FALSE}, the classes are distinguished by colors, and the default symbol is \code{pch=1}.} \item{ignorepoints}{logical. If \code{TRUE}, points with label \code{ignorenum} in \code{clvecd} are ignored in the computation for \code{method} and are only projected afterwards onto the resulting units. If \code{pch=NULL}, the plot symbol for these points is "N".} \item{ignorenum}{one of the potential values of the components of \code{clvecd}. Only has effect if \code{ignorepoints=TRUE}, see above.} \item{pointsbyclvecd}{logical. If \code{TRUE} and \code{pch=NULL} and/or \code{col=NULL}, some hopefully suitable plot symbols (numbers and letters) and colors are chosen to distinguish the values of \code{clvecd}, starting with "1"/"black" for the cluster with the smallest \code{clvecd}-code (note that colors for clusters with numbers larger than minimum number \code{+3} are drawn at random from all available colors). \code{FALSE} produces potentially less reasonable (but nonrandom) standard colors and symbols if \code{method} is "dc" or "nc", and will only distinguish whether \code{clvecd=clnum} or not for the other methods.} \item{xlab}{label for x-axis. If \code{NULL}, a default text is used.} \item{ylab}{label for y-axis. If \code{NULL}, a default text is used.} \item{pch}{plotting symbol, see \code{\link{par}}. If \code{NULL}, the default is used.} \item{col}{plotting color, see \code{\link{par}}. If \code{NULL}, the default is used.} \item{...}{additional parameters passed to \code{plot} or the projection methods.} } % \details{ % } \note{ For some of the asymmetric methods, the area in the plot occupied by the "homogeneous class" (see \code{clnum} above) may be very small, and it may make sense to run \code{plotcluster} a second time specifying plot parameters \code{xlim} and \code{ylim} in a suitable way. It often makes sense to magnify the plot region containing the homogeneous class in this way so that its separation from the rest can be seen more clearly. } \references{ Hennig, C. (2004) Asymmetric linear dimension reduction for classification. Journal of Computational and Graphical Statistics 13, 930-945 . Hennig, C. (2005) A method for visual cluster validation. In: Weihs, C. and Gaul, W. (eds.): Classification - The Ubiquitous Challenge. Springer, Heidelberg 2005, 153-160. Seber, G. A. F. (1984). \emph{Multivariate Observations}. New York: Wiley. Fukunaga (1990). \emph{Introduction to Statistical Pattern Recognition} (2nd ed.). Boston: Academic Press. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/}} \seealso{ \code{\link{discrcoord}}, \code{\link{batcoord}}, \code{\link{mvdcoord}}, \code{\link{adcoord}}, \code{\link{awcoord}}, \code{\link{ncoord}}, \code{\link{ancoord}}. \code{\link{discrproj}} is an interface to all these projection methods. \code{\link{rFace}} for generation of the example data used below. } \examples{ set.seed(4634) face <- rFace(300,dMoNo=2,dNoEy=0) grface <- as.integer(attr(face,"grouping")) plotcluster(face,grface) plotcluster(face,grface==1) plotcluster(face,grface, clnum=1, method="vbc") } \keyword{multivariate}% at least one, from doc/KEYWORDS \keyword{classif}% __ONLY ONE__ keyword per line fpc/man/itnumber.Rd0000644000176200001440000000304513467541512013723 0ustar liggesusers\name{itnumber} \alias{itnumber} %- Also NEED an `\alias' for EACH other topic documented here. 
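% Added sketch (not part of the original fpc documentation), following the Note
% above and assuming face and grface from the plotcluster examples: the projection
% can be re-plotted with user-chosen xlim/ylim (passed on via ...) to magnify the
% region of the homogeneous class. The limits below are arbitrary illustrations.
%   plotcluster(face, grface==1, clnum=1, method="awc")
%   plotcluster(face, grface==1, clnum=1, method="awc", xlim=c(-2,2), ylim=c(-2,2))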
\title{Number of regression fixed point cluster iterations} \description{ Computes the number of fixed point iterations needed by \code{\link{fixreg}} to find \code{mtf} times a fixed point cluster (FPC) of size \code{cn} with an approximated probability of \code{prob}. Thought for use within \code{\link{fixreg}}. } \usage{ itnumber(n, p, cn, mtf, prob = 0.95, maxir = 20000) } %- maybe also `usage' for other objects documented here. \arguments{ \item{n}{positive integer. Total number of points.} \item{p}{positive integer. Number of independent variables.} \item{cn}{positive integer smaller than or equal to \code{n}. Size of the FPC.} \item{mtf}{positive integer.} \item{prob}{number between 0 and 1.} \item{maxir}{positive integer. \code{itnumber} is set to this value if it would otherwise be larger.} } \details{ The computation is based on the binomial distribution with probability given by \code{\link{clusexpect}} with \code{ir=1}. } \value{ An integer. } \references{ Hennig, C. (2002) Fixed point clusters for linear regression: computation and comparison, \emph{Journal of Classification} 19, 249-276. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/}} \seealso{\code{\link{fixreg}}, \code{\link{clusexpect}}} \examples{ itnumber(500,4,150,2) } \keyword{univar}% at least one, from doc/KEYWORDS \keyword{cluster}% __ONLY ONE__ keyword per line fpc/man/mvdcoord.Rd0000644000176200001440000000462413467541512013717 0ustar liggesusers\name{mvdcoord} \alias{mvdcoord} %- Also NEED an `\alias' for EACH other topic documented here. \title{Mean/variance differences discriminant coordinates} \description{ Discriminant projections as defined in Young, Marco and Odell (1987). The principle is to maximize the projection of a matrix consisting of the differences between the means of all classes and the first mean, and the differences between the covariance matrices of all classes and the first covariance matrix. } \usage{ mvdcoord(xd, clvecd, clnum=1, sphere="mcd", ...) } %- maybe also `usage' for other objects documented here. \arguments{ \item{xd}{the data matrix; a numerical object which can be coerced to a matrix.} \item{clvecd}{integer vector of class numbers; length must equal \code{nrow(xd)}.} \item{clnum}{integer. Number of the class to which all differences are computed.} \item{sphere}{a covariance matrix or one of "mve", "mcd", "classical", "none". The matrix used for sphering the data. "mcd" and "mve" are robust covariance matrices as implemented in \code{\link{cov.rob}}. "classical" refers to the classical covariance matrix. "none" means no sphering and use of the raw data.} \item{...}{no effect} } % \details{ % } \value{ List with the following components \item{ev}{eigenvalues in descending order.} \item{units}{columns are coordinates of projection basis vectors. New points \code{x} can be projected onto the projection basis vectors by \code{x \%*\% units}} \item{proj}{projections of \code{xd} onto \code{units}.} } \references{ Young, D. M., Marco, V. R. and Odell, P. L. (1987). Quadratic discrimination: some results on optimal low-dimensional representation, \emph{Journal of Statistical Planning and Inference}, 17, 307-319. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{plotcluster}} for straight forward discriminant plots. \code{\link{discrproj}} for alternatives. \code{\link{rFace}} for generation of the example data used below. 
} \examples{ set.seed(4634) face <- rFace(300,dMoNo=2,dNoEy=0,p=3) grface <- as.integer(attr(face,"grouping")) mcf <- mvdcoord(face,grface) plot(mcf$proj,col=grface) # ...done in one step by function plotcluster. } \keyword{multivariate}% at least one, from doc/KEYWORDS \keyword{classif}% __ONLY ONE__ keyword per line fpc/man/discrcoord.Rd0000644000176200001440000000432713467541512014235 0ustar liggesusers\name{discrcoord} \alias{discrcoord} %- Also NEED an `\alias' for EACH other topic documented here. \title{Discriminant coordinates/canonical variates} \description{ Computes discriminant coordinates, sometimes referred to as "canonical variates" as described in Seber (1984). } \usage{ discrcoord(xd, clvecd, pool = "n", ...) } %- maybe also `usage' for other objects documented here. \arguments{ \item{xd}{the data matrix; a numerical object which can be coerced to a matrix.} \item{clvecd}{integer vector of class numbers; length must equal \code{nrow(xd)}.} \item{pool}{string. Determines how the within classes covariance is pooled. "n" means that the class covariances are weighted corresponding to the number of points in each class (default). "equal" means that all classes get equal weight.} \item{...}{no effect} } \details{ The matrix T (see Seber (1984), p. 270) is inverted by use of \code{\link{tdecomp}}, which can be expected to give reasonable results for singular within-class covariance matrices. } \value{ List with the following components \item{ev}{eigenvalues in descending order.} \item{units}{columns are coordinates of projection basis vectors. New points \code{x} can be projected onto the projection basis vectors by \code{x \%*\% units}} \item{proj}{projections of \code{xd} onto \code{units}.} } \references{ Seber, G. A. F. (1984). \emph{Multivariate Observations}. New York: Wiley. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{plotcluster}} for straight forward discriminant plots. \code{\link{batcoord}} for discriminating projections for two classes, so that also the differences in variance are shown (\code{discrcoord} is based only on differences in mean). \code{\link{rFace}} for generation of the example data used below. } \examples{ set.seed(4634) face <- rFace(600,dMoNo=2,dNoEy=0) grface <- as.integer(attr(face,"grouping")) dcf <- discrcoord(face,grface) plot(dcf$proj,col=grface) # ...done in one step by function plotcluster. } \keyword{multivariate}% at least one, from doc/KEYWORDS \keyword{classif}% __ONLY ONE__ keyword per line fpc/man/mahalanofix.Rd0000644000176200001440000000530013470377246014366 0ustar liggesusers\name{mahalanofix} \alias{mahalanofix} \alias{mahalanofuz} %- Also NEED an `\alias' for EACH other topic documented here. \title{Mahalanobis distances from center of indexed points} \description{ Computes the vector of (classical or robust) Mahalanobis distances of all points of \code{x} to the center of the points indexed (or weighted) by \code{gv}. The latter also determine the covariance matrix. Thought for use within \code{\link{fixmahal}}. } \usage{ mahalanofix(x, n = nrow(as.matrix(x)), p = ncol(as.matrix(x)), gv = rep(1, times = n), cmax = 1e+10, method = "ml") mahalanofuz(x, n = nrow(as.matrix(x)), p = ncol(as.matrix(x)), gv = rep(1, times=n), cmax = 1e+10) } %- maybe also `usage' for other objects documented here. \arguments{ \item{x}{a numerical data matrix, rows are points, columns are variables.} \item{n}{positive integer. 
Number of points.} \item{p}{positive integer. Number of variables.} \item{gv}{for \code{mahalanofix} a logical or 0-1 vector of length \code{n}. For \code{mahalanofuz} a numerical vector with values between 0 and 1.} \item{cmax}{positive number. Used in \code{\link{solvecov}} if covariance matrix is singular.} \item{method}{\code{"ml"}, \code{"classical"}, \code{"mcd"} or \code{"mve"}. Method to compute the covariance matrix estimator. See \code{\link{cov.rob}}, \code{\link{fixmahal}}.} } \details{ \code{\link{solvecov}} is used to invert the covariance matrix. The methods \code{"mcd"} and \code{"mve"} in \code{mahalanofix} do not work properly with point constellations with singular covariance matrices! } \value{ A list of the following components: \item{md}{vector of Mahalanobis distances.} \item{mg}{mean of the points indexed by \code{gv}, weighted mean in \code{mahalanofuz}.} \item{covg}{covariance matrix of the points indexed by \code{gv}, weighted covariance matrix in \code{mahalanofuz}.} \item{covinv}{\code{covg} inverted by \code{\link{solvecov}}.} \item{coll}{logical. If \code{TRUE}, \code{covg} has been (numerically) singular.} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/}} \note{Methods \code{"mcd"} and \code{"mve"} require library \code{lqs}.} \seealso{\code{\link{fixmahal}}, \code{\link{solvecov}}, \code{\link{cov.rob}}} \examples{ x <- c(1,2,3,4,5,6,7,8,9,10) y <- c(1,2,3,8,7,6,5,8,9,10) mahalanofix(cbind(x,y),gv=c(0,0,0,1,1,1,1,1,0,0)) mahalanofix(cbind(x,y),gv=c(0,0,0,1,1,1,1,0,0,0)) mahalanofix(cbind(x,y),gv=c(0,0,0,1,1,1,1,1,0,0),method="mcd") mahalanofuz(cbind(x,y),gv=c(0,0,0.5,0.5,1,1,1,0.5,0.5,0)) } \keyword{multivariate}% at least one, from doc/KEYWORDS fpc/man/clusexpect.Rd0000644000176200001440000000275313467541512014260 0ustar liggesusers\name{clusexpect} \alias{clusexpect} %- Also NEED an `\alias' for EACH other topic documented here. \title{Expected value of the number of times a fixed point cluster is found} \description{ A rough approximation of the expectation of the number of times a well separated fixed point cluster (FPC) of size \code{n} is found in \code{ir} fixed point iterations of \code{\link{fixreg}}. } \usage{ clusexpect(n, p, cn, ir) } %- maybe also `usage' for other objects documented here. \arguments{ \item{n}{positive integer. Total number of points.} \item{p}{positive integer. Number of independent variables.} \item{cn}{positive integer smaller than or equal to \code{n}. Size of the FPC.} \item{ir}{positive integer. Number of fixed point iterations.} } \details{ The approximation is based on the assumption that a well separated FPC is found iff all \code{p+2} points of the initial configuration come from the FPC. The value is \code{ir} times the probability for this. For a discussion of this assumption cf. Hennig (2002). } \value{ A number. } \references{ Hennig, C. (2002) Fixed point clusters for linear regression: computation and comparison, \emph{Journal of Classification} 19, 249-276. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/}} \seealso{\code{\link{fixreg}}} \examples{ round(clusexpect(500,4,150,2000),digits=2) } \keyword{univar}% at least one, from doc/KEYWORDS \keyword{cluster} fpc/man/concomp.Rd0000644000176200001440000000301213467541512013530 0ustar liggesusers\name{con.comp} \alias{con.comp} %- Also NEED an `\alias' for EACH other topic documented here. 
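% Added sketch (not part of the original fpc documentation): under the assumption
% stated in the Details of clusexpect, the probability that one initial
% configuration of p+2 points lies entirely within the FPC is hypergeometric, so a
% rough hand computation (not necessarily identical to the internals of clusexpect)
% for the example above is
%   n <- 500; p <- 4; cn <- 150; ir <- 2000
%   ir * choose(cn, p+2) / choose(n, p+2)   # compare with clusexpect(500,4,150,2000)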
\title{Connectivity components of an undirected graph} \description{ Computes the connectivity components of an undirected graph from a matrix giving the edges. } \usage{ con.comp(comat) } %- maybe also `usage' for other objects documented here. \arguments{ \item{comat}{a symmetric logical or 0-1 matrix, where \code{comat[i,j]=TRUE} means that there is an edge between vertices \code{i} and \code{j}. The diagonal is ignored.} } \details{ The "depth-first search" algorithm of Cormen, Leiserson and Rivest (1990, p. 477) is used. } \value{ An integer vector, giving the number of the connectivity component for each vertex. } \references{ Cormen, T. H., Leiserson, C. E. and Rivest, R. L. (1990), \emph{Introduction to Algorithms}, Cambridge: MIT Press. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{hclust}}, \code{\link{cutree}} for cut single linkage trees (often equivalent). } \examples{ set.seed(1000) x <- rnorm(20) m <- matrix(0,nrow=20,ncol=20) for(i in 1:20) for(j in 1:20) m[i,j] <- abs(x[i]-x[j]) d <- m<0.2 cc <- con.comp(d) max(cc) # number of connectivity components plot(x,cc) # The same should be produced by # cutree(hclust(as.dist(m),method="single"),h=0.2). } \keyword{array}% at least one, from doc/KEYWORDS \keyword{cluster}% __ONLY ONE__ keyword per line fpc/man/cvnn.Rd0000644000176200001440000000345513467541512013047 0ustar liggesusers\name{cvnn} \alias{cvnn} %- Also NEED an `\alias' for EACH other topic documented here. \title{Cluster validation based on nearest neighbours} \description{ Cluster validity index based on nearest neighbours as defined in Liu et al. (2013) with a correction explained in Halkidi et al. (2015). } \usage{ cvnn(d=NULL,clusterings,k=5) } %- maybe also `usage' for other objects documented here. \arguments{ \item{d}{dissimilarity matrix or \code{dist}-object.} \item{clusterings}{list of vectors of integers with length \code{=nrow(d)}; indicating the cluster for each observation for several clusterings (list elements) to be compared.} \item{k}{integer. Number of nearest neighbours.} } \value{ List with components (see Liu et al. (2013), Halkidi et al. (2015) for details) \item{cvnnindex}{vector of index values for the various clusterings, see Liu et al. (2013), the lower the better.} \item{sep}{vector of separation values.} \item{comp}{vector of compactness values.} } \references{ Halkidi, M., Vazirgiannis, M. and Hennig, C. (2015) Method-independent indices for cluster validation. In C. Hennig, M. Meila, F. Murtagh, R. Rocci (eds.) \emph{Handbook of Cluster Analysis}, CRC Press/Taylor \code{&} Francis, Boca Raton. Liu, Y., Li, Z., Xiong, H., Gao, X., Wu, J. and Wu, S. (2013) Understanding and enhancement of internal clustering validation measures. \emph{IEEE Transactions on Cybernetics} 43, 982-994. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \examples{ options(digits=3) iriss <- as.matrix(iris[c(1:10,51:55,101:105),-5]) irisc <- as.numeric(iris[c(1:10,51:55,101:105),5]) print(cvnn(dist(iriss),list(irisc,rep(1:4,5)))) } \keyword{cluster}% at least one, from doc/KEYWORDS fpc/man/flexmixedruns.Rd0000644000176200001440000001637213470377054014775 0ustar liggesusers\name{flexmixedruns} \alias{flexmixedruns} %- Also NEED an `\alias' for EACH other topic documented here. 
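% Added sketch (not part of the original fpc documentation), assuming m and cc from
% the con.comp example above: the equivalence to cutting a single linkage tree that
% is mentioned in the example can be checked directly (cluster labels may differ,
% so compare the induced partitions via a cross-table).
%   cc2 <- cutree(hclust(as.dist(m), method="single"), h=0.2)
%   table(cc, cc2)   # each cluster of cc should correspond to exactly one cluster of cc2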
\title{Fitting mixed Gaussian/multinomial mixtures with flexmix} \description{ \code{flexmixedruns} fits a latent class mixture (clustering) model where some variables are continuous and modelled within the mixture components by Gaussian distributions and some variables are categorical and modelled within components by independent multinomial distributions. The fit is by maximum likelihood estimation computed with the EM-algorithm. The number of components can be estimated by the BIC. Note that at least one categorical variable is needed, but it is possible to use data without continuous variable. } \usage{ flexmixedruns(x,diagonal=TRUE,xvarsorted=TRUE, continuous,discrete,ppdim=NULL,initial.cluster=NULL, simruns=20,n.cluster=1:20,verbose=TRUE,recode=TRUE, allout=TRUE,control=list(minprior=0.001),silent=TRUE) } %- maybe also `usage' for other objects documented here. \arguments{ \item{x}{data matrix or data frame. The data need to be organised case-wise, i.e., if there are categorical variables only, and 15 cases with values c(1,1,2) on the 3 variables, the data matrix needs 15 rows with values 1 1 2. (Categorical variables could take numbers or strings or anything that can be coerced to factor levels as values.)} \item{diagonal}{logical. If \code{TRUE}, Gaussian models are fitted restricted to diagonal covariance matrices. Otherwise, covariance matrices are unrestricted. \code{TRUE} is consistent with the "within class independence" assumption for the multinomial variables.} \item{xvarsorted}{logical. If \code{TRUE}, the continuous variables are assumed to be the first ones, and the categorical variables to be behind them.} \item{continuous}{vector of integers giving positions of the continuous variables. If \code{xvarsorted=TRUE}, a single integer, number of continuous variables.} \item{discrete}{vector of integers giving positions of the categorical variables. If \code{xvarsorted=TRUE}, a single integer, number of categorical variables.} \item{ppdim}{vector of integers specifying the number of (in the data) existing categories for each categorical variable. If \code{recode=TRUE}, this can be omitted and is computed automatically.} \item{initial.cluster}{this corresponds to the \code{cluster} parameter in \code{flexmix} and should only be specified if \code{simruns=1} and \code{n.cluster} is a single number. Either a matrix with \code{n.cluster} columns of initial cluster membership probabilities for each observation; or a factor or integer vector with the initial cluster assignments of observations at the start of the EM algorithm. Default is random assignment into \code{n.cluster} clusters.} \item{simruns}{integer. Number of starts of the EM algorithm with random initialisation in order to find a good global optimum.} \item{n.cluster}{vector of integers, numbers of components (the optimum one is found by minimising the BIC).} \item{verbose}{logical. If \code{TRUE}, some information about the different runs of the EM algorithm is given out.} \item{recode}{logical. If \code{TRUE}, the function \code{discrete.recode} is applied in order to recode categorical data so that the \code{lcmixed}-method can use it. Only set this to \code{FALSE} if your data already has that format (even it that case, \code{TRUE} doesn't do harm). If \code{recode=FALSE}, the categorical variables are assumed to be coded 1,2,3,...} \item{allout}{logical. 
If \code{TRUE}, the regular \code{flexmix}-output is given out for every single number of clusters, which can create a huge output object.} \item{control}{list of control parameters for \code{flexmix}, for details see the help page of \code{\link[flexmix]{FLXcontrol-class}}.} \item{silent}{logical. This is passed on to the \code{\link{try}}-function. If \code{FALSE}, error messages from failed runs of \code{flexmix} are suppressed. (The information that a \code{flexmix}-error occurred is still given out if \code{verbose=TRUE}).} } \details{ Sometimes flexmix produces errors because of degenerate covariance matrices, clusters that are too small, etc. \code{flexmixedruns} tolerates these and treats them as non-optimal runs. (Higher \code{simruns} or a different \code{control} may be required to get a valid solution.) General documentation on flexmix can be found in Friedrich Leisch's "FlexMix: A General Framework for Finite Mixture Models and Latent Class Regression in R", \url{https://CRAN.R-project.org/package=flexmix} } \value{ A list with components \item{optsummary}{summary object for the \code{flexmix} object with the optimal number of components.} \item{optimalk}{optimal number of components.} \item{errcount}{vector with numbers of EM runs for each number of components that led to flexmix errors.} \item{flexout}{if \code{allout=TRUE}, list of flexmix output objects for all numbers of components, for details see the help page of \code{\link[flexmix]{flexmix-class}}. Slots that can be used include for example \code{cluster} and \code{components}. So if \code{fo} is the \code{flexmixedruns}-output object, \code{fo$flexout[[fo$optimalk]]@cluster} gives a component number vector for the observations (maximum posterior rule), and \code{fo$flexout[[fo$optimalk]]@components} gives the estimated model parameters, which for \code{lcmixed} and therefore \code{flexmixedruns} are called \describe{ \item{center}{mean vector} \item{cov}{covariance matrix} \item{pp}{list of categorical variable-wise category probabilities} } If \code{allout=FALSE}, only the flexmix output object for the optimal number of components is given out; the \code{[[fo$optimalk]]} indexing above can then be omitted. } \item{bicvals}{vector of values of the BIC for each number of components.} \item{ppdim}{vector of categorical variable-wise numbers of categories.} \item{discretelevels}{list of levels of the categorical variables belonging to what is treated by \code{flexmixedruns} as category 1, 2, 3 etc.} } \references{ Hennig, C. and Liao, T. (2013) How to find an appropriate clustering for mixed-type variables with application to socio-economic stratification, \emph{Journal of the Royal Statistical Society, Series C Applied Statistics}, 62, 309-369.
} \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en}} \seealso{\code{\link{lcmixed}}, \code{\link[flexmix]{flexmix}}, \code{\link[flexmix]{FLXcontrol-class}}, \code{\link[flexmix]{flexmix-class}}, \code{\link{discrete.recode}}.} \examples{ options(digits=3) set.seed(776655) v1 <- rnorm(100) v2 <- rnorm(100) d1 <- sample(1:5,100,replace=TRUE) d2 <- sample(1:4,100,replace=TRUE) ldata <- cbind(v1,v2,d1,d2) fr <- flexmixedruns(ldata, continuous=2,discrete=2,simruns=2,n.cluster=2:3,allout=FALSE) print(fr$optimalk) print(fr$optsummary) print(fr$flexout@cluster) print(fr$flexout@components) } \keyword{cluster}% __ONLY ONE__ keyword per line fpc/man/kmeansCBI.Rd0000644000176200001440000004333613731165066013701 0ustar liggesusers\name{kmeansCBI} \alias{kmeansCBI} \alias{hclustCBI} \alias{hclusttreeCBI} \alias{disthclustCBI} \alias{disthclusttreeCBI} \alias{noisemclustCBI} \alias{distnoisemclustCBI} \alias{claraCBI} \alias{pamkCBI} %\alias{trimkmeansCBI} %\alias{disttrimkmeansCBI} \alias{dbscanCBI} \alias{mahalCBI} \alias{mergenormCBI} \alias{speccCBI} \alias{tclustCBI} \alias{pdfclustCBI} \alias{emskewCBI} \alias{stupidkcentroidsCBI} \alias{stupidknnCBI} \alias{stupidkfnCBI} \alias{stupidkavenCBI} %- Also NEED an `\alias' for EACH other topic documented here. \title{Interface functions for clustering methods} \description{ These functions provide an interface to several clustering methods implemented in R, for use together with the cluster stability assessment in \code{\link{clusterboot}} (as parameter \code{clustermethod}; "CBI" stands for "clusterboot interface"). In some situations it could make sense to use them to compute a clustering even if you don't want to run \code{clusterboot}, because some of the functions contain some additional features (e.g., normal mixture model based clustering of dissimilarity matrices projected into the Euclidean space by MDS or partitioning around medoids with estimated number of clusters, noise/outlier identification in hierarchical clustering). } \usage{ kmeansCBI(data,krange,k,scaling=FALSE,runs=1,criterion="ch",...) hclustCBI(data,k,cut="number",method,scaling=TRUE,noisecut=0,...) hclusttreeCBI(data,minlevel=2,method,scaling=TRUE,...) disthclustCBI(dmatrix,k,cut="number",method,noisecut=0,...) % disthclusttreeCBI(dmatrix,minlevel=2,method,...) noisemclustCBI(data,G,k,modelNames,nnk,hcmodel=NULL,Vinv=NULL, summary.out=FALSE,...) distnoisemclustCBI(dmatrix,G,k,modelNames,nnk, hcmodel=NULL,Vinv=NULL,mdsmethod="classical", mdsdim=4, summary.out=FALSE, points.out=FALSE,...) claraCBI(data,k,usepam=TRUE,diss=inherits(data,"dist"),...) pamkCBI(data,krange=2:10,k=NULL,criterion="asw", usepam=TRUE, scaling=FALSE,diss=inherits(data,"dist"),...) tclustCBI(data,k,trim=0.05,...) dbscanCBI(data,eps,MinPts,diss=inherits(data,"dist"),...) mahalCBI(data,clustercut=0.5,...) mergenormCBI(data, G=NULL, k=NULL, modelNames=NULL, nnk=0, hcmodel = NULL, Vinv = NULL, mergemethod="bhat", cutoff=0.1,...) speccCBI(data,k,...) pdfclustCBI(data,...) % emskewCBI(data,k,distr="mst",repeats=100,...) stupidkcentroidsCBI(dmatrix,k,distances=TRUE) stupidknnCBI(dmatrix,k) stupidkfnCBI(dmatrix,k) stupidkavenCBI(dmatrix,k) } \arguments{ \item{data}{a numeric matrix. The data matrix - usually a cases*variables-data matrix. 
\code{claraCBI}, \code{pamkCBI} and \code{dbscanCBI} work with an \code{n*n}-dissimilarity matrix as well, see parameter \code{diss}.} \item{dmatrix}{a squared numerical dissimilarity matrix or a \code{dist}-object.} \item{k}{numeric, usually integer. In most cases, this is the number of clusters for methods where this is fixed. For \code{hclustCBI} and \code{disthclustCBI} see parameter \code{cut} below. Some methods have a \code{k} parameter on top of a \code{G} or \code{krange} parameter for compatibility; \code{k} in these cases does not have to be specified but if it is, it is always a single number of clusters and overwrites \code{G} and \code{krange}.} \item{scaling}{either a logical value or a numeric vector of length equal to the number of variables. If \code{scaling} is a numeric vector with length equal to the number of variables, then each variable is divided by the corresponding value from \code{scaling}. If \code{scaling} is \code{TRUE} then scaling is done by dividing the (centered) variables by their root-mean-square, and if \code{scaling} is \code{FALSE}, no scaling is done before execution.} \item{runs}{integer. Number of random initializations from which the k-means algorithm is started.} \item{criterion}{\code{"ch"} or \code{"asw"}. Decides whether number of clusters is estimated by the Calinski-Harabasz criterion or by the average silhouette width.} \item{cut}{either "level" or "number". This determines how \code{cutree} is used to obtain a partition from a hierarchy tree. \code{cut="level"} means that the tree is cut at a particular dissimilarity level, \code{cut="number"} means that the tree is cut in order to obtain a fixed number of clusters. The parameter \code{k} specifies the number of clusters or the dissimilarity level, depending on \code{cut}.} \item{method}{method for hierarchical clustering, see the documentation of \code{\link{hclust}}.} \item{noisecut}{numeric. All clusters of size \code{<=noisecut} in the \code{disthclustCBI}/\code{hclustCBI}-partition are joined and declared as noise/outliers.} \item{minlevel}{integer. \code{minlevel=1} means that all clusters in the tree are given out by \code{hclusttreeCBI} or \code{disthclusttreeCBI}, including one-point clusters (but excluding the cluster with all points). \code{minlevel=2} excludes the one-point clusters. \code{minlevel=3} excludes the two-point cluster which has been merged first, and increasing the value of \code{minlevel} by 1 in all further steps means that the remaining earliest formed cluster is excluded.} \item{G}{vector of integers. Number of clusters or numbers of clusters used by \code{\link[mclust]{mclustBIC}}. If \code{G} has more than one entry, the number of clusters is estimated by the BIC.} \item{modelNames}{vector of string. Models for covariance matrices, see documentation of \code{\link[mclust]{mclustBIC}}.} \item{nnk}{integer. Tuning constant for \code{\link[prabclus]{NNclean}}, which is used to estimate the initial noise for \code{noisemclustCBI} and \code{distnoisemclustCBI}. See parameter \code{k} in the documentation of \code{\link[prabclus]{NNclean}}. \code{nnk=0} means that no noise component is fitted.} \item{hcmodel}{string or \code{NULL}. Determines the initialization of the EM-algorithm for \code{\link[mclust]{mclustBIC}}. Documented in \code{\link[mclust]{hc}}.} \item{Vinv}{numeric. See documentation of \code{\link[mclust]{mclustBIC}}.} \item{summary.out}{logical. 
If \code{TRUE}, the result of \code{\link[mclust]{summary.mclustBIC}} is added as component \code{mclustsummary} to the output of \code{noisemclustCBI} and \code{distnoisemclustCBI}.} \item{mdsmethod}{"classical", "kruskal" or "sammon". Determines the multidimensional scaling method to compute Euclidean data from a dissimilarity matrix. See \code{\link{cmdscale}}, \code{\link{isoMDS}} and \code{\link{sammon}}.} \item{mdsdim}{integer. Dimensionality of MDS solution.} \item{points.out}{logical. If \code{TRUE}, the matrix of MDS points is added as component \code{points} to the output of \code{noisemclustCBI}.} \item{usepam}{logical. If \code{TRUE}, the function \code{\link[cluster]{pam}} is used for clustering, otherwise \code{\link[cluster]{clara}}. \code{\link{pam}} is better, \code{\link[cluster]{clara}} is faster.} \item{diss}{logical. If \code{TRUE}, \code{data} will be considered as a dissimilarity matrix. In \code{claraCBI}, this requires \code{usepam=TRUE}.} \item{krange}{vector of integers. Numbers of clusters to be compared.} \item{trim}{numeric between 0 and 1. Proportion of data points trimmed, i.e., assigned to noise. See \code{tclust} in the tclust package.} \item{eps}{numeric. The radius of the neighborhoods to be considered by \code{\link{dbscan}}.} \item{MinPts}{integer. How many points have to be in a neighborhood so that a point is considered to be a cluster seed? See documentation of \code{\link{dbscan}}.} \item{clustercut}{numeric between 0 and 1. If \code{\link{fixmahal}} is used for fuzzy clustering, a crisp partition is generated and points with cluster membership values above \code{clustercut} are considered as members of the corresponding cluster.} \item{mergemethod}{method for merging Gaussians, passed on as \code{method} to \code{\link{mergenormals}}.} \item{cutoff}{numeric between 0 and 1, tuning constant for \code{\link{mergenormals}}.} % \item{distr}{one of \code{"mvn", "mvt", "msn", "mst"}. Defines the % family of mixtures (multivariate normal, multivariate t, % multivariate skew normal, or multivariate skew t). See % \code{\link[EMMIXskew]{EmSkew}}.}, % \item{repeats}{integer. In case that \code{\link[EMMIXskew]{EmSkew}} % doesn't give a solution, how often should execution be repeated with % new random initialisations? (\code{EmSkew}'s own \code{nrandom} % doesn't help if an initialisation leads to a \code{NULL} output.)} \item{distances}{logical (only for \code{stupidkcentroidsCBI}). If \code{FALSE}, \code{dmatrix} is interpreted as cases&variables data matrix.} \item{...}{further parameters to be transferred to the original clustering functions (not required).} } \details{ All these functions call clustering methods implemented in R to cluster data and to provide output in the format required by \code{\link{clusterboot}}. Here is a brief overview. For further details see the help pages of the involved clustering methods. \describe{ \item{kmeansCBI}{an interface to the function \code{\link{kmeansruns}} calling \code{\link{kmeans}} for k-means clustering. (\code{\link{kmeansruns}} allows the specification of several random initializations of the k-means algorithm and estimation of k by the Calinski-Harabasz index or the average silhouette width.)} \item{hclustCBI}{an interface to the function \code{\link{hclust}} for agglomerative hierarchical clustering with noise component (see parameter \code{noisecut} above). 
This function produces a partition and assumes a cases*variables matrix as input.} \item{hclusttreeCBI}{an interface to the function \code{hclust} for agglomerative hierarchical clustering. This function gives out all clusters belonging to the hierarchy (upward from a certain level, see parameter \code{minlevel} above).} \item{disthclustCBI}{an interface to the function \code{hclust} for agglomerative hierarchical clustering with noise component (see parameter \code{noisecut} above). This function produces a partition and assumes a dissimilarity matrix as input.} % \item{disthclusttreeCBI}{an interface to the function % \code{hclust} for agglomerative hierarchical clustering. This % function gives out all clusters belonging to the hierarchy % (upward from a certain level, see parameter \code{minlevel} % above), and assumes a dissimilarity matrix as input.} \item{noisemclustCBI}{an interface to the function \code{\link[mclust]{mclustBIC}}, for normal mixture model based clustering. Warning: \code{\link[mclust]{mclustBIC}} often has problems with multiple points. In \code{\link{clusterboot}}, it is recommended to use this together with \code{multipleboot=FALSE}.} \item{distnoisemclustCBI}{an interface to the function \code{\link[mclust]{mclustBIC}} for normal mixture model based clustering. This assumes a dissimilarity matrix as input and generates a data matrix by multidimensional scaling first. Warning: \code{\link[mclust]{mclustBIC}} often has problems with multiple points. In \code{\link{clusterboot}}, it is recommended to use this together with \code{multipleboot=FALSE}.} \item{claraCBI}{an interface to the functions \code{\link[cluster]{pam}} and \code{\link[cluster]{clara}} for partitioning around medoids.} \item{pamkCBI}{an interface to the function \code{\link{pamk}} calling \code{\link[cluster]{pam}} for partitioning around medoids. The number of clusters is estimated by the Calinski-Harabasz index or by the average silhouette width.} \item{tclustCBI}{an interface to the function \code{tclust} in the tclust package for trimmed Gaussian clustering. This assumes a cases*variables matrix as input.} % % NOTE: This package is currently only available in CRAN as % archived version. Therefore I cannot currently offer the % \code{tclustCBI}-function in \code{fpc}. The code for the % function is below in the Examples-Section, so if you need it, % run that code first.} % \item{disttrimkmeansCBI}{an interface to the function % \code{\link[trimcluster]{trimkmeans}} for trimmed k-means % clustering. This assumes a dissimilarity matrix as input and % generates a data matrix by multidimensional scaling first.} \item{dbscanCBI}{an interface to the function \code{\link{dbscan}} for density based clustering.} \item{mahalCBI}{an interface to the function \code{\link{fixmahal}} for fixed point clustering. This assumes a cases*variables matrix as input.} \item{mergenormCBI}{an interface to the function \code{\link{mergenormals}} for clustering by merging Gaussian mixture components. Unlike \code{\link{mergenormals}}, \code{mergenormCBI} includes the computation of the initial Gaussian mixture. This assumes a cases*variables matrix as input. } \item{speccCBI}{an interface to the function \code{\link[kernlab]{specc}} for spectral clustering. See the \code{\link[kernlab]{specc}} help page for additional tuning parameters. This assumes a cases*variables matrix as input.} \item{pdfclustCBI}{an interface to the function \code{\link[pdfCluster]{pdfCluster}} for density-based clustering. 
See the \code{\link[pdfCluster]{pdfCluster}} help page for additional tuning parameters. This assumes a cases*variables matrix as input.} % \item{emskewCBI}{an interface to the function % \code{\link[EMMIXskew]{EmSkew}} for clustering with the % EM-algorithm based on Gaussian, skew Gaussian, t or skew-t % mixtures. See % help page of \code{\link[EMMIXskew]{EmSkew}}. This assumes a % cases*variables matrix as input. Note that by September 2020, % package \code{EMMIXskew} is not available on CRAN but only % in the CRAN archives; CRAN states that it needs an update.} \item{stupidkcentroidsCBI}{an interface to the function \code{stupidkcentroids} for random centroid-based clustering. See the \code{\link{stupidkcentroids}} help page. This can have a distance matrix as well as a cases*variables matrix as input, see parameter \code{distances}.} \item{stupidknnCBI}{an interface to the function \code{stupidknn} for random nearest neighbour clustering. See the \code{\link{stupidknn}} help page. This assumes a distance matrix as input.} \item{stupidkfnCBI}{an interface to the function \code{stupidkfn} for random farthest neighbour clustering. See the \code{\link{stupidkfn}} help page. This assumes a distance matrix as input.} \item{stupidkavenCBI}{an interface to the function \code{stupidkaven} for random average dissimilarity clustering. See the \code{\link{stupidkaven}} help page. This assumes a distance matrix as input.} } } \value{ All interface functions return a list with the following components (there may be some more, see \code{summary.out} and \code{points.out} above): \item{result}{clustering result, usually a list with the full output of the clustering method (the precise format doesn't matter); whatever you want to use later.} \item{nc}{number of clusters. If some points don't belong to any cluster, these are declared "noise". \code{nc} includes the "noise cluster", and there should be another component \code{nccl}, being the number of clusters not including the noise cluster.} \item{clusterlist}{this is a list consisting of a logical vectors of length of the number of data points (\code{n}) for each cluster, indicating whether a point is a member of this cluster (\code{TRUE}) or not. If a noise cluster is included, it should always be the last vector in this list.} \item{partition}{an integer vector of length \code{n}, partitioning the data. If the method produces a partition, it should be the clustering. This component is only used for plots, so you could do something like \code{rep(1,n)} for non-partitioning methods. If a noise cluster is included, \code{nc=nccl+1} and the noise cluster is cluster no. \code{nc}.} \item{clustermethod}{a string indicating the clustering method.} The output of some of the functions has further components: \item{nccl}{see \code{nc} above.} \item{nnk}{by \code{noisemclustCBI} and \code{distnoisemclustCBI}, see above.} \item{initnoise}{logical vector, indicating initially estimated noise by \code{\link[prabclus]{NNclean}}, called by \code{noisemclustCBI} and \code{distnoisemclustCBI}.} \item{noise}{logical. 
\code{TRUE} if points were classified as noise/outliers by \code{disthclustCBI}.} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{clusterboot}}, \code{\link{dist}}, \code{\link{kmeans}}, \code{\link{kmeansruns}}, \code{\link{hclust}}, \code{\link[mclust]{mclustBIC}}, \code{\link[cluster]{pam}}, \code{\link{pamk}}, \code{\link[cluster]{clara}}, \code{\link{dbscan}}, \code{\link{fixmahal}}, \code{\link[tclust]{tclust}}, \code{\link[pdfCluster]{pdfCluster}} % \code{\link[EMMIXskew]{EmSkew}} } \examples{ options(digits=3) set.seed(20000) face <- rFace(50,dMoNo=2,dNoEy=0,p=2) dbs <- dbscanCBI(face,eps=1.5,MinPts=4) dhc <- disthclustCBI(dist(face),method="average",k=1.5,noisecut=2) table(dbs$partition,dhc$partition) dm <- mergenormCBI(face,G=10,modelNames="EEE",nnk=2) dtc <- tclustCBI(face,6,trim=0.1,restr.fact=500) table(dm$partition,dtc$partition) } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate}fpc/man/mixpredictive.Rd0000644000176200001440000000544413467541512014757 0ustar liggesusers\name{mixpredictive} \alias{mixpredictive} %- Also NEED an `\alias' for EACH other topic documented here. \title{Prediction strength of merged Gaussian mixture} \description{ Computes the prediction strength of clustering by merging Gaussian mixture components, see \code{\link{mergenormals}}. The predictive strength is defined according to Tibshirani and Walther (2005), carried out as described in Hennig (2010), see details. } \usage{ mixpredictive(xdata, Gcomp, Gmix, M=50, ...) } %- maybe also `usage' for other objects documented here. \arguments{ \item{xdata}{data (something that can be coerced into a matrix).} \item{Gcomp}{integer. Number of components of the underlying Gaussian mixture.} \item{Gmix}{integer. Number of clusters after merging Gaussian components.} \item{M}{integer. Number of times the dataset is divided into two halves.} \item{...}{further arguments that can potentially arrive in calls but are currently not used.} } \value{ List with components \item{predcorr}{vector of length \code{M} with relative frequencies of correct predictions (clusterwise minimum).} \item{mean.pred}{mean of \code{predcorr}.} } \details{ The prediction strength for a certain number of clusters \code{Gmix} under a random partition of the dataset in halves A and B is defined as follows. Both halves are clustered with \code{Gmix} clusters. Then the points of A are classified to the clusters of B. This is done by use of the maximum a posteriori rule for mixtures as in Hennig (2010), differently from Tibshirani and Walther (2005). A pair of points A in the same A-cluster is defined to be correctly predicted if both points are classified into the same cluster on B. The same is done with the points of B relative to the clustering on A. The prediction strength for each of the clusterings is the minimum (taken over all clusters) relative frequency of correctly predicted pairs of points of that cluster. The final mean prediction strength statistic is the mean over all 2M clusterings. } \references{ Hennig, C. (2010) Methods for merging Gaussian mixture components, \emph{Advances in Data Analysis and Classification}, 4, 3-34. Tibshirani, R. and Walther, G. (2005) Cluster Validation by Prediction Strength, \emph{Journal of Computational and Graphical Statistics}, 14, 511-528. 
} \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{prediction.strength}} for Tibshirani and Walther's original method. \code{\link{mergenormals}} for the clustering method applied here. } \examples{ set.seed(98765) iriss <- iris[sample(150,20),-5] mp <- mixpredictive(iriss,2,2,M=2) } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/ancoord.Rd0000644000176200001440000000634613467541512013532 0ustar liggesusers\name{ancoord} \alias{ancoord} %- Also NEED an `\alias' for EACH other topic documented here. \title{Asymmetric neighborhood based discriminant coordinates} \description{ Asymmetric neighborhood based discriminant coordinates as defined in Hennig (2003). Asymmetric discriminant projection means that there are two classes, one of which is treated as the homogeneous class (i.e., it should appear homogeneous and separated in the resulting projection) while the other may be heterogeneous. The principle is to maximize the ratio between the projection of a between classes covariance matrix, which is defined by averaging the between classes covariance matrices in the neighborhoods of the points of the homogeneous class and the projection of the covariance matrix within the homogeneous class. } \usage{ ancoord(xd, clvecd, clnum=1, nn=50, method="mcd", countmode=1000, ...) } %- maybe also `usage' for other objects documented here. \arguments{ \item{xd}{the data matrix; a numerical object which can be coerced to a matrix.} \item{clvecd}{integer vector of class numbers; length must equal \code{nrow(xd)}.} \item{clnum}{integer. Number of the homogeneous class.} \item{nn}{integer. Number of points which belong to the neighborhood of each point (including the point itself).} \item{method}{one of "mve", "mcd" or "classical". Covariance matrix used within the homogeneous class. "mcd" and "mve" are robust covariance matrices as implemented in \code{\link{cov.rob}}. "classical" refers to the classical covariance matrix.} \item{countmode}{optional positive integer. Every \code{countmode} algorithm runs \code{ancoord} shows a message.} \item{...}{no effect} } \details{ The square root of the homogeneous classes covariance matrix is inverted by use of \code{\link{tdecomp}}, which can be expected to give reasonable results for singular within-class covariance matrices. } % \details{ % } \value{ List with the following components \item{ev}{eigenvalues in descending order.} \item{units}{columns are coordinates of projection basis vectors. New points \code{x} can be projected onto the projection basis vectors by \code{x \%*\% units}} \item{proj}{projections of \code{xd} onto \code{units}.} } \references{ Hennig, C. (2004) Asymmetric linear dimension reduction for classification. Journal of Computational and Graphical Statistics 13, 930-945 . Hennig, C. (2005) A method for visual cluster validation. In: Weihs, C. and Gaul, W. (eds.): Classification - The Ubiquitous Challenge. Springer, Heidelberg 2005, 153-160. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{plotcluster}} for straight forward discriminant plots. \code{\link{discrproj}} for alternatives. \code{\link{rFace}} for generation of the example data used below. 
} \examples{ set.seed(4634) face <- rFace(600,dMoNo=2,dNoEy=0) grface <- as.integer(attr(face,"grouping")) ancf2 <- ancoord(face,grface==4) plot(ancf2$proj,col=1+(grface==4)) # ...done in one step by function plotcluster. } \keyword{multivariate}% at least one, from doc/KEYWORDS \keyword{classif}% __ONLY ONE__ keyword per line fpc/man/bhattacharyya.matrix.Rd0000644000176200001440000000423513470374260016225 0ustar liggesusers\name{bhattacharyya.matrix} \alias{bhattacharyya.matrix} %- Also NEED an `\alias' for EACH other topic documented here. \title{Matrix of pairwise Bhattacharyya distances} \description{ Computes Bhattachryya distances for pairs of components given the parameters of a Gaussian mixture. } \usage{ bhattacharyya.matrix(muarray,Sigmaarray,ipairs="all", misclassification.bound=TRUE) } %- maybe also `usage' for other objects documented here. \arguments{ \item{muarray}{matrix of component means (different components are in different columns).} \item{Sigmaarray}{three dimensional array with component covariance matrices (the third dimension refers to components).} \item{ipairs}{\code{"all"} or list of vectors of two integers. If \code{ipairs="all"}, computations are carried out for all pairs of components. Otherwise, ipairs gives the pairs of components for which computations are carried out.} \item{misclassification.bound}{logical. If \code{TRUE}, upper bounds for misclassification probabilities \code{exp(-b)} are given out instead of the original Bhattacharyya distances \code{b}.} } \value{ A matrix with Bhattacharyya distances (or derived misclassification bounds, see above) between pairs of Gaussian distributions with the provided parameters. If \code{ipairs!="all"}, the Bhattacharyya distance and the misclassification bound are given as \code{NA} for pairs not included in \code{ipairs}. } \references{ Fukunaga, K. (1990) \emph{Introduction to Statistical Pattern Recognition}, 2nd edition, Academic Press, New York. Hennig, C. (2010) Methods for merging Gaussian mixture components, \emph{Advances in Data Analysis and Classification}, 4, 3-34. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{bhattacharyya.dist}} } \examples{ muarray <-cbind(c(0,0),c(0,0.1),c(10,10)) sigmaarray <- array(c(diag(2),diag(2),diag(2)),dim=c(2,2,3)) bhattacharyya.matrix(muarray,sigmaarray,ipairs=list(c(1,2),c(2,3))) } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/distancefactor.Rd0000644000176200001440000000574013470377022015070 0ustar liggesusers\name{distancefactor} \alias{distancefactor} %- Also NEED an `\alias' for EACH other topic documented here. \title{Factor for dissimilarity of mixed type data} \description{ Computes a factor that can be used to standardise ordinal categorical variables and binary dummy variables coding categories of nominal scaled variables for Euclidean dissimilarity computation in mixed type data. See Hennig and Liao (2013). } \usage{ distancefactor(cat,n=NULL, catsizes=NULL,type="categorical", normfactor=2,qfactor=ifelse(type=="categorical",1/2, 1/(1+1/(cat-1)))) } %- maybe also `usage' for other objects documented here. \arguments{ \item{cat}{integer. Number of categories of the variable to be standardised. Note that for \code{type="categorical"} the number of categories of the original variable is required, although the \code{distancefactor} is used to standardise dummy variables for the categories.} \item{n}{integer. 
Number of data points.} \item{catsizes}{vector of integers giving numbers of observations per category. One of \code{n} and \code{catsizes} must be supplied. If \code{catsizes=NULL}, \code{rep(round(n/cat),cat)} is used (this may be appropriate even if the numbers of observations per category are unequal, namely when the researcher decides that the dissimilarity measure should not be influenced by empirical category sizes).} \item{type}{\code{"categorical"} if the factor is used for dummy variables belonging to a nominal variable, \code{"ordinal"} if the factor is used for an ordinal variable in standard Likert coding.} \item{normfactor}{numeric. Factor on which standardisation is based. As a default, this is \code{E(X_1-X_2)^2=2} for independent unit variance variables.} \item{qfactor}{numeric. Factor q in Hennig and Liao (2013) to adjust for clumping effects due to discreteness.} } \value{ A factor by which to multiply the variable in order to make it comparable to a unit variance continuous variable when aggregated in Euclidean fashion for dissimilarity computation, so that the expected effective difference between two realisations of the variable equals \code{qfactor*normfactor}. } \references{ Hennig, C. and Liao, T. (2013) How to find an appropriate clustering for mixed-type variables with application to socio-economic stratification, \emph{Journal of the Royal Statistical Society, Series C Applied Statistics}, 62, 309-369. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en}} \seealso{\code{\link{lcmixed}}, \code{\link[cluster]{pam}}} \examples{ set.seed(776655) d1 <- sample(1:5,20,replace=TRUE) d2 <- sample(1:4,20,replace=TRUE) ldata <- cbind(d1,d2) lc <- cat2bin(ldata,categorical=1)$data lc[,1:5] <- lc[,1:5]*distancefactor(5,20,type="categorical") lc[,6] <- lc[,6]*distancefactor(4,20,type="ordinal") } \keyword{cluster}% __ONLY ONE__ keyword per line fpc/man/randomclustersim.Rd0000644000176200001440000001540414536666067015471 0ustar liggesusers\name{randomclustersim} \alias{randomclustersim} %- Also NEED an `\alias' for EACH other topic documented here. \title{Simulation of validity indexes based on random clusterings} \description{ For a given dataset this simulates random clusterings using \code{\link{stupidkcentroids}}, \code{\link{stupidknn}}, \code{\link{stupidkfn}}, and \code{\link{stupidkaven}}. It then computes and stores a set of cluster validity indexes for every clustering. } \usage{ randomclustersim(datadist,datanp=NULL,npstats=FALSE,useboot=FALSE, bootmethod="nselectboot", bootruns=25, G,nnruns=100,kmruns=100,fnruns=100,avenruns=100, nnk=4,dnnk=2, pamcrit=TRUE, multicore=FALSE,cores=detectCores()-1,monitor=TRUE) } %- maybe also `usage' for other objects documented here. \arguments{ \item{datadist}{distances on which validation-measures are based, \code{dist} object or distance matrix.} \item{datanp}{optional observations times variables data matrix, see \code{npstats}.} \item{npstats}{logical. If \code{TRUE}, \code{\link{distrsimilarity}} is called and the two statistics computed there are added to the output. These are based on \code{datanp} and require \code{datanp} to be specified.} \item{useboot}{logical. If \code{TRUE}, a stability index (either \code{nselectboot} or \code{prediction.strength}) will be involved.} \item{bootmethod}{either \code{"nselectboot"} or \code{"prediction.strength"}; stability index to be used if \code{useboot=TRUE}.} \item{bootruns}{integer. Number of resampling runs.
If \code{useboot=TRUE}, passed on as \code{B} to \code{nselectboot} or \code{M} to \code{prediction.strength}.} \item{G}{vector of integers. Numbers of clusters to consider.} \item{nnruns}{integer. Number of runs of \code{\link{stupidknn}}.} \item{kmruns}{integer. Number of runs of \code{\link{stupidkcentroids}}.} \item{fnruns}{integer. Number of runs of \code{\link{stupidkfn}}.} \item{avenruns}{integer. Number of runs of \code{\link{stupidkaven}}.} \item{nnk}{\code{nnk}-argument to be passed on to \code{\link{cqcluster.stats}}.} \item{dnnk}{\code{nnk}-argument to be passed on to \code{\link{distrsimilarity}}.} \item{pamcrit}{\code{pamcrit}-argument to be passed on to \code{\link{cqcluster.stats}}.} \item{multicore}{logical. If \code{TRUE}, parallel computing is used through the function \code{\link{mclapply}} from package \code{parallel}; read warnings there if you intend to use this; it won't work on Windows.} \item{cores}{integer. Number of cores for parallelisation.} \item{monitor}{logical. If \code{TRUE}, it will print some runtime information.} } % \details{ % } \value{ List with components \item{nn}{list, indexed by number of clusters. Every entry is a data frame with \code{nnruns} observations for every simulation run of \code{\link{stupidknn}}. The variables of the data frame are \code{avewithin, mnnd, cvnnd, maxdiameter, widestgap, sindex, minsep, asw, dindex, denscut, highdgap, pearsongamma, withinss, entropy}, if \code{pamcrit=TRUE} also \code{pamc}, if \code{npstats=TRUE} also \code{kdnorm, kdunif}. All these are cluster validation indexes; documented as values of \code{\link{clustatsum}}.} \item{fn}{list, indexed by number of clusters. Every entry is a data frame with \code{fnruns} observations for every simulation run of \code{\link{stupidkfn}}. The variables of the data frame are \code{avewithin, mnnd, cvnnd, maxdiameter, widestgap, sindex, minsep, asw, dindex, denscut, highdgap, pearsongamma, withinss, entropy}, if \code{pamcrit=TRUE} also \code{pamc}, if \code{npstats=TRUE} also \code{kdnorm, kdunif}. All these are cluster validation indexes; documented as values of \code{\link{clustatsum}}.} \item{aven}{list, indexed by number of clusters. Every entry is a data frame with \code{avenruns} observations for every simulation run of \code{\link{stupidkaven}}. The variables of the data frame are \code{avewithin, mnnd, cvnnd, maxdiameter, widestgap, sindex, minsep, asw, dindex, denscut, highdgap, pearsongamma, withinss, entropy}, if \code{pamcrit=TRUE} also \code{pamc}, if \code{npstats=TRUE} also \code{kdnorm, kdunif}. All these are cluster validation indexes; documented as values of \code{\link{clustatsum}}.} \item{km}{list, indexed by number of clusters. Every entry is a data frame with \code{kmruns} observations for every simulation run of \code{\link{stupidkcentroids}}. The variables of the data frame are \code{avewithin, mnnd, cvnnd, maxdiameter, widestgap, sindex, minsep, asw, dindex, denscut, highdgap, pearsongamma, withinss, entropy}, if \code{pamcrit=TRUE} also \code{pamc}, if \code{npstats=TRUE} also \code{kdnorm, kdunif}. 
All these are cluster validation indexes; documented as values of \code{\link{clustatsum}}.} \item{nnruns}{number of involved runs of \code{\link{stupidknn}},} \item{fnruns}{number of involved runs of \code{\link{stupidkfn}},} \item{avenruns}{number of involved runs of \code{\link{stupidkaven}},} \item{kmruns}{number of involved runs of \code{\link{stupidkcentroids}},} \item{boot}{if \code{useboot=TRUE}, stability value; \code{stabk} for method \code{\link{nselectboot}}; \code{mean.pred} for method \code{\link{prediction.strength}}.} } \references{ Hennig, C. (2019) Cluster validation by measurement of clustering characteristics relevant to the user. In C. H. Skiadas (ed.) \emph{Data Analysis and Applications 1: Clustering and Regression, Modeling-estimating, Forecasting and Data Mining, Volume 2}, Wiley, New York 1-24, \url{https://arxiv.org/abs/1703.09282} Akhanli, S. and Hennig, C. (2020) Calibrating and aggregating cluster validity indexes for context-adapted comparison of clusterings. \emph{Statistics and Computing}, 30, 1523-1544, \url{https://link.springer.com/article/10.1007/s11222-020-09958-2}, \url{https://arxiv.org/abs/2002.01822} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{stupidkcentroids}}, \code{\link{stupidknn}}, \code{\link{stupidkfn}}, \code{\link{stupidkaven}}, \code{\link{clustatsum}} } \examples{ set.seed(20000) options(digits=3) face <- rFace(10,dMoNo=2,dNoEy=0,p=2) rmx <- randomclustersim(dist(face),datanp=face,npstats=TRUE,G=2:3, nnruns=2,kmruns=2, fnruns=1,avenruns=1,nnk=2) \dontrun{ rmx$km # Produces slightly different but basically identical results on ATLAS } rmx$aven rmx$fn rmx$nn } \keyword{multivariate}% at least one, from doc/KEYWORDS \keyword{cluster}% __ONLY ONE__ keyword per line fpc/man/cqcluster.stats.Rd0000644000176200001440000004626113731163227015244 0ustar liggesusers\name{cqcluster.stats} \alias{cqcluster.stats} \alias{summary.cquality} \alias{print.summary.cquality} %- Also NEED an `\alias' for EACH other topic documented here. \title{Cluster validation statistics (version for use with clusterbenchstats} \description{ This is a more sophisticated version of \code{\link{cluster.stats}} for use with \code{\link{clusterbenchstats}}, see Hennig (2017). Computes a number of distance-based statistics, which can be used for cluster validation, comparison between clusterings and decision about the number of clusters: cluster sizes, cluster diameters, average distances within and between clusters, cluster separation, biggest within cluster gap, average silhouette widths, the Calinski and Harabasz index, a Pearson version of Hubert's gamma coefficient, the Dunn index, further statistics introduced in Hennig (2017) and two indexes to assess the similarity of two clusterings, namely the corrected Rand index and Meila's VI. } \usage{ cqcluster.stats(d = NULL, clustering, alt.clustering = NULL, noisecluster = FALSE, silhouette = TRUE, G2 = FALSE, G3 = FALSE, wgap = TRUE, sepindex = TRUE, sepprob = 0.1, sepwithnoise = TRUE, compareonly = FALSE, aggregateonly = FALSE, averagegap=FALSE, pamcrit=TRUE, dquantile=0.1, nndist=TRUE, nnk=2, standardisation="max", sepall=TRUE, maxk=10, cvstan=sqrt(length(clustering))) \method{summary}{cquality}(object,stanbound=TRUE,largeisgood=TRUE, ...) \method{print}{summary.cquality}(x, ...) } %- maybe also `usage' for other objects documented here. 
\arguments{ \item{d}{a distance object (as generated by \code{dist}) or a distance matrix between cases.} \item{clustering}{an integer vector of length of the number of cases, which indicates a clustering. The clusters have to be numbered from 1 to the number of clusters.} \item{alt.clustering}{an integer vector such as for \code{clustering}, indicating an alternative clustering. If provided, the corrected Rand index and Meila's VI for \code{clustering} vs. \code{alt.clustering} are computed.} \item{noisecluster}{logical. If \code{TRUE}, it is assumed that the largest cluster number in \code{clustering} denotes a 'noise class', i.e. points that do not belong to any cluster. These points are not taken into account for the computation of all functions of within and between cluster distances including the validation indexes.} \item{silhouette}{logical. If \code{TRUE}, the silhouette statistics are computed, which requires package \code{cluster}.} \item{G2}{logical. If \code{TRUE}, Goodman and Kruskal's index G2 (cf. Gordon (1999), p. 62) is computed. This executes lots of sorting algorithms and can be very slow (it has been improved by R. Francois - thanks!)} \item{G3}{logical. If \code{TRUE}, the index G3 (cf. Gordon (1999), p. 62) is computed. This executes \code{sort} on all distances and can be extremely slow.} \item{wgap}{logical. If \code{TRUE}, the widest within-cluster gaps (largest link in within-cluster minimum spanning tree) are computed. This is used for finding a good number of clusters in Hennig (2013). See also parameter \code{averagegap}.} \item{sepindex}{logical. If \code{TRUE}, a separation index is computed, defined based on the distances for every point to the closest point not in the same cluster. The separation index is then the mean of the smallest proportion \code{sepprob} of these. This allows to formalise separation less sensitive to a single or a few ambiguous points. The output component corresponding to this is \code{sindex}, not \code{separation}! This is used for finding a good number of clusters in Hennig (2013). See also parameter \code{sepall}.} \item{sepprob}{numerical between 0 and 1, see \code{sepindex}.} \item{sepwithnoise}{logical. If \code{TRUE} and \code{sepindex} and \code{noisecluster} are both \code{TRUE}, the noise points are incorporated as cluster in the separation index (\code{sepindex}) computation. Also they are taken into account for the computation for the minimum cluster separation.} \item{compareonly}{logical. If \code{TRUE}, only the corrected Rand index and Meila's VI are computed and given out (this requires \code{alt.clustering} to be specified).} \item{aggregateonly}{logical. If \code{TRUE} (and not \code{compareonly}), no clusterwise but only aggregated information is given out (this cuts the size of the output down a bit).} \item{averagegap}{logical. If \code{TRUE}, the average of the widest within-cluster gaps over all clusters is given out; if \code{FALSE}, the maximum is given out.} \item{pamcrit}{logical. If \code{TRUE}, the average distance of points to their respective cluster centroids is computed (criterion of the PAM clustering method); centroids are chosen so that they minimise this criterion for the given clustering.} \item{dquantile}{numerical between 0 and 1; quantile used for kernel density estimator for density indexes, see Hennig (2019), Sec. 3.6.} \item{nndist}{logical. If \code{TRUE}, average distance to \code{nnk}th nearest neighbour within cluster is computed.} \item{nnk}{integer. 
Number of neighbours used in average and coefficient of variation of distance to nearest within cluster neighbour (clusters with \code{nnk} or fewer points are ignored for this).} \item{standardisation}{\code{"none"}, \code{"max"}, \code{"ave"}, \code{"q90"}, or a number. See details.} \item{sepall}{logical. If \code{TRUE}, a fraction of smallest \code{sepprob} distances to other clusters is used from every cluster. Otherwise, a fraction of smallest \code{sepprob} distances overall is used in the computation of \code{sindex}.} \item{maxk}{numeric. Parsimony is defined as the number of clusters divided by \code{maxk}.} \item{cvstan}{numeric. \code{cvnnd} is standardised by \code{cvstan} if there is standardisation, see Details.} \item{object}{object of class \code{cquality}, output of \code{cqcluster.stats}.} \item{x}{object of class \code{cquality}, output of \code{cqcluster.stats}.} \item{stanbound}{logical. If \code{TRUE}, all index values larger than 1 will be set to 1, and all values smaller than 0 will be set to 0. This is for preparation in case of \code{largeisgood=TRUE} (if values are already suitably standardised within \code{cqcluster.stats}, it won't do harm and can do good).} \item{largeisgood}{logical. If \code{TRUE}, indexes \code{x} are transformed to \code{1-x} in case that before transformation smaller values indicate a better clustering (that's \code{average.within, mnnd, widestgap, within.cluster.ss, dindex, denscut, pamc, max.diameter, highdgap, cvnnd}. For this to make sense, \code{cqcluster.stats} should be run with \code{standardisation="max"} and \code{summary.cquality} with \code{stanbound=TRUE}.} \item{...}{no effect.} } \details{ The \code{standardisation}-parameter governs the standardisation of the index values. \code{standardisation="none"} means that unstandardised raw values of indexes are given out. Otherwise, \code{entropy} will be standardised by the maximum possible value for the given number of clusters; \code{within.cluster.ss} and \code{between.cluster.ss} will be standardised by the overall sum of squares; \code{mnnd} will be standardised by the maximum distance to the \code{nnk}th nearest neighbour within cluster; \code{pearsongamma} will be standardised by adding 1 and dividing by 2; \code{cvnn} will be standardised by \code{cvstan} (the default is the possible maximum). \code{standardisation} allows options for the standardisation of \code{average.within, sindex, wgap, pamcrit, max.diameter, min.separation} and can be \code{"max"} (maximum distance), \code{"ave"} (average distance), \code{q90} (0.9-quantile of distances), or a positive number. \code{"max"} is the default and standardises all the listed indexes into the range [0,1].} \note{ Because \code{cqcluster.stats} processes a full dissimilarity matrix, it isn't suitable for large data sets. You may consider \code{\link{distcritmulti}} in that case. 
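If the data set is too large for that, a rough subsampling workaround is to compute the statistics on a random subset of the cases, e.g., \code{i <- sample(nrow(x),1000); cqcluster.stats(dist(x[i,]),clustering[i])} (a minimal sketch assuming a cases*variables matrix \code{x} and a clustering vector \code{clustering} for the full data set; all clusters need to be represented in the subsample).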
} \value{ \code{cqcluster.stats} with \code{compareonly=FALSE} and \code{aggregateonly=FALSE} returns a list of type \code{cquality} containing the components \code{n, cluster.number, cluster.size, min.cluster.size, noisen, diameter, average.distance, median.distance, separation, average.toother, separation.matrix, ave.between.matrix, average.between, average.within, n.between, n.within, max.diameter, min.separation, within.cluster.ss, clus.avg.silwidths, avg.silwidth, g2, g3, pearsongamma, dunn, dunn2, entropy, wb.ratio, ch, cwidegap, widestgap, corrected.rand, vi, sindex, svec, psep, stan, nnk, mnnd, pamc, pamcentroids, dindex, denscut, highdgap, npenalty, dpenalty, withindensp, densoc, pdistto, pclosetomode, distto, percwdens, percdensoc, parsimony, cvnnd, cvnndc}. Some of these are standardised, see Details. If \code{compareonly=TRUE}, only \code{corrected.rand, vi} are given out. If \code{aggregateonly=TRUE}, only \code{n, cluster.number, min.cluster.size, noisen, diameter, average.between, average.within, max.diameter, min.separation, within.cluster.ss, avg.silwidth, g2, g3, pearsongamma, dunn, dunn2, entropy, wb.ratio, ch, widestgap, corrected.rand, vi, sindex, svec, psep, stan, nnk, mnnd, pamc, pamcentroids, dindex, denscut, highdgap, parsimony, cvnnd, cvnndc} are given out. \code{summary.cquality} returns a list of type \code{summary.cquality} with components \code{average.within,nnk,mnnd, avg.silwidth, widestgap,sindex, pearsongamma,entropy,pamc, within.cluster.ss, dindex,denscut,highdgap, parsimony,max.diameter, min.separation,cvnnd}. These are as documented below for \code{cqcluster.stats}, but after transformation by \code{stanbound} and \code{largeisgood}, see arguments. \item{n}{number of points.} \item{cluster.number}{number of clusters.} \item{cluster.size}{vector of cluster sizes (number of points).} \item{min.cluster.size}{size of smallest cluster.} \item{noisen}{number of noise points, see argument \code{noisecluster} (\code{noisen=0} if \code{noisecluster=FALSE}).} \item{diameter}{vector of cluster diameters (maximum within cluster distances).} \item{average.distance}{vector of clusterwise within cluster average distances.} \item{median.distance}{vector of clusterwise within cluster distance medians.} \item{separation}{vector of clusterwise minimum distances of a point in the cluster to a point of another cluster.} \item{average.toother}{vector of clusterwise average distances of a point in the cluster to the points of other clusters.} \item{separation.matrix}{matrix of separation values between all pairs of clusters.} \item{ave.between.matrix}{matrix of mean dissimilarities between points of every pair of clusters.} \item{avebetween}{average distance between clusters.} \item{avewithin}{average distance within clusters (reweighted so that every observation, rather than every distance, has the same weight).} \item{n.between}{number of distances between clusters.} \item{n.within}{number of distances within clusters.} \item{maxdiameter}{maximum cluster diameter.} \item{minsep}{minimum cluster separation.} \item{withinss}{a generalisation of the within clusters sum of squares (k-means objective function), which is obtained if \code{d} is a Euclidean distance matrix. For general distance measures, this is half the sum of the within cluster squared dissimilarities divided by the cluster size.} \item{clus.avg.silwidths}{vector of cluster average silhouette widths. See \code{\link{silhouette}}.} \item{asw}{average silhouette width. 
See \code{\link{silhouette}}.} \item{g2}{Goodman and Kruskal's Gamma coefficient. See Milligan and Cooper (1985), Gordon (1999, p. 62).} \item{g3}{G3 coefficient. See Gordon (1999, p. 62).} \item{pearsongamma}{correlation between distances and a 0-1-vector where 0 means same cluster, 1 means different clusters. "Normalized gamma" in Halkidi et al. (2001).} \item{dunn}{minimum separation / maximum diameter. Dunn index, see Halkidi et al. (2002).} \item{dunn2}{minimum average dissimilarity between two cluster / maximum average within cluster dissimilarity, another version of the family of Dunn indexes.} \item{entropy}{entropy of the distribution of cluster memberships, see Meila(2007).} \item{wb.ratio}{\code{average.within/average.between}.} \item{ch}{Calinski and Harabasz index (Calinski and Harabasz 1974, optimal in Milligan and Cooper 1985; generalised for dissimilarites in Hennig and Liao 2013).} \item{cwidegap}{vector of widest within-cluster gaps.} \item{widestgap}{widest within-cluster gap or average of cluster-wise widest within-cluster gap, depending on parameter \code{averagegap}.} \item{corrected.rand}{corrected Rand index (if \code{alt.clustering} has been specified), see Gordon (1999, p. 198).} \item{vi}{variation of information (VI) index (if \code{alt.clustering} has been specified), see Meila (2007).} \item{sindex}{separation index, see argument \code{sepindex}.} \item{svec}{vector of smallest closest distances of points to next cluster that are used in the computation of \code{sindex} if \code{sepall=TRUE}.} \item{psep}{vector of all closest distances of points to next cluster.} \item{stan}{value by which som statistics were standardised, see Details.} \item{nnk}{value of input parameter \code{nnk}.} \item{mnnd}{average distance to \code{nnk}th nearest neighbour within cluster.} \item{pamc}{average distance to cluster centroid.} \item{pamcentroids}{index numbers of cluster centroids.} \item{dindex}{this index measures to what extent the density decreases from the cluster mode to the outskirts; I-densdec in Sec. 3.6 of Hennig (2019); low values are good.} \item{denscut}{this index measures whether cluster boundaries run through density valleys; I-densbound in Sec. 3.6 of Hennig (2019); low values are good.} \item{highdgap}{this measures whether there is a large within-cluster gap with high density on both sides; I-highdgap in Sec. 3.6 of Hennig (2019); low values are good.} \item{npenalty}{vector of penalties for all clusters that are used in the computation of \code{denscut}, see Hennig (2019) (these are sums of penalties over all points in the cluster).} \item{depenalty}{vector of penalties for all clusters that are used in the computation of \code{dindex}, see Hennig (2019) (these are sums of several penalties for density increase when going from the mode outward in the cluster).} \item{withindensp}{distance-based kernel density values for all points as computed in Sec. 3.6 of Hennig (2019).} \item{densoc}{contribution of points from other clusters than the one to which a point is assigned to the density, for all points; called \code{h_o} in Sec. 3.6 of Hennig (2019).} \item{pdistto}{list that for all clusters has a sequence of point numbers. These are the points already incorporated in the sequence of points constructed in the algorithm in Sec. 3.6 of Hennig (2019) to which the next point to be joined is connected.} \item{pclosetomode}{list that for all clusters has a sequence of point numbers. 
Sequence of points to be incorporated in the sequence of points constructed in the algorithm in Sec. 3.6 of Hennig (2019).} \item{distto}{list that for all clusters has a sequence of differences between the standardised densities (see \code{percwdens}) at the new point added and the point to which it is connected (if this is positive, the penalty is this to the square), in the algorithm in Sec. 3.6 of Hennig (2019).} \item{percwdens}{this is \code{withindensp} divided by its maximum.} \item{percdensoc}{this is \code{densoc} divided by the maximum of \code{withindensp}, called \code{h_o^*} in Sec. 3.6 of Hennig (2019).} \item{parsimony}{number of clusters divided by \code{maxk}.} \item{cvnnd}{coefficient of variation of dissimilarities to \code{nnk}th nearest within-cluster neighbour, measuring uniformity of within-cluster densities, weighted over all clusters, see Sec. 3.7 of Hennig (2019).} \item{cvnndc}{vector of cluster-wise coefficients of variation of dissimilarities to \code{nnk}th nearest within-cluster neighbour as required in computation of \code{cvnnd}.} } \references{ Akhanli, S. and Hennig, C. (2020) Calibrating and aggregating cluster validity indexes for context-adapted comparison of clusterings. \emph{Statistics and Computing}, 30, 1523-1544, \url{https://link.springer.com/article/10.1007/s11222-020-09958-2}, \url{https://arxiv.org/abs/2002.01822} Calinski, T., and Harabasz, J. (1974) A Dendrite Method for Cluster Analysis, \emph{Communications in Statistics}, 3, 1-27. Gordon, A. D. (1999) \emph{Classification}, 2nd ed. Chapman and Hall. Halkidi, M., Batistakis, Y., Vazirgiannis, M. (2001) On Clustering Validation Techniques, \emph{Journal of Intelligent Information Systems}, 17, 107-145. Hennig, C. and Liao, T. (2013) How to find an appropriate clustering for mixed-type variables with application to socio-economic stratification, \emph{Journal of the Royal Statistical Society, Series C Applied Statistics}, 62, 309-369. Hennig, C. (2013) How many bee species? A case study in determining the number of clusters. In: M. Spiliopoulou, L. Schmidt-Thieme, R. Janning (eds.): "Data Analysis, Machine Learning and Knowledge Discovery", Springer, Berlin, 41-49. Hennig, C. (2019) Cluster validation by measurement of clustering characteristics relevant to the user. In C. H. Skiadas (ed.) \emph{Data Analysis and Applications 1: Clustering and Regression, Modeling-estimating, Forecasting and Data Mining, Volume 2}, Wiley, New York, 1-24, \url{https://arxiv.org/abs/1703.09282} Kaufman, L. and Rousseeuw, P.J. (1990). "Finding Groups in Data: An Introduction to Cluster Analysis". Wiley, New York. Meila, M. (2007) Comparing clusterings - an information based distance, \emph{Journal of Multivariate Analysis}, 98, 873-895. Milligan, G. W. and Cooper, M. C. (1985) An examination of procedures for determining the number of clusters. \emph{Psychometrika}, 50, 159-179. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{cluster.stats}}, \code{\link{silhouette}}, \code{\link{dist}}, \code{\link{calinhara}}, \code{\link{distcritmulti}}. \code{\link{clusterboot}} computes clusterwise stability statistics by resampling.
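A compact standardised overview of the computed indexes for a single clustering can be obtained by applying the \code{summary} method documented above to the output, e.g., \code{summary(cqcluster.stats(d,clustering))} (a minimal usage sketch).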
} \examples{ set.seed(20000) options(digits=3) face <- rFace(200,dMoNo=2,dNoEy=0,p=2) dface <- dist(face) complete3 <- cutree(hclust(dface),3) cqcluster.stats(dface,complete3, alt.clustering=as.integer(attr(face,"grouping"))) } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/prediction.strength.Rd0000644000176200001440000001466413674423060016075 0ustar liggesusers\name{prediction.strength} \alias{prediction.strength} \alias{print.predstr} %- Also NEED an `\alias' for EACH other topic documented here. \title{Prediction strength for estimating number of clusters} \description{ Computes the prediction strength of a clustering of a dataset into different numbers of components. The prediction strength is defined according to Tibshirani and Walther (2005), who recommend choosing as the optimal number of clusters the largest number of clusters that leads to a prediction strength above 0.8 or 0.9. See details. Various clustering methods can be used, see argument \code{clustermethod}. In Tibshirani and Walther (2005), only classification to the nearest centroid is discussed, but more methods are offered here, see argument \code{classification}. } \usage{ prediction.strength(xdata, Gmin=2, Gmax=10, M=50, clustermethod=kmeansCBI, classification="centroid", centroidname = NULL, cutoff=0.8,nnk=1, distances=inherits(xdata,"dist"),count=FALSE,...) \method{print}{predstr}(x, ...) } %- maybe also `usage' for other objects documented here. \arguments{ \item{xdata}{data (something that can be coerced into a matrix).} \item{Gmin}{integer. Minimum number of clusters. Note that the prediction strength for 1 cluster is trivially 1, which is automatically included if \code{Gmin>1}. Therefore \code{Gmin<2} is useless.} \item{Gmax}{integer. Maximum number of clusters.} \item{M}{integer. Number of times the dataset is divided into two halves.} \item{clustermethod}{an interface function (the function name, not a string containing the name, has to be provided!). This defines the clustering method. See the "Details"-section of \code{\link{clusterboot}} and \code{\link{kmeansCBI}} for the format. Clustering methods for \code{prediction.strength} must have a \code{k}-argument for the number of clusters, must operate on n times p data matrices and must otherwise follow the specifications in \code{\link{clusterboot}}. Note that \code{prediction.strength} won't work with CBI-functions that implicitly already estimate the number of clusters such as \code{\link{pamkCBI}}; use \code{\link{claraCBI}} if you want to run it for pam/clara clustering.} \item{classification}{string. This determines how non-clustered points are classified to given clusters. Options are explained in \code{\link{classifnp}} and \code{\link{classifdist}}, the latter for dissimilarity data. Certain classification methods are connected to certain clustering methods. \code{classification="averagedist"} is recommended for average linkage, \code{classification="centroid"} is recommended for k-means, clara and pam (with distances it will work with \code{\link{claraCBI}} only), \code{classification="knn"} with \code{nnk=1} is recommended for single linkage and \code{classification="qda"} is recommended for Gaussian mixtures with flexible covariance matrices. } \item{centroidname}{string. Indicates the name of the component of \code{CBIoutput$result} that contains the cluster centroids in case of \code{classification="centroid"}, where \code{CBIoutput} is the output object of \code{clustermethod}.
If \code{clustermethod} is \code{kmeansCBI} or \code{claraCBI}, centroids are recognised automatically if \code{centroidname=NULL}. If \code{centroidname=NULL} and \code{distances=FALSE}, cluster means are computed as the cluster centroids.} \item{cutoff}{numeric between 0 and 1. The optimal number of clusters is the maximum one with prediction strength above \code{cutoff}.} \item{nnk}{number of nearest neighbours if \code{classification="knn"}, see \code{\link{classifnp}}.} \item{distances}{logical. If \code{TRUE}, data will be interpreted as dissimilarity matrix, passed on to clustering methods as \code{"dist"}-object, and \code{\link{classifdist}} will be used for classification.} \item{count}{logical. \code{TRUE} will print current number of clusters and simulation run number on the screen.} \item{x}{object of class \code{predstr}.} \item{...}{arguments to be passed on to the clustering method.} } \value{ \code{prediction.strength} gives out an object of class \code{predstr}, which is a list with components \item{predcorr}{list of vectors of length \code{M} with relative frequencies of correct predictions (clusterwise minimum). Every list entry refers to a certain number of clusters.} \item{mean.pred}{means of \code{predcorr} for all numbers of clusters.} \item{optimalk}{optimal number of clusters.} \item{cutoff}{see above.} \item{method}{a string identifying the clustering method.} \item{Gmax}{see above.} \item{M}{see above.} } \details{ The prediction strength for a certain number of clusters k under a random partition of the dataset in halves A and B is defined as follows. Both halves are clustered with k clusters. Then the points of A are classified to the clusters of B. In the original paper this is done by assigning every observation in A to the closest cluster centroid in B (corresponding to \code{classification="centroid"}), but other methods are possible, see \code{\link{classifnp}}. A pair of points of A in the same A-cluster is defined to be correctly predicted if both points are classified into the same cluster on B. The same is done with the points of B relative to the clustering on A. The prediction strength for each of the clusterings is the minimum (taken over all clusters) relative frequency of correctly predicted pairs of points of that cluster. The final mean prediction strength statistic is the mean over all 2M clusterings. } \references{ Tibshirani, R. and Walther, G. (2005) Cluster Validation by Prediction Strength, \emph{Journal of Computational and Graphical Statistics}, 14, 511-528. } \seealso{ \code{\link{kmeansCBI}}, \code{\link{classifnp}} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \examples{ options(digits=3) set.seed(98765) iriss <- iris[sample(150,20),-5] prediction.strength(iriss,2,3,M=3) prediction.strength(iriss,2,3,M=3,clustermethod=claraCBI) # The examples are fast, but of course M should really be larger. } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/wfu.Rd0000644000176200001440000000177713467541512012705 0ustar liggesusers\name{wfu} \alias{wfu} %- Also NEED an `\alias' for EACH other topic documented here. \title{Weight function (for Mahalanobis distances)} \description{ Function of the elements of \code{md}, which is 1 for arguments smaller than \code{ca}, 0 for arguments larger than \code{ca2} and linear (default: continuous) in between. Thought for use in \code{fixmahal}.
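As a sketch of the resulting piecewise linear form (inferred here from the default values of \code{a1} and \code{a0} in the usage below, meant as an illustration; the boundary cases agree by continuity): \deqn{w(x)=1 \mbox{ for } x \le ca, \quad w(x)=\frac{ca2-x}{ca2-ca} \mbox{ for } ca < x < ca2, \quad w(x)=0 \mbox{ for } x \ge ca2.}{w(x) = 1 for x <= ca, w(x) = (ca2-x)/(ca2-ca) for ca < x < ca2, w(x) = 0 for x >= ca2.}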
} \usage{ wfu(md, ca, ca2, a1 = 1/(ca - ca2), a0 = -a1 * ca2) } %- maybe also `usage' for other objects documented here. \arguments{ \item{md}{vector of positive numericals.} \item{ca}{positive numerical.} \item{ca2}{positive numerical.} \item{a1}{numerical. Slope.} \item{a0}{numerical. Intercept.} } \value{ A vector of numericals between 0 and 1. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/}} \seealso{\code{\link{fixmahal}}} \examples{ md <- seq(0,10,by=0.1) round(wfu(md,ca=5,ca2=8),digits=2) } \keyword{arith}% at least one, from doc/KEYWORDS fpc/man/xtable.Rd0000644000176200001440000000214713467541512013357 0ustar liggesusers\name{xtable} \alias{xtable} %- Also NEED an `\alias' for EACH other topic documented here. \title{Partition crosstable with empty clusters} \description{ This produces a crosstable between two integer vectors (partitions) of the same length with a given maximum vector entry \code{k} so that the size of the table is \code{k*k} with zeroes for missing entries between 1 and \code{k} (the command \code{\link{table}} does pretty much the same thing but will leave out missing entries). } \usage{ xtable(c1,c2,k) } \arguments{ \item{c1}{vector of integers.} \item{c2}{vector of integers of same length as \code{c1}.} \item{k}{integer. Must be larger or equal to maximum entry in \code{c1} and \code{c2}.} } \value{ A matrix of dimensions \code{c(k,k)}. Entry \code{[i,j]} gives the number of places in which \code{c1==i & c2==j}. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{table}} } \examples{ c1 <- 1:3 c2 <- c(1,1,2) xtable(c1,c2,3) } \keyword{array}% at least one, from doc/KEYWORDS fpc/man/cgrestandard.Rd0000644000176200001440000001142514532224760014535 0ustar liggesusers\name{cgrestandard} \alias{cgrestandard} %- Also NEED an `\alias' for EACH other topic documented here. \title{Standardise cluster validation statistics by random clustering results} \description{ Standardises cluster validity statistics as produced by \code{\link{clustatsum}} relative to results that were achieved by random clusterings on the same data by \code{\link{randomclustersim}}. The aim is to make differences between values comparable between indexes, see Hennig (2019), Akhanli and Hennig (2020). This is mainly for use within \code{\link{clusterbenchstats}}. } \usage{ cgrestandard(clusum,clusim,G,percentage=FALSE, useallmethods=FALSE, useallg=FALSE, othernc=list()) } %- maybe also `usage' for other objects documented here. \arguments{ \item{clusum}{object of class "valstat", see \code{\link{clusterbenchstats}}.} \item{clusim}{list; output object of \code{\link{randomclustersim}}, see there.} \item{G}{vector of integers. Numbers of clusters to consider.} \item{percentage}{logical. If \code{FALSE}, standardisation is done to mean zero and standard deviation 1 using the random clusterings. If \code{TRUE}, the output is the percentage of simulated values below the result (more precisely, this number plus one divided by the total plus one).} \item{useallmethods}{logical. If \code{FALSE}, only random clustering results from \code{clusim} are used for standardisation. If \code{TRUE}, also clustering results from other methods as given in \code{clusum} are used.} \item{useallg}{logical. If \code{TRUE}, standardisation uses results from all numbers of clusters in \code{G}. 
If \code{FALSE}, standardisation of results for a specific number of clusters only uses results from that number of clusters.} \item{othernc}{list of integer vectors of length 2. This allows the incorporation of methods that bring forth other numbers of clusters than those in \code{G}, for example because a method may have automatically estimated a number of clusters. The first number is the number of the clustering method (the order is determined by argument \code{clustermethod} in \code{\link{clusterbenchstats}}), the second number is the number of clusters. Results specified here are only standardised if \code{useallg=TRUE}.} } \details{ \code{cgrestandard} will add a statistic named \code{dmode} to the input set of validation statistics, which is defined as \code{0.75*dindex+0.25*highdgap}, aggregating these two closely related statistics, see \code{\link{clustatsum}}. } \value{ List of class \code{"valstat"}, see \code{\link{valstat.object}}, with standardised results as explained above. } \references{ Hennig, C. (2019) Cluster validation by measurement of clustering characteristics relevant to the user. In C. H. Skiadas (ed.) \emph{Data Analysis and Applications 1: Clustering and Regression, Modeling-estimating, Forecasting and Data Mining, Volume 2}, Wiley, New York 1-24, \url{https://arxiv.org/abs/1703.09282} Akhanli, S. and Hennig, C. (2020) Calibrating and aggregating cluster validity indexes for context-adapted comparison of clusterings. \emph{Statistics and Computing}, 30, 1523-1544, \url{https://link.springer.com/article/10.1007/s11222-020-09958-2}, \url{https://arxiv.org/abs/2002.01822} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{valstat.object}}, \code{\link{clusterbenchstats}}, \code{\link{stupidkcentroids}}, \code{\link{stupidknn}}, \code{\link{stupidkfn}}, \code{\link{stupidkaven}}, \code{\link{clustatsum}} } \examples{ set.seed(20000) options(digits=3) face <- rFace(10,dMoNo=2,dNoEy=0,p=2) dif <- dist(face) clusum <- list() clusum[[2]] <- list() cl12 <- kmeansCBI(face,2) cl13 <- kmeansCBI(face,3) cl22 <- claraCBI(face,2) cl23 <- claraCBI(face,3) ccl12 <- clustatsum(dif,cl12$partition) ccl13 <- clustatsum(dif,cl13$partition) ccl22 <- clustatsum(dif,cl22$partition) ccl23 <- clustatsum(dif,cl23$partition) clusum[[1]] <- list() clusum[[1]][[2]] <- ccl12 clusum[[1]][[3]] <- ccl13 clusum[[2]][[2]] <- ccl22 clusum[[2]][[3]] <- ccl23 clusum$maxG <- 3 clusum$minG <- 2 clusum$method <- c("kmeansCBI","claraCBI") clusum$name <- c("kmeansCBI","claraCBI") clusim <- randomclustersim(dist(face),G=2:3,nnruns=1,kmruns=1, fnruns=1,avenruns=1,monitor=FALSE) cgr <- cgrestandard(clusum,clusim,2:3) cgr2 <- cgrestandard(clusum,clusim,2:3,useallg=TRUE) cgr3 <- cgrestandard(clusum,clusim,2:3,percentage=TRUE) print(str(cgr)) print(str(cgr2)) print(cgr3[[1]][[2]]) } \keyword{multivariate}% at least one, from doc/KEYWORDS \keyword{cluster}% __ONLY ONE__ keyword per line fpc/man/zmisclassification.matrix.Rd0000644000176200001440000000470113467541512017277 0ustar liggesusers\name{zmisclassification.matrix} \alias{zmisclassification.matrix} %- Also NEED an `\alias' for EACH other topic documented here. \title{Matrix of misclassification probabilities between mixture components} \description{ Matrix of misclassification probabilities in a mixture distribution between two mixture components from estimated posterior probabilities regardless of component parameters, see Hennig (2010).
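As a rough illustration of the idea (the precise estimator is given in Hennig (2010); the formula here is an assumed sketch, not a transcript of the implementation): the probability that an observation generated by component \eqn{j} is assigned to cluster \eqn{i} can be estimated by the share of component \eqn{j}'s posterior probability mass that falls on observations classified to \eqn{i} by the maximum a posteriori rule, \deqn{\hat{p}_{ij} = \sum_{k: c_k=i} z_{kj} \Big/ \sum_{k=1}^n z_{kj},}{p_ij = sum_{k: c_k=i} z_kj / sum_k z_kj,} where \eqn{c_k} denotes the estimated component of observation \eqn{k}.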
} \usage{ zmisclassification.matrix(z,pro=NULL,clustering=NULL, ipairs="all",symmetric=TRUE, stat="max") } %- maybe also `usage' for other objects documented here. \arguments{ \item{z}{matrix of posterior probabilities for observations (rows) to belong to mixture components (columns), so entries need to sum up to 1 for each row.} \item{pro}{vector of component proportions, need to sum up to 1. Computed from \code{z} as default.} \item{clustering}{vector of integers giving the estimated mixture components for every observation. Computed from \code{z} as default.} \item{ipairs}{\code{"all"} or list of vectors of two integers. If \code{ipairs="all"}, computations are carried out for all pairs of components. Otherwise, ipairs gives the pairs of components for which computations are carried out.} \item{symmetric}{logical. If \code{TRUE}, the matrix is symmetrised, see parameter \code{stat}.} \item{stat}{\code{"max"} or \code{"mean"}. The statistic by which the two misclassification probabilities are aggregated if \code{symmetric=TRUE}.} } \value{ A matrix with the (symmetrised, if required) misclassification probabilities between each pair of mixture components. If \code{symmetric=FALSE}, matrix entry \code{[i,j]} is the estimated probability that an observation generated by component \code{j} is classified to component \code{i} by maximum a posteriori rule. } \references{ Hennig, C. (2010) Methods for merging Gaussian mixture components, \emph{Advances in Data Analysis and Classification}, 4, 3-34. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{confusion}} } \examples{ set.seed(12345) m <- rpois(20,lambda=5) dim(m) <- c(5,4) m <- m/apply(m,1,sum) round(zmisclassification.matrix(m,symmetric=FALSE),digits=2) } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/regmix.Rd0000644000176200001440000001405313674511116013367 0ustar liggesusers\name{regmix} \alias{regmix} \alias{regem} %- Also NEED an `\alias' for EACH other topic documented here. \title{Mixture Model ML for Clusterwise Linear Regression} \description{ Computes an ML-estimator for clusterwise linear regression under a regression mixture model with Normal errors. Parameters are proportions, regression coefficients and error variances, all independent of the values of the independent variable, and all may differ for different clusters. Computation is by the EM-algorithm. The number of clusters is estimated via the Bayesian Information Criterion (BIC). Note that package \code{flexmix} has more sophisticated tools to do the same thing and is recommended. The functions are kept in here only for compatibility reasons. } \usage{ regmix(indep, dep, ir=1, nclust=1:7, icrit=1.e-5, minsig=1.e-6, warnings=FALSE) regem(indep, dep, m, cln, icrit=1.e-5, minsig=1.e-6, warnings=FALSE) } %- maybe also `usage' for other objects documented here. \arguments{ \item{indep}{numerical matrix or vector. Independent variables.} \item{dep}{numerical vector. Dependent variable.} \item{ir}{positive integer. Number of iteration runs for every number of clusters.} \item{nclust}{vector of positive integers. Numbers of clusters.} \item{icrit}{positive numerical. Stopping criterion for the iterations (difference of loglikelihoods).} \item{minsig}{positive numerical. Minimum value for the variance parameters (likelihood is unbounded if variances are allowed to converge to 0).} \item{warnings}{logical. 
If \code{TRUE}, warnings are given during the EM iteration in case of collinear regressors, too small mixture components and error variances smaller than minimum. In the former two cases, the algorithm is terminated without a result, but an optimal solution is still computed from other algorithm runs (if there are others). In the latter case, the corresponding variance is set to the minimum.} \item{cln}{positive integer. (Single) number of clusters.} \item{m}{matrix of positive numericals. Number of columns must be \code{cln}. Number of rows must be number of data points. Rows must add up to 1. Initial configuration for the EM iteration in terms of a probability vector for every point which gives its degree of membership to every cluster. As generated by \code{\link{randcmatrix}}.} } \details{ The result of the EM iteration depends on the initial configuration, which is generated randomly by \code{\link{randcmatrix}} for \code{regmix}. \code{regmix} calls \code{regem}. To provide the initial configuration manually, use parameter \code{m} of \code{regem} directly. Take a look at the example for how to generate \code{m} if you want to specify initial parameters. The original paper DeSarbo and Cron (1988) suggests the AIC for estimating the number of clusters. The use of the BIC is advocated by Wedel and DeSarbo (1995). The BIC is defined here as \code{2*loglik - log(n)*((p+3)*cln-1)}, \code{p} being the number of independent variables, i.e., the larger the better. See the entry for the input parameter \code{warnings} for the treatment of several numerical problems. } \value{ \code{regmix} returns a list containing the components \code{clnopt, loglik, bic, coef, var, eps, z, g}. \code{regem} returns a list containing the components \code{loglik, coef, var, z, g, warn}. \item{clnopt}{optimal number of clusters according to the BIC.} \item{loglik}{loglikelihood for the optimal model.} \item{bic}{vector of BIC values for all numbers of clusters in \code{nclust}.} \item{coef}{matrix of regression coefficients. First row: intercept parameter. Second row: parameter of first independent variable and so on. Columns corresponding to clusters.} \item{var}{vector of error variance estimators for the clusters.} \item{eps}{vector of cluster proportion estimators.} \item{z}{matrix of estimated a posteriori probabilities of the points (rows) to be generated by the clusters (columns). Compare input argument \code{m}.} \item{g}{integer vector of estimated cluster numbers for the points (via argmax over \code{z}).} \item{warn}{logical. \code{TRUE} if one of the estimated clusters has too few points and/or collinear regressors.} } \references{ DeSarbo, W. S. and Cron, W. L. (1988) A maximum likelihood methodology for clusterwise linear regression, \emph{Journal of Classification} 5, 249-282. Wedel, M. and DeSarbo, W. S. (1995) A mixture likelihood approach for generalized linear models, \emph{Journal of Classification} 12, 21-56. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/}} \seealso{ Regression mixtures can also (and probably better) be computed with the flexmix package, see \code{\link[flexmix]{flexmix}}. (When I first wrote the \code{regmix}-function, \code{flexmix} didn't exist.) \code{\link{fixreg}} for fixed point clusters for clusterwise linear regression. \code{\link[mclust:mclustBIC]{EMclust}} for Normal mixture model fitting (non-regression).
} \examples{ \dontrun{ # This apparently gives slightly different # but data-analytically fine results # on some versions of R. set.seed(12234) data(tonedata) attach(tonedata) rmt1 <- regmix(stretchratio,tuned,nclust=1:2) # nclust=1:2 makes the example fast; # a more serious application would rather use the default. rmt1$g round(rmt1$bic,digits=2) # start with initial parameter values cln <- 3 n <- 150 initcoef <- cbind(c(2,0),c(0,1),c(0,2.5)) initvar <- c(0.001,0.0001,0.5) initeps <- c(0.4,0.3,0.3) # computation of m from initial parameters m <- matrix(nrow=n, ncol=cln) stm <- numeric(0) for (i in 1:cln) for (j in 1:n){ m[j,i] <- initeps[i]*dnorm(tuned[j],mean=initcoef[1,i]+ initcoef[2,i]*stretchratio[j], sd=sqrt(initvar[i])) } for (j in 1:n){ stm[j] <- sum(m[j,]) for (i in 1:cln) m[j,i] <- m[j,i]/stm[j] } rmt2 <- regem(stretchratio, tuned, m, cln) } } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{regression} fpc/man/rFace.Rd0000644000176200001440000000522613467541512013121 0ustar liggesusers\name{rFace} \alias{rFace} %- Also NEED an `\alias' for EACH other topic documented here. \title{"Face-shaped" clustered benchmark datasets} \description{ Generates "face-shaped" clustered benchmark datasets. This is based on a collaboration with Martin Maechler. } \usage{ rFace(n, p = 6, nrep.top = 2, smile.coef = 0.6, dMoNo = 1.2, dNoEy = 1) } %- maybe also `usage' for other objects documented here. \arguments{ \item{n}{integer greater or equal to 10. Number of points.} \item{p}{integer greater or equal to 2. Dimension.} \item{nrep.top}{integer. Number of repetitions of the hair-top point.} \item{smile.coef}{numeric. Coefficient for quadratic term used for generation of mouth-points. Positive values=>smile.} \item{dMoNo}{number. Distance from mouth to nose.} \item{dNoEy}{number. Minimum vertical distance from mouth to eyes.} } \details{ The function generates a nice benchmark example for cluster analysis. There are six "clusters" in this data, of which the first five are clearly homogeneous patterns, but with different distributional shapes and different qualities of separation. The clusters are distinguished only in the first two dimensions. The attribute \code{grouping} is a factor giving the cluster numbers, see below. The sixth group of points corresponds to some hairs, and is rather a collection of outliers than a cluster in itself. This group contains \code{nrep.top+2} points. Of the remaining points, 20\% belong to cluster 1, the chin (quadratic function plus noise). 10\% belong to cluster 2, the right eye (Gaussian). 30\% belong to cluster 3, the mouth (Gaussian/squared Gaussian). 20\% belong to cluster 4, the nose (Gaussian/gamma), and 20\% belong to cluster 5, the left eye (uniform). The distributions of the further variables are homogeneous over all points. The third dimension is exponentially distributed, the fourth dimension is Cauchy distributed, all further distributions are Gaussian. Please consider the source code for exact generation of the clusters. 
} \value{ An \code{n} times \code{p} numeric matrix with attributes \item{grouping}{a factor giving the cluster memberships of the points.} \item{indexlist}{a list of six vectors containing the indices of points belonging to the six groups.} } \author{ Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/}} \examples{ set.seed(4634) face <- rFace(600,dMoNo=2,dNoEy=0) grface <- as.integer(attr(face,"grouping")) plot(face, col = grface) # pairs(face, col = grface, main ="rFace(600,dMoNo=2,dNoEy=0)") } \keyword{data}% at least one, from doc/KEYWORDS fpc/man/can.Rd0000644000176200001440000000212713467541512012637 0ustar liggesusers\name{can} \alias{can} %- Also NEED an `\alias' for EACH other topic documented here. \title{Generation of the tuning constant for regression fixed point clusters} \description{ Generates tuning constants \code{ca} for \code{\link{fixreg}} dependent on the number of points and variables of the dataset. Only thought for use in \code{\link{fixreg}}. } \usage{ can(n, p) } %- maybe also `usage' for other objects documented here. \arguments{ \item{n}{positive integer. Number of points.} \item{p}{positive integer. Number of independent variables.} } \details{ The formula is \eqn{3+33/(n*2^{-(p-1)/2})^{1/3}+2900000/(n*2^{-(p-1)/2})^3}. For justification cf. Hennig (2002). } \value{ A number. } \references{ Hennig, C. (2002) Fixed point clusters for linear regression: computation and comparison, \emph{Journal of Classification} 19, 249-276. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/}} \seealso{\code{\link{fixreg}}} \examples{ can(429,3) } \keyword{arith}% at least one, from doc/KEYWORDS fpc/man/fixreg.Rd0000644000176200001440000003520213467541512013362 0ustar liggesusers\name{fixreg} \alias{fixreg} \alias{summary.rfpc} \alias{plot.rfpc} \alias{fpclusters.rfpc} \alias{print.summary.rfpc} \alias{print.rfpc} \alias{rfpi} %- Also NEED an `\alias' for EACH other topic documented here. \title{Linear Regression Fixed Point Clusters} \description{ Computes linear regression fixed point clusters (FPCs), i.e., subsets of the data, which consist exactly of the non-outliers w.r.t. themselves, and may be interpreted as generated from a homogeneous linear regression relation between independent and dependent variable. FPCs may overlap, are not necessarily exhausting and do not need a specification of the number of clusters. Note that while \code{fixreg} has lots of parameters, only one (or few) of them have usually to be specified, cf. the examples. The philosophy is to allow much flexibility, but to always provide sensible defaults. } \usage{ fixreg(indep=rep(1,n), dep, n=length(dep), p=ncol(as.matrix(indep)), ca=NA, mnc=NA, mtf=3, ir=NA, irnc=NA, irprob=0.95, mncprob=0.5, maxir=20000, maxit=5*n, distcut=0.85, init.group=list(), ind.storage=FALSE, countmode=100, plot=FALSE) \method{summary}{rfpc}(object, ...) \method{print}{summary.rfpc}(x, maxnc=30, ...) \method{plot}{rfpc}(x, indep=rep(1,n), dep, no, bw=TRUE, main=c("Representative FPC No. ",no), xlab="Linear combination of independents", ylab=deparse(substitute(indep)), xlim=NULL, ylim=range(dep), pch=NULL, col=NULL,...) \method{fpclusters}{rfpc}(object, indep=NA, dep=NA, ca=object$ca, ...) rfpi(indep, dep, p, gv, ca, maxit, plot) } %- maybe also `usage' for other objects documented here. \arguments{ \item{indep}{numerical matrix or vector. Independent variables. Leave out for clustering one-dimensional data. 
\code{fpclusters.rfpc} does not need specification of \code{indep} if \code{fixreg} was run with \code{ind.storage=TRUE}.} \item{dep}{numerical vector. Dependent variable. \code{fpclusters.rfpc} does not need specification of \code{dep} if \code{fixreg} was run with \code{ind.storage=TRUE}.} \item{n}{optional positive integer. Number of cases.} \item{p}{optional positive integer. Number of independent variables.} \item{ca}{optional positive number. Tuning constant, specifying required cluster separation. By default determined automatically as a function of \code{n} and \code{p}, see function \code{\link{can}}, Hennig (2002a).} \item{mnc}{optional positive integer. Minimum size of clusters to be reported. By default determined automatically as a function of \code{mncprob}. See Hennig (2002a).} \item{mtf}{optional positive integer. FPCs must be found at least \code{mtf} times to be reported by \code{summary.rfpc}.} \item{ir}{optional positive integer. Number of algorithm runs. By default determined automatically as a function of \code{n}, \code{p}, \code{irnc}, \code{irprob}, \code{mtf}, \code{maxir}. See function \code{\link{itnumber}} and Hennig (2002a).} \item{irnc}{optional positive integer. Size of the smallest cluster to be found with approximated probability \code{irprob}.} \item{irprob}{optional value between 0 and 1. Approximated probability for a cluster of size \code{irnc} to be found.} \item{mncprob}{optional value between 0 and 1. Approximated probability for a cluster of size \code{mnc} to be found.} \item{maxir}{optional integer. Maximum number of algorithm runs.} \item{maxit}{optional integer. Maximum number of iterations per algorithm run (usually an FPC is found much earlier).} \item{distcut}{optional value between 0 and 1. A similarity measure between FPCs, given in Hennig (2002a), and the corresponding Single Linkage groups of FPCs with similarity larger than \code{distcut} are computed. A single representative FPC is selected for each group.} \item{init.group}{optional list of logical vectors of length \code{n}. Every vector indicates a starting configuration for the fixed point algorithm. This can be used for datasets with high dimension, where the vectors of \code{init.group} indicate cluster candidates found by graphical inspection or background knowledge.} \item{ind.storage}{optional logical. If \code{TRUE}, then all indicator vectors of found FPCs are given in the value of \code{fixreg}. May need lots of memory, but is a bit faster.} \item{countmode}{optional positive integer. Every \code{countmode} algorithm runs, \code{fixreg} shows a message.} \item{plot}{optional logical. If \code{TRUE}, you get a scatterplot of first independent vs. dependent variable at each iteration.} \item{object}{object of class \code{rfpc}, output of \code{fixreg}.} \item{x}{object of class \code{rfpc}, output of \code{fixreg}.} \item{maxnc}{positive integer. Maximum number of FPCs to be reported.} \item{no}{positive integer. Number of the representative FPC to be plotted.} \item{bw}{optional logical. If \code{TRUE}, plot is black/white, FPC is indicated by different symbol. Else FPC is indicated in red.} \item{main}{plot title.} \item{xlab}{label for x-axis.} \item{ylab}{label for y-axis.} \item{xlim}{plotted range of x-axis. If \code{NULL}, the range of the plotted linear combination of independent variables is used.} \item{ylim}{plotted range of y-axis.} \item{pch}{plotting symbol, see \code{\link{par}}.
If \code{NULL}, the default is used.} \item{col}{plotting color, see \code{\link{par}}. If \code{NULL}, the default is used.} \item{gv}{logical vector of length \code{n}. Indicates the initial configuration for the fixed point algorithm.} \item{...}{additional parameters to be passed to \code{plot} (no effects elsewhere).} } \details{A linear regression FPC is a data subset that reproduces itself under the following operation: \cr Compute linear regression and error variance estimator for the data subset, and compute all points of the dataset for which the squared residual is smaller than \code{ca} times the error variance.\cr Fixed points of this operation can be considered as clusters, because they contain only non-outliers (as defined by the above mentioned procedure) and all other points are outliers w.r.t. the subset. \cr \code{fixreg} performs \code{ir} fixed point algorithms started from random subsets of size \code{p+2} to look for FPCs. Additionally an algorithm is started from the whole dataset, and algorithms are started from the subsets specified in \code{init.group}. \cr Usually some of the FPCs are unstable, and more than one FPC may correspond to the same significant pattern in the data. Therefore the number of FPCs is reduced: FPCs with less than \code{mnc} points are ignored. Then a similarity matrix is computed between the remaining FPCs. Similarity between sets is defined as the ratio between 2 times size of intersection and the sum of sizes of both sets. The Single Linkage clusters (groups) of level \code{distcut} are computed, i.e. the connectivity components of the graph where edges are drawn between FPCs with similarity larger than \code{distcut}. Groups of FPCs whose members are found \code{mtf} times or more are considered as stable enough. A representative FPC is chosen for every Single Linkage cluster of FPCs according to the maximum expectation ratio \code{ser}. \code{ser} is the ratio between the number of findings of an FPC and the estimated expectation of the number of findings of an FPC of this size, called \emph{expectation ratio} and computed by \code{\link{clusexpect}}.\cr Usually only the representative FPCs of stable groups are of interest. \cr The choice of the involved tuning constants such as \code{ca} and \code{ir} is discussed in detail in Hennig (2002a). Statistical theory is presented in Hennig (2003).\cr Generally, the default settings are recommended for \code{fixreg}. In cases where they lead to a too large number of algorithm runs (e.g., always for \code{p>4}), the use of \code{init.group} together with \code{mtf=1} and \code{ir=0} is useful. Occasionally, \code{irnc} may be chosen smaller than the default, if smaller clusters are of interest, but this may lead to too many clusters and too many algorithm runs. Decrease of \code{ca} will often lead to too many clusters, even for homogeneous data. Increase of \code{ca} will produce only very strongly separated clusters. Both may be of interest occasionally. \code{rfpi} is called by \code{fixreg} for a single fixed point algorithm and will usually not be executed alone. \code{summary.rfpc} gives a summary about the representative FPCs of stable groups. \code{plot.rfpc} is a plot method for the representative FPC of stable group no. \code{no}. It produces a scatterplot of the linear combination of independent variables determined by the regression coefficients of the FPC vs. the dependent variable. The regression line and the region of non-outliers determined by \code{ca} are plotted as well. 
\code{fpclusters.rfpc} produces a list of indicator vectors for the representative FPCs of stable groups. } \value{ \code{fixreg} returns an object of class \code{rfpc}. This is a list containing the components \code{nc, g, coefs, vars, nfound, er, tsc, ncoll, grto, imatrix, smatrix, stn, stfound, sfpc, ssig, sto, struc, n, p, ca, ir, mnc, mtf, distcut}. \code{summary.rfpc} returns an object of class \code{summary.rfpc}. This is a list containing the components \code{coefs, vars, stfound, stn, sn, ser, tsc, sim, ca, ir, mnc, mtf}. \code{fpclusters.rfpc} returns a list of indicator vectors for the representative FPCs of stable groups. \code{rfpi} returns a list with the components \code{coef, var, g, coll, ca}. \item{nc}{integer. Number of FPCs.} \item{g}{list of logical vectors. Indicator vectors of FPCs. \code{FALSE} if \code{ind.storage=FALSE}.} \item{coefs}{list of numerical vectors. Regression coefficients of FPCs. In \code{summary.rfpc}, only for representative FPCs of stable groups and sorted according to \code{stfound}.} \item{vars}{list of numbers. Error variances of FPCs. In \code{summary.rfpc}, only for representative FPCs of stable groups and sorted according to \code{stfound}.} \item{nfound}{vector of integers. Number of findings for the FPCs.} \item{er}{numerical vector. Expectation ratios of FPCs. Can be taken as a stability measure.} \item{tsc}{integer. Number of algorithm runs leading to too small or too seldom found FPCs.} \item{ncoll}{integer. Number of algorithm runs where collinear regressor matrices occurred.} \item{grto}{vector of integers. Numbers of FPCs to which algorithm runs led, which were started by \code{init.group}.} \item{imatrix}{vector of integers. Size of intersection between FPCs. See \code{\link{sseg}}.} \item{smatrix}{numerical vector. Similarities between FPCs. See \code{\link{sseg}}.} \item{stn}{integer. Number of representative FPCs of stable groups. In \code{summary.rfpc} sorted according to \code{stfound}.} \item{stfound}{vector of integers. Number of findings of members of all groups of FPCs. In \code{summary.rfpc} sorted according to \code{stfound}.} \item{sfpc}{vector of integers. Numbers of representative FPCs.} \item{ssig}{vector of integers. As \code{sfpc}, but only for stable groups.} \item{sto}{vector of integers. Number of representative FPC of most, 2nd most, ..., often found group of FPCs.} \item{struc}{vector of integers. Number of group an FPC belongs to.} \item{n}{see arguments.} \item{p}{see arguments.} \item{ca}{see arguments.} \item{ir}{see arguments.} \item{mnc}{see arguments.} \item{mtf}{see arguments.} \item{distcut}{see arguments.} \item{sn}{vector of integers. Number of points of representative FPCs.} \item{ser}{numerical vector. Expectation ratio for stable groups.} \item{sim}{vector of integers. Size of intersections between representative FPCs of stable groups. See \code{\link{sseg}}.} \item{coef}{vector of regression coefficients.} \item{var}{error variance.} \item{g}{logical indicator vector of iterated FPC.} \item{coll}{logical. \code{TRUE} means that singular covariance matrices occurred during the iterations.} } \references{ Hennig, C. (2002) Fixed point clusters for linear regression: computation and comparison, \emph{Journal of Classification} 19, 249-276. Hennig, C. (2003) Clusters, outliers and regression: fixed point clusters, \emph{Journal of Multivariate Analysis} 86, 183-212. 
} \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/}} \seealso{ \code{\link{fixmahal}} for fixed point clusters in the usual setup (non-regression). \code{\link{regmix}} for clusterwise linear regression by mixture modeling ML. \code{\link{can}}, \code{\link{itnumber}} for computation of the default settings. \code{\link{clusexpect}} for estimation of the expected number of findings of an FPC of given size. \code{\link{itnumber}} for the generation of the number of fixed point algorithms. \code{\link{minsize}} for the smallest FPC size to be found with a given probability. \code{\link{sseg}} for indexing the similarity/intersection vectors computed by \code{fixreg}. } \examples{ set.seed(190000) options(digits=3) data(tonedata) attach(tonedata) tonefix <- fixreg(stretchratio,tuned,mtf=1,ir=20) summary(tonefix) # This is designed to have a fast example; default setting would be better. # If you want to see more (and you have a bit more time), # try out the following: \dontrun{ set.seed(1000) tonefix <- fixreg(stretchratio,tuned) # Default - good for these data summary(tonefix) plot(tonefix,stretchratio,tuned,1) plot(tonefix,stretchratio,tuned,2) plot(tonefix,stretchratio,tuned,3,bw=FALSE,pch=5) toneclus <- fpclusters(tonefix,stretchratio,tuned) plot(stretchratio,tuned,col=1+toneclus[[2]]) tonefix2 <- fixreg(stretchratio,tuned,distcut=1,mtf=1,countmode=50) # Every found fixed point cluster is reported, # no matter how instable it may be. summary(tonefix2) tonefix3 <- fixreg(stretchratio,tuned,ca=7) # ca defaults to 10.07 for these data. summary(tonefix3) subset <- c(rep(FALSE,5),rep(TRUE,24),rep(FALSE,121)) tonefix4 <- fixreg(stretchratio,tuned, mtf=1,ir=0,init.group=list(subset)) summary(tonefix4) } } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{robust}% __ONLY ONE__ keyword per line \keyword{regression} fpc/man/plot.valstat.Rd0000644000176200001440000001745014355662726014536 0ustar liggesusers\name{plot.valstat} \alias{plot.valstat} \alias{print.valstat} %- Also NEED an `\alias' for EACH other topic documented here. \title{Simulation-standardised plot and print of cluster validation statistics} \description{ Visualisation and print function for cluster validation output compared to results on simulated random clusterings. The print method can also be used to compute and print an aggregated cluster validation index. Unlike for many other plot methods, the additional arguments of \code{plot.valstat} are essential. \code{print.valstat} should make good sense with the defaults, but for computing the aggregate index the arguments \code{aggregate} and \code{weights} need to be set. } \usage{ \method{plot}{valstat}(x,simobject=NULL,statistic="sindex", xlim=NULL,ylim=c(0,1), nmethods=length(x)-5, col=1:nmethods,cex=1,pch=c("c","f","a","n"), simcol=rep(grey(0.7),4), shift=c(-0.1,-1/3,1/3,0.1),include.othernc=NULL,...) \method{print}{valstat}(x,statistics=x$statistics, nmethods=length(x)-5,aggregate=FALSE, weights=NULL,digits=2, include.othernc=NULL,...) } %- maybe also `usage' for other objects documented here.
\arguments{ \item{x}{object of class \code{"valstat"}, such as sublists \code{stat, qstat, sstat} of \code{\link{clusterbenchstats}}-output.} \item{simobject}{list of simulation results as produced by \code{\link{randomclustersim}} and documented there; typically sublist \code{sim} of \code{\link{clusterbenchstats}}-output.} \item{statistic}{one of \code{"avewithin","mnnd","variation", "diameter","gap","sindex","minsep","asw","dindex","denscut", "highdgap","pg","withinss","entropy","pamc","kdnorm","kdunif","dmode"}; validation statistic to be plotted.} \item{xlim}{passed on to \code{plot}. Default is the range of all involved numbers of clusters, minimum minus 0.5 to maximum plus 0.5.} \item{ylim}{passed on to \code{plot}.} \item{nmethods}{integer. Number of clustering methods to involve (these are those from number 1 to \code{nmethods} specified in \code{x$name}).} \item{col}{colours used for the different clustering methods.} \item{cex}{passed on to \code{plot}.} \item{pch}{vector of symbols for random clustering results from \code{\link{stupidkcentroids}}, \code{\link{stupidkfn}}, \code{\link{stupidkaven}}, \code{\link{stupidknn}}. To be passed on to \code{plot}.} \item{simcol}{vector of colours used for random clustering results in order \code{\link{stupidkcentroids}}, \code{\link{stupidkfn}}, \code{\link{stupidkaven}}, \code{\link{stupidknn}}.} \item{shift}{numeric vector. Indicates the amount to which the results from \code{\link{stupidkcentroids}}, \code{\link{stupidkfn}}, \code{\link{stupidkaven}}, \code{\link{stupidknn}} are plotted to the right of their respective number of clusters (negative numbers plot to the left).} \item{include.othernc}{this indicates whether methods should be included that estimated their number of clusters themselves and gave a result outside the standard range as given by \code{x$minG} and \code{x$maxG}. If not \code{NULL}, this is a list of integer vectors of length 2. The first number is the number of the clustering method (the order is determined by argument \code{x$name}), the second number is the number of clusters for those methods that estimate the number of clusters themselves and estimated a number outside the standard range. Normally what will be used here, if not \code{NULL}, is the output parameter \code{cm$othernc} of \code{\link{clusterbenchstats}}, see also \code{\link{cluster.magazine}}.} \item{statistics}{vector of character strings specifying the validation statistics that will be included in the output (unless you want to restrict the output for some reason, the default should be fine).} \item{aggregate}{logical. If \code{TRUE}, an aggregate validation statistic will be computed as the weighted mean of the involved statistic. This requires \code{weights} to be set. In order for this to make sense, values of the validation statistics should be comparable, which is achieved by standardisation in \code{\link{clusterbenchstats}}. Accordingly, \code{x} should be the \code{qstat} or \code{sstat}-component of the \code{\link{clusterbenchstats}}-output rather than the \code{stat}-component.} \item{weights}{vector of numericals. Weights for computation of the aggregate statistic in case that \code{aggregate=TRUE}.
The order of clustering methods corresponding to the weight vector is given by \code{x$name}.} \item{digits}{minimal number of significant digits, passed on to \code{\link{print.table}}.} \item{...}{no effect.} } \details{ Whereas \code{print.valstat}, at least with \code{aggregate=TRUE}, makes more sense for the \code{qstat} or \code{sstat}-component of the \code{\link{clusterbenchstats}}-output rather than the \code{stat}-component, \code{plot.valstat} should be run with the \code{stat}-component if \code{simobject} is specified, because the simulated cluster validity statistics are unstandardised and need to be compared with unstandardised values on the dataset of interest. \code{print.valstat} will print all values for all validation indexes, and the aggregated index (in case of \code{aggregate=TRUE} and set \code{weights}) will be printed last. } \value{ \code{print.valstat} returns the results table as an invisible object. } \references{ Hennig, C. (2019) Cluster validation by measurement of clustering characteristics relevant to the user. In C. H. Skiadas (ed.) \emph{Data Analysis and Applications 1: Clustering and Regression, Modeling-estimating, Forecasting and Data Mining, Volume 2}, Wiley, New York 1-24, \url{https://arxiv.org/abs/1703.09282} Akhanli, S. and Hennig, C. (2020) Calibrating and aggregating cluster validity indexes for context-adapted comparison of clusterings. \emph{Statistics and Computing}, 30, 1523-1544, \url{https://link.springer.com/article/10.1007/s11222-020-09958-2}, \url{https://arxiv.org/abs/2002.01822} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{clusterbenchstats}}, \code{\link{valstat.object}}, \code{\link{cluster.magazine}} } \examples{ set.seed(20000) options(digits=3) face <- rFace(10,dMoNo=2,dNoEy=0,p=2) clustermethod=c("kmeansCBI","hclustCBI","hclustCBI") clustermethodpars <- list() clustermethodpars[[2]] <- clustermethodpars[[3]] <- list() clustermethodpars[[2]]$method <- "ward.D2" clustermethodpars[[3]]$method <- "single" methodname <- c("kmeans","ward","single") cbs <- clusterbenchstats(face,G=2:3,clustermethod=clustermethod, methodname=methodname,distmethod=rep(FALSE,3), clustermethodpars=clustermethodpars,nnruns=2,kmruns=2,fnruns=2,avenruns=2) plot(cbs$stat,cbs$sim) plot(cbs$stat,cbs$sim,statistic="dindex") plot(cbs$stat,cbs$sim,statistic="avewithin") pcbs <- print(cbs$sstat,aggregate=TRUE,weights=c(1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0)) # Some of the values are "NaN" because due to the low number of runs of # the stupid clustering methods there is no variation. If this happens # in a real application, nnruns etc. should be chosen higher than 2. # Also useallg=TRUE in clusterbenchstats may help. # # Finding the best aggregated value: mpcbs <- as.matrix(pcbs[[17]][,-1]) which(mpcbs==max(mpcbs),arr.ind=TRUE) # row=1 refers to the first clustering method kmeansCBI, # col=2 refers to the second number of clusters, which is 3 in g=2:3. } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/diptest.multi.Rd0000644000176200001440000000356413470376554014703 0ustar liggesusers\name{diptest.multi} \alias{diptest.multi} %- Also NEED an `\alias' for EACH other topic documented here.
\title{Diptest for discriminant coordinate projection} \description{ Diptest (Hartigan and Hartigan, 1985, see \code{\link[diptest]{dip}}) for data projected in discriminant coordinate separating optimally two class means (see \code{discrcoord}) as suggested by Tantrum, Murua and Stuetzle (2003). } \usage{ diptest.multi(xdata,class,pvalue="uniform",M=100) } %- maybe also `usage' for other objects documented here. \arguments{ \item{xdata}{matrix. Potentially multidimensional dataset.} \item{class}{vector of integers giving class numbers for observations.} \item{pvalue}{\code{"uniform"} or \code{"tantrum"}. Defines whether the p-value is computed from a uniform null model as suggested in Hartigan and Hartigan (1985, using \code{\link[diptest]{dip.test}}) or as suggested in Tantrum et al. (2003, using \code{dipp.tantrum}).} \item{M}{integer. Number of artificial datasets generated in order to estimate the p-value if \code{pvalue="tantrum"}.} } \value{ The resulting p-value. } \references{ J. A. Hartigan and P. M. Hartigan (1985) The Dip Test of Unimodality, \emph{Annals of Statistics}, 13, 70-84. Tantrum, J., Murua, A. and Stuetzle, W. (2003) Assessment and Pruning of Hierarchical Model Based Clustering, \emph{Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining}, Washington, D.C., 197-205. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \examples{ require(diptest) x <- cbind(runif(100),runif(100)) partition <- 1+(x[,1]<0.5) d1 <- diptest.multi(x,partition) d2 <- diptest.multi(x,partition,pvalue="tantrum",M=10) } \keyword{cluster}% at least one, from doc/KEYWORDS \keyword{multivariate} fpc/man/stupidknn.Rd0000644000176200001440000000353113731163442014111 0ustar liggesusers\name{stupidknn} \alias{stupidknn} %- Also NEED an `\alias' for EACH other topic documented here. \title{Stupid nearest neighbour random clustering} \description{ Picks k random starting points from given dataset to initialise k clusters. Then, one by one, the point not yet assigned to any cluster that is closest to an already assigned point is assigned to that cluster, until all points are assigned. This is called stupid nearest neighbour clustering in Hennig (2019). } \usage{ stupidknn(d,k) } %- maybe also `usage' for other objects documented here. \arguments{ \item{d}{\code{dist}-object or dissimilarity matrix.} \item{k}{integer. Number of clusters.} } % \details{ % } \value{ The clustering vector (values 1 to \code{k}, length number of objects behind \code{d}), } \references{ Hennig, C. (2019) Cluster validation by measurement of clustering characteristics relevant to the user. In C. H. Skiadas (ed.) \emph{Data Analysis and Applications 1: Clustering and Regression, Modeling-estimating, Forecasting and Data Mining, Volume 2}, Wiley, New York 1-24, \url{https://arxiv.org/abs/1703.09282} Akhanli, S. and Hennig, C. (2020) Calibrating and aggregating cluster validity indexes for context-adapted comparison of clusterings. 
\emph{Statistics and Computing}, 30, 1523-1544, \url{https://link.springer.com/article/10.1007/s11222-020-09958-2}, \url{https://arxiv.org/abs/2002.01822} } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{stupidkcentroids}}, \code{\link{stupidkfn}}, \code{\link{stupidkaven}} } \examples{ set.seed(20000) options(digits=3) face <- rFace(200,dMoNo=2,dNoEy=0,p=2) stupidknn(dist(face),3) } \keyword{multivariate}% at least one, from doc/KEYWORDS \keyword{cluster}% __ONLY ONE__ keyword per line fpc/man/ncoord.Rd0000644000176200001440000000723213467541512013364 0ustar liggesusers\name{ncoord} \alias{ncoord} %- Also NEED an `\alias' for EACH other topic documented here. \title{Neighborhood based discriminant coordinates} \description{ Neighborhood based discriminant coordinates as defined in Hastie and Tibshirani (1996) and a robustified version as defined in Hennig (2004). The principle is to maximize the projection of a between classes covariance matrix, which is defined by averaging the between classes covariance matrices in the neighborhoods of all points. } \usage{ ncoord(xd, clvecd, nn=50, weighted=FALSE, sphere="mcd", orderall=TRUE, countmode=1000, ...) } %- maybe also `usage' for other objects documented here. \arguments{ \item{xd}{the data matrix; a numerical object which can be coerced to a matrix.} \item{clvecd}{integer vector of class numbers; length must equal \code{nrow(xd)}.} \item{nn}{integer. Number of points which belong to the neighborhood of each point (including the point itself).} \item{weighted}{logical. \code{FALSE} corresponds to the original method of Hastie and Tibshirani (1996). If \code{TRUE}, the between classes covariance matrices B are weighted by w/trace B, where w is some weight depending on the sizes of the classes in the neighborhood. Division by trace B reduces the effect of outliers. \code{TRUE} corresponds to WNC as defined in Hennig (2004).} \item{sphere}{a covariance matrix or one of "mve", "mcd", "classical", "none". The matrix used for sphering the data. "mcd" and "mve" are robust covariance matrices as implemented in \code{\link{cov.rob}}. "classical" refers to the classical covariance matrix. "none" means no sphering and use of the raw data.} \item{orderall}{logical. By default, the neighborhoods are computed by ordering all points each time. If \code{FALSE}, the neighborhoods are computed by selecting \code{nn} times the nearest point from the remaining points, which may be faster sometimes.} \item{countmode}{optional positive integer. Every \code{countmode} algorithm runs, \code{ncoord} shows a message.} \item{...}{no effect} } % \details{ % } \value{ List with the following components \item{ev}{eigenvalues in descending order.} \item{units}{columns are coordinates of projection basis vectors. New points \code{x} can be projected onto the projection basis vectors by \code{x \%*\% units}} \item{proj}{projections of \code{xd} onto \code{units}.} } \references{ Hastie, T. and Tibshirani, R. (1996). Discriminant adaptive nearest neighbor classification. \emph{IEEE Transactions on Pattern Analysis and Machine Intelligence} 18, 607-616. Hennig, C. (2004) Asymmetric linear dimension reduction for classification. Journal of Computational and Graphical Statistics 13, 930-945. Hennig, C. (2005) A method for visual cluster validation. In: Weihs, C. and Gaul, W. (eds.): Classification - The Ubiquitous Challenge. Springer, Heidelberg 2005, 153-160.
} \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \seealso{ \code{\link{plotcluster}} for straightforward discriminant plots. \code{\link{discrproj}} for alternatives. \code{\link{rFace}} for generation of the example data used below. } \examples{ set.seed(4634) face <- rFace(600,dMoNo=2,dNoEy=0) grface <- as.integer(attr(face,"grouping")) ncf <- ncoord(face,grface) plot(ncf$proj,col=grface) ncf2 <- ncoord(face,grface,weighted=TRUE) plot(ncf2$proj,col=grface) # ...done in one step by function plotcluster. } \keyword{multivariate}% at least one, from doc/KEYWORDS \keyword{classif}% __ONLY ONE__ keyword per line fpc/man/dipp.tantrum.Rd0000644000176200001440000000344013467541512014522 0ustar liggesusers\name{dipp.tantrum} \alias{dipp.tantrum} %- Also NEED an `\alias' for EACH other topic documented here. \title{Simulates p-value for dip test} \description{ Simulates p-value for dip test (see \code{\link[diptest]{dip}}) in the way suggested by Tantrum, Murua and Stuetzle (2003) from the closest unimodal distribution determined by kernel density estimation with bandwidth chosen so that the density just becomes unimodal. This is less conservative (and in fact sometimes anti-conservative) than the values from \code{\link[diptest]{dip.test}}. } \usage{ dipp.tantrum(xdata,d,M=100) } %- maybe also `usage' for other objects documented here. \arguments{ \item{xdata}{numeric vector. One-dimensional dataset.} \item{d}{numeric. Value of dip statistic.} \item{M}{integer. Number of artificial datasets generated in order to estimate the p-value.} } \value{ List with components \item{p.value}{approximated p-value.} \item{bw}{borderline unimodality bandwidth in \code{\link{density}} with default settings.} \item{dv}{vector of dip statistic values from simulated artificial data.} } \references{ J. A. Hartigan and P. M. Hartigan (1985) The Dip Test of Unimodality, \emph{Annals of Statistics}, 13, 70-84. Tantrum, J., Murua, A. and Stuetzle, W. (2003) Assessment and Pruning of Hierarchical Model Based Clustering, \emph{Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining}, Washington, D.C., 197-205. } \author{Christian Hennig \email{christian.hennig@unibo.it} \url{https://www.unibo.it/sitoweb/christian.hennig/en/} } \examples{ # not run, requires package diptest # x <- runif(100) # d <- dip(x) # dt <- dipp.tantrum(x,d,M=10) } \keyword{cluster}% at least one, from doc/KEYWORDS % \keyword{multivariate} fpc/DESCRIPTION0000644000176200001440000000330414537033142012532 0ustar liggesusersPackage: fpc Title: Flexible Procedures for Clustering Version: 2.2-11 Date: 2023-12-14 Author: Christian Hennig Depends: R (>= 2.0) Imports: MASS, cluster, mclust, flexmix, prabclus, class, diptest, robustbase, kernlab, grDevices, graphics, methods, stats, utils, parallel Suggests: tclust, pdfCluster, mvtnorm Description: Various methods for clustering and cluster validation. Fixed point clustering. Linear regression clustering. Clustering by merging Gaussian mixture components. Symmetric and asymmetric discriminant projections for visualisation of the separation of groupings. Cluster validation statistics for distance based clustering including corrected Rand index. Standardisation of cluster validation statistics by random clusterings and comparison between many clustering methods and numbers of clusters based on this. Cluster-wise cluster stability assessment.
Methods for estimation of the number of clusters: Calinski-Harabasz, Tibshirani and Walther's prediction strength, Fang and Wang's bootstrap stability. Gaussian/multinomial mixture fitting for mixed continuous/categorical variables. Variable-wise statistics for cluster interpretation. DBSCAN clustering. Interface functions for many clustering methods implemented in R, including estimating the number of clusters with kmeans, pam and clara. Modality diagnosis for Gaussian mixtures. For an overview see package?fpc. Maintainer: Christian Hennig License: GPL URL: https://www.unibo.it/sitoweb/christian.hennig/en/ NeedsCompilation: no Packaged: 2023-12-14 21:49:22 UTC; chrish Repository: CRAN Date/Publication: 2023-12-15 11:20:02 UTC fpc/tests/0000755000176200001440000000000014536674142012200 5ustar liggesusersfpc/tests/fpctests_notallin.Rout.save0000644000176200001440000015553314536673303017555 0ustar liggesusers R Under development (unstable) (2023-12-08 r85664) -- "Unsuffered Consequences" Copyright (C) 2023 The R Foundation for Statistical Computing Platform: x86_64-pc-linux-gnu R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > # This tests a few things that are not run in the examples. > > library(fpc) > library(MASS) > library(diptest) > library(mclust) Package 'mclust' version 6.0.1 Type 'citation("mclust")' for citing this R package in publications. > options(digits=3) > > set.seed(4634) > face <- rFace(300,dMoNo=2,dNoEy=0,p=3) > grface <- as.integer(attr(face,"grouping")) > # discrproj(face,grface, clnum=1, method="bc")$units > discrproj(face,grface, clnum=1, method="anc")$units [,1] [,2] [,3] [1,] -1.3912 -0.3093 0.1093 [2,] 0.6211 -0.2233 0.0164 [3,] -0.0313 0.0749 -0.8074 > discrproj(face,grface, clnum=1, method="awc")$units [,1] [,2] [,3] [1,] 0.215 -0.3389 -0.51886 [2,] -0.370 0.0144 -0.00893 [3,] 0.111 0.7914 -0.23574 > > > pamk(face,krange=1:5,criterion="ch",usepam=FALSE,critout=TRUE) 1 clusters 0 2 clusters 1321 3 clusters 963 4 clusters 833 5 clusters 934 $pamobject Call: clara(x = sdata, k = k) Medoids: [,1] [,2] [,3] [1,] 0.119 3.53 1.49 [2,] 1.742 17.02 1.12 Objective function: 2.44 Clustering vector: int [1:300] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ... 
Cluster sizes: 202 98 Best sample: [1] 5 11 18 21 28 40 50 61 62 65 79 82 83 86 93 94 105 119 130 [20] 160 172 180 182 194 195 202 206 208 217 223 230 231 239 248 250 256 259 261 [39] 264 268 271 274 277 299 Available components: [1] "sample" "medoids" "i.med" "clustering" "objective" [6] "clusinfo" "diss" "call" "silinfo" "data" $nc [1] 2 $crit [1] 0 1321 963 833 934 > > set.seed(20000) > face50 <- rFace(50,dMoNo=2,dNoEy=0,p=2) > pamk(dist(face50),krange=1:5,criterion="asw",critout=TRUE) 1 clusters 0 2 clusters 0.742 3 clusters 0.748 4 clusters 0.581 5 clusters 0.544 $pamobject Medoids: ID [1,] "22" "22" [2,] "34" "34" [3,] "49" "49" Clustering vector: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 Objective function: build swap 2.14 2.09 Available components: [1] "medoids" "id.med" "clustering" "objective" "isolation" [6] "clusinfo" "silinfo" "diss" "call" $nc [1] 3 $crit [1] 0.000 0.742 0.748 0.581 0.544 > > x <- c(1,2,3,6,6,7,8,120) > ff8 <- fixmahal(x) > summary(ff8) * Mahalanobis Fixed Point Clusters * Often a clear cluster in the data leads to several similar FPCs. The summary shows the representative FPCs of groups of similar FPCs. Method fuzzy was used. Number of representative FPCs: 1 FPCs with less than 4 points were skipped. FPCs with ratio of times found to number of points less than 0.1 were skipped. 0 iteration runs led to 0 skipped clusters. Weight 1 for r^2<= 3.84 weight 0 for r^2> 7.88 Constant ca= 3.84 corresponding to alpha= 0.95 FPC 1 Times found (group members): 9 Ratio to size: 1.29 Mean: [1] 4.71 Covariance matrix: [,1] [1,] 6.2 Number of points (sum of weights): 7 Number of points (rounded weights) in intersection of representative FPCs [,1] [1,] 7 > # ...dataset a bit too small for the defaults... > ff9 <- fixmahal(x, mnc=3, startn=3) > summary(ff9) * Mahalanobis Fixed Point Clusters * Often a clear cluster in the data leads to several similar FPCs. The summary shows the representative FPCs of groups of similar FPCs. Method fuzzy was used. Number of representative FPCs: 3 FPCs with less than 3 points were skipped. FPCs with ratio of times found to number of points less than 0.1 were skipped. 0 iteration runs led to 0 skipped clusters. 
Weight 1 for r^2<= 3.84 weight 0 for r^2> 7.88 Constant ca= 3.84 corresponding to alpha= 0.95 FPC 1 Times found (group members): 4 Ratio to size: 1.33 Mean: [1] 6.33 Covariance matrix: [,1] [1,] 0.222 Number of points (sum of weights): 3 FPC 2 Times found (group members): 3 Ratio to size: 1 Mean: [1] 2 Covariance matrix: [,1] [1,] 0.667 Number of points (sum of weights): 3 FPC 3 Times found (group members): 2 Ratio to size: 0.286 Mean: [1] 4.71 Covariance matrix: [,1] [1,] 6.2 Number of points (sum of weights): 7 Number of points (rounded weights) in intersection of representative FPCs [,1] [,2] [,3] [1,] 3 0 3 [2,] 0 3 3 [3,] 3 3 7 > > set.seed(776655) > v1 <- rnorm(100) > v2 <- rnorm(100) > d1 <- sample(1:5,100,replace=TRUE) > d2 <- sample(1:4,100,replace=TRUE) > ldata <- cbind(v1,v2,d1,d2) > fr <- flexmixedruns(ldata, + continuous=2,discrete=2,simruns=1,initial.cluster=c(rep(1,5),rep(2,45), + rep(3,50)), + control=list(minprior=0.1), + n.cluster=3,allout=FALSE) k= 3 new best fit found in run 1 k= 3 BIC= 1299 > print(fr$optsummary) Call: flexmix(formula = x ~ 1, k = k, cluster = initial.cluster, model = lcmixed(continuous = continuous, discrete = discrete, ppdim = ppdim, diagonal = diagonal), control = control) prior size post>0 ratio Comp.1 0.204 23 61 0.377 Comp.2 0.284 30 71 0.423 Comp.3 0.512 47 77 0.610 'log Lik.' -569 (df=35) AIC: 1208 BIC: 1299 > > dface <- dist(face50) > > > hclusttreeCBI(face50,minlevel=2,method="complete",scaling=TRUE) $result Call: hclust(d = dist(sdata), method = method) Cluster method : complete Distance : euclidean Number of objects: 50 $nc [1] 48 $clusterlist $clusterlist[[1]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] TRUE TRUE $clusterlist[[2]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[3]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[4]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[5]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[6]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE 
FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[7]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE [37] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[8]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[9]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[10]] [1] FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[11]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[12]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[13]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE [37] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[14]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[15]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[16]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE [49] FALSE FALSE $clusterlist[[17]] [1] FALSE FALSE FALSE 
FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE [37] TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[18]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[19]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[20]] [1] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[21]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE [13] TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[22]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE [25] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[23]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE [37] TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[24]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[25]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[26]] [1] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[27]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE 
FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE [37] TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[28]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[29]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[30]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[31]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE [49] FALSE FALSE $clusterlist[[32]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE [37] TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[33]] [1] TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[34]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE [13] TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[35]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE [25] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[36]] [1] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[37]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE [49] FALSE FALSE $clusterlist[[38]] [1] FALSE FALSE 
FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE [37] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[39]] [1] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE [13] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[40]] [1] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE TRUE [13] TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[41]] [1] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE TRUE [13] TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [25] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[42]] [1] TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[43]] [1] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE [13] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE [49] FALSE FALSE $clusterlist[[44]] [1] TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE [37] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[45]] [1] FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [13] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [25] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE [49] FALSE FALSE $clusterlist[[46]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE [49] TRUE TRUE $clusterlist[[47]] [1] TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE [37] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE [49] FALSE FALSE $clusterlist[[48]] [1] FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [13] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [25] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE 
FALSE [49] TRUE TRUE $partition [1] 1 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 [39] 1 1 1 2 2 2 2 2 2 1 2 2 $clustermethod [1] "hclust, full tree" > > disthclusttreeCBI(dface,minlevel=2,method="complete") $result Call: hclust(d = as.dist(dmatrix), method = method) Cluster method : complete Distance : euclidean Number of objects: 50 $nc [1] 48 $clusterlist $clusterlist[[1]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] TRUE TRUE $clusterlist[[2]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[3]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[4]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[5]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE [37] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[6]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE [37] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[7]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[8]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE [37] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[9]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE [13] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[10]] [1] FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE 
FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[11]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[12]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE [49] FALSE FALSE $clusterlist[[13]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[14]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[15]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[16]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[17]] [1] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[18]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[19]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[20]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] 
FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[21]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE [25] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[22]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE [37] FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[23]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE [49] FALSE FALSE $clusterlist[[24]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[25]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE [37] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[26]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE [25] FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[27]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE [13] FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[28]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE [13] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[29]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE [37] TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[30]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE [37] FALSE TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[31]] [1] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE 
FALSE FALSE FALSE TRUE [13] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[32]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE [25] FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[33]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE [49] FALSE FALSE $clusterlist[[34]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE [13] FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[35]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE [37] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[36]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[37]] [1] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE [13] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[38]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE [25] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[39]] [1] TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[40]] [1] FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[41]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE [25] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE 
FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[42]] [1] FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE TRUE TRUE [13] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[43]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE [37] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE [49] FALSE FALSE $clusterlist[[44]] [1] TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE [25] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[45]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE [49] TRUE TRUE $clusterlist[[46]] [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [13] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [25] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[47]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE [49] TRUE TRUE $clusterlist[[48]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE [37] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [49] TRUE TRUE $partition [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 [39] 2 2 2 2 2 2 2 2 2 2 2 2 $clustermethod [1] "hclust, full tree" > > noisemclustCBI(face50,G=1:5,emModelNames="VVV",nnk=2) $result Bayesian Information Criterion (BIC): EII VII EEI VEI EVI VVI EEE VEE EVE VVE EEV VEV EVV VVV 1 -521 -521 -506 -506 -506 -506 -525 -525 -525 -525 -525 -525 -525 -525 2 -498 -501 -501 -477 -464 -466 -505 -481 -467 -470 -467 -470 -470 -473 3 -468 -467 -480 -466 -476 -461 -482 -470 -479 -465 -480 -468 -486 -469 4 -449 -448 -485 -449 -445 -449 -488 -452 -439 -452 -486 -482 -444 -455 5 -456 -452 -456 -454 -444 -458 -460 -458 -448 -462 -451 -485 -456 -469 Top 3 models based on the BIC criterion: EVE,4 EVV,4 EVI,5 -439 -444 -444 $nc [1] 5 $nccl [1] 4 $clusterlist $clusterlist[[1]] [1] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE [13] TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[2]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE [25] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[3]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE [37] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE $clusterlist[[4]] [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE [49] FALSE FALSE $clusterlist[[5]] [1] TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE [13] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE [49] TRUE TRUE $partition [1] 5 5 5 1 5 5 5 5 5 1 1 1 1 5 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 [39] 3 3 3 4 4 4 4 4 5 5 5 5 $nnk [1] 2 $initnoise [1] TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE [49] TRUE TRUE $clustermethod [1] "mclustBIC" > > distnoisemclustCBI(dface,G=5,emModelNames="EEE",nnk=2, + mdsmethod="classical", + mdsdim=2) $result Bayesian Information Criterion (BIC): EII VII EEI VEI EVI VVI EEE VEE EVE VVE EEV VEV EVV VVV 5 -461 NA -496 NA NA NA -500 NA NA NA -496 NA NA NA Top 3 models based on the BIC criterion: EII,5 EEI,5 EEV,5 -461 -496 -496 $nc [1] 6 $nccl [1] 5 $clusterlist $clusterlist[[1]] 1 2 3 4 5 6 7 8 9 10 11 12 13 FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE 14 15 16 17 18 19 20 21 22 23 24 25 26 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE 27 28 29 30 31 32 33 34 35 36 37 38 39 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 40 41 42 43 44 45 46 47 48 49 50 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE $clusterlist[[2]] 1 2 3 4 5 6 7 8 9 10 11 12 13 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 14 15 16 17 18 19 20 21 22 23 24 25 26 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE 27 28 29 30 31 32 33 34 35 36 37 38 39 TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 40 41 42 43 44 45 46 47 48 49 50 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE $clusterlist[[3]] 1 2 3 4 5 6 7 8 9 10 11 12 13 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 14 15 16 17 18 19 20 21 22 23 24 25 26 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 27 28 29 30 31 32 33 34 35 36 37 38 39 FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE 40 41 42 43 44 45 46 47 48 49 50 TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE $clusterlist[[4]] 1 2 3 4 5 6 7 8 9 10 11 12 13 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
FALSE FALSE FALSE FALSE 14 15 16 17 18 19 20 21 22 23 24 25 26 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 27 28 29 30 31 32 33 34 35 36 37 38 39 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 40 41 42 43 44 45 46 47 48 49 50 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE $clusterlist[[5]] 1 2 3 4 5 6 7 8 9 10 11 12 13 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 14 15 16 17 18 19 20 21 22 23 24 25 26 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 27 28 29 30 31 32 33 34 35 36 37 38 39 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 40 41 42 43 44 45 46 47 48 49 50 FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE $clusterlist[[6]] 1 2 3 4 5 6 7 8 9 10 11 12 13 TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE 14 15 16 17 18 19 20 21 22 23 24 25 26 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 27 28 29 30 31 32 33 34 35 36 37 38 39 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 40 41 42 43 44 45 46 47 48 49 50 FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE $partition 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 6 6 6 1 6 6 6 6 6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 5 5 5 5 5 6 6 6 6 $nnk [1] 2 $initnoise [1] TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE [49] TRUE TRUE $clustermethod [1] "mclustBIC" > > mahalCBI(face50,clustercut=0.5) $result Mahalanobis Fixed Point Cluster object 3 representative stable fixed point clusters of totally 7 found fixed point clusters. 
$nc [1] 3 $clusterlist $clusterlist[[1]] [1] 0 0 0 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 [39] 0 0 0 0 0 0 0 0 0 0 0 0 $clusterlist[[2]] [1] 0 0 0 0 0 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 [39] 0 0 0 0 0 0 0 0 0 0 0 0 $clusterlist[[3]] [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 [39] 1 1 1 1 1 1 1 1 0 0 0 0 $partition [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 [39] 1 1 1 1 1 1 1 1 1 1 1 1 $clustermethod [1] "fixmahal" > > set.seed(20000) > face100 <- rFace(100,dMoNo=2,dNoEy=0,p=2) > cbf <- clusterboot(face100,B=2,clustermethod=speccCBI,showplots=TRUE,k=6,seed=50000) boot 1 boot 2 > cbf$nc [1] 6 > cbf$noisemethod [1] FALSE > cbf$bootmethod [1] "boot" > # suppressWarnings(if(require(tclust)) > # print(clusterboot(face100,B=2,clustermethod=tclustCBI,showplots=TRUE,k=5,seed=50000,noisemethod=TRUE))) > > > complete3 <- cutree(hclust(dface),3) > > cluster.stats(dface,complete3,G2=TRUE) $n [1] 50 $cluster.number [1] 3 $cluster.size [1] 32 14 4 $min.cluster.size [1] 4 $noisen [1] 0 $diameter [1] 8.53 4.94 9.00 $average.distance [1] 2.95 2.15 7.05 $median.distance [1] 2.94 1.44 8.32 $separation [1] 8.67 7.46 7.46 $average.toother [1] 15.5 12.9 20.6 $separation.matrix [,1] [,2] [,3] [1,] 0.00 8.67 17.24 [2,] 8.67 0.00 7.46 [3,] 17.24 7.46 0.00 $ave.between.matrix [,1] [,2] [,3] [1,] 0.0 13 24.4 [2,] 13.0 0 12.0 [3,] 24.4 12 0.0 $average.between [1] 15.2 $average.within [1] 3.05 $n.between [1] 632 $n.within [1] 593 $max.diameter [1] 9 $min.separation [1] 7.46 $within.cluster.ss [1] 319 $clus.avg.silwidths 1 2 3 0.764 0.821 0.346 $avg.silwidth [1] 0.746 $g2 [1] 1 $g3 NULL $pearsongamma [1] 0.838 $dunn [1] 0.829 $dunn2 [1] 1.71 $entropy [1] 0.844 $wb.ratio [1] 0.2 $ch [1] 229 $cwidegap [1] 2.90 2.49 8.32 $widestgap [1] 8.32 $sindex [1] 7.78 $corrected.rand NULL $vi NULL > > set.seed(55667788) > > data(crabs) > dc <- crabs[,4:8] > cmo <- mclustBIC(crabs[,4:8],G=9,modelNames="EEE") > # set.seed(12345) > cm <- mclustBIC(crabs[,4:8],G=9,modelNames="EEE", + initialization=list(noise=(1:200)[sample(200,50)])) > > > scm <- summary(cm,crabs[,4:8]) > scmo <- summary(cmo,crabs[,4:8]) > > set.seed(334455) > summary(mergenormals(crabs[,4:8],scm,method="ridge.ratio",by=0.05)) * Merging Gaussian mixture components * Method: ridge.ratio , cutoff value: 0.2 Original number of components: 9 (not including noise which is denoted by clustering=0) Number of clusters after merging: 1 Values at which clusters were merged: [,1] [,2] [1,] 8 0.828 [2,] 7 1.000 [3,] 6 1.000 [4,] 5 0.888 [5,] 4 1.000 [6,] 3 1.000 [7,] 2 0.784 [8,] 1 0.845 Components assigned to clusters: [,1] [1,] 0 [2,] 1 [3,] 1 [4,] 1 [5,] 1 [6,] 1 [7,] 1 [8,] 1 [9,] 1 [10,] 1 > summary(mergenormals(crabs[,4:8],scmo,method="ridge.uni",by=0.05)) * Merging Gaussian mixture components * Method: ridge.uni , cutoff value: 1 Original number of components: 9 Number of clusters after merging: 8 Values at which clusters were merged: [,1] [1,] 8 [2,] 9 Components assigned to clusters: [,1] [1,] 1 [2,] 1 [3,] 2 [4,] 3 [5,] 4 [6,] 5 [7,] 6 [8,] 7 [9,] 8 > # summary(mergenormals(crabs[,4:8],scm,method="diptantrum",by=0.05)) > # summary(mergenormals(crabs[,4:8],scmo,method="dipuni",by=0.05)) > # summary(mergenormals(crabs[,4:8],scm,method="predictive",M=2)) > > set.seed(20000) > x1 <- rnorm(50) > y <- rnorm(100) > x2 <- rnorm(40,mean=20) > x3 <- rnorm(10,mean=25,sd=100) > x0 <- cbind(c(x1,x2,x3),y) > > prediction.strength(x0,M=10,Gmax=4, + 
clustermethod=noisemclustCBI, + classification="qda") Prediction strength Clustering method: mclustBIC Maximum number of clusters: 4 Resampled data sets: 10 Mean pred.str. for numbers of clusters: 1 0.815 0.874 0.591 Cutoff value: 0.8 Largest number of clusters better than cutoff: 3 > > prediction.strength(dist(x0),M=10,Gmax=4, + clustermethod=claraCBI, + classification="centroids") Prediction strength Clustering method: clara/pam Maximum number of clusters: 4 Resampled data sets: 10 Mean pred.str. for numbers of clusters: 1 0 0 0 Cutoff value: 0.8 Largest number of clusters better than cutoff: 1 > > > set.seed(20000) > xdata <- c(rnorm(10,0,1),rnorm(10,8,1)) > clustermethod=c("claraCBI","dbscanCBI") > > clustermethodpars <- list() > clustermethodpars[[1]] <- clustermethodpars[[2]] <- list() > clustermethodpars[[2]]$eps <- 2 > clustermethodpars[[2]]$MinPts <- 2 > cbs <- clusterbenchstats(xdata,G=3,clustermethod=clustermethod, + distmethod=rep(TRUE,2),ncinput=c(TRUE,FALSE),scaling=FALSE, + clustermethodpars=clustermethodpars,nnruns=2,kmruns=2,fnruns=1,avenruns=1,useallg=TRUE) [1] "claraCBI" [1] "dbscanCBI" [1] "Computation of validity statistics" comsum 1 comsum 2 [1] "Simulation" 3 clusters; nn run 1 3 clusters; nn run 2 3 clusters; fn run 1 3 clusters; aven run 1 3 clusters; km run 1 3 clusters; km run 2 [1] "Simulation quantile re-standardisation" [1] "Simulation sd re-standardisation" > > print(cbs$sstat,aggregate=TRUE,weights=c(1,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1),include.othernc=cbs$cm$othernc) avewithin method 2 3 1 claraCBI NA 0.67 2 dbscanCBI 0.14 NA mnnd method 2 3 1 claraCBI NA 0.37 2 dbscanCBI 0.39 NA cvnnd method 2 3 1 claraCBI NA 0.32 2 dbscanCBI 0.3 NA maxdiameter method 2 3 1 claraCBI NA 0.66 2 dbscanCBI 0.56 NA widestgap method 2 3 1 claraCBI NA 0.59 2 dbscanCBI 0.59 NA sindex method 2 3 1 claraCBI NA 0.1 2 dbscanCBI 5.4 NA minsep method 2 3 1 claraCBI NA -1.6 2 dbscanCBI 17 NA asw method 2 3 1 claraCBI NA 0.55 2 dbscanCBI 1.7 NA dindex method 2 3 1 claraCBI NA 1.4 2 dbscanCBI -1.3 NA denscut method 2 3 1 claraCBI NA -2.3 2 dbscanCBI 0.38 NA highdgap method 2 3 1 claraCBI NA 0.73 2 dbscanCBI 0.55 NA pearsongamma method 2 3 1 claraCBI NA 0.54 2 dbscanCBI 1.9 NA withinss method 2 3 1 claraCBI NA 0.63 2 dbscanCBI 0.42 NA entropy method 2 3 1 claraCBI NA 0.85 2 dbscanCBI 1.6 NA pamc method 2 3 1 claraCBI NA 0.83 2 dbscanCBI 0.052 NA dmode method 2 3 1 claraCBI NA 1.2 2 dbscanCBI -0.87 NA aggregate method 2 3 1 claraCBI NA 0.15 2 dbscanCBI 1.2 NA > print(cbs$qstat,aggregate=TRUE,weights=c(1,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1),include.othernc=cbs$cm$othernc) avewithin method 2 3 1 claraCBI NA 0.88 2 dbscanCBI 0.38 NA mnnd method 2 3 1 claraCBI NA 0.62 2 dbscanCBI 0.75 NA cvnnd method 2 3 1 claraCBI NA 0.75 2 dbscanCBI 0.75 NA maxdiameter method 2 3 1 claraCBI NA 0.88 2 dbscanCBI 0.38 NA widestgap method 2 3 1 claraCBI NA 0.38 2 dbscanCBI 0.38 NA sindex method 2 3 1 claraCBI NA 0.38 2 dbscanCBI 1 NA minsep method 2 3 1 claraCBI NA 0.12 2 dbscanCBI 1 NA asw method 2 3 1 claraCBI NA 0.62 2 dbscanCBI 1 NA dindex method 2 3 1 claraCBI NA 0.88 2 dbscanCBI 0.12 NA denscut method 2 3 1 claraCBI NA 0.12 2 dbscanCBI 0.25 NA highdgap method 2 3 1 claraCBI NA 0.88 2 dbscanCBI 0.38 NA pearsongamma method 2 3 1 claraCBI NA 0.5 2 dbscanCBI 1 NA withinss method 2 3 1 claraCBI NA 0.88 2 dbscanCBI 0.38 NA entropy method 2 3 1 claraCBI NA 0.75 2 dbscanCBI 1 NA pamc method 2 3 1 claraCBI NA 0.88 2 dbscanCBI 0.38 NA dmode method 2 3 1 claraCBI NA 0.88 2 dbscanCBI 0.19 NA aggregate method 2 3 1 claraCBI NA 0.6 
2 dbscanCBI 0.53 NA > > > > proc.time() user system elapsed 6.340 0.055 6.430 fpc/tests/Examples/0000755000176200001440000000000014536674142013756 5ustar liggesusersfpc/tests/Examples/fpc-Ex.Rout.save0000644000176200001440000032301714536673274016723 0ustar liggesusers R Under development (unstable) (2023-12-08 r85664) -- "Unsuffered Consequences" Copyright (C) 2023 The R Foundation for Statistical Computing Platform: x86_64-pc-linux-gnu R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. Natural language support but running in an English locale R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > pkgname <- "fpc" > source(file.path(R.home("share"), "R", "examples-header.R")) > options(warn = 1) > library('fpc') > > base::assign(".oldSearch", base::search(), pos = 'CheckExEnv') > base::assign(".old_wd", base::getwd(), pos = 'CheckExEnv') > cleanEx() > nameEx("adcoord") > ### * adcoord > > flush(stderr()); flush(stdout()) > > ### Name: adcoord > ### Title: Asymmetric discriminant coordinates > ### Aliases: adcoord > ### Keywords: multivariate classif > > ### ** Examples > > set.seed(4634) > face <- rFace(600,dMoNo=2,dNoEy=0) > grface <- as.integer(attr(face,"grouping")) > adcf <- adcoord(face,grface==2) > adcf2 <- adcoord(face,grface==4) > plot(adcf$proj,col=1+(grface==2)) > plot(adcf2$proj,col=1+(grface==4)) > # ...done in one step by function plotcluster. > > > > cleanEx() > nameEx("ancoord") > ### * ancoord > > flush(stderr()); flush(stdout()) > > ### Name: ancoord > ### Title: Asymmetric neighborhood based discriminant coordinates > ### Aliases: ancoord > ### Keywords: multivariate classif > > ### ** Examples > > set.seed(4634) > face <- rFace(600,dMoNo=2,dNoEy=0) > grface <- as.integer(attr(face,"grouping")) > ancf2 <- ancoord(face,grface==4) > plot(ancf2$proj,col=1+(grface==4)) > # ...done in one step by function plotcluster. > > > > cleanEx() > nameEx("awcoord") > ### * awcoord > > flush(stderr()); flush(stdout()) > > ### Name: awcoord > ### Title: Asymmetric weighted discriminant coordinates > ### Aliases: awcoord > ### Keywords: multivariate classif > > ### ** Examples > > set.seed(4634) > face <- rFace(600,dMoNo=2,dNoEy=0) > grface <- as.integer(attr(face,"grouping")) > awcf <- awcoord(face,grface==1) > # awcf2 <- ancoord(face,grface==1, method="mcd") > plot(awcf$proj,col=1+(grface==1)) > # plot(awcf2$proj,col=1+(grface==1)) > # ...done in one step by function plotcluster. 
> > > > cleanEx() > nameEx("batcoord") > ### * batcoord > > flush(stderr()); flush(stdout()) > > ### Name: batcoord > ### Title: Bhattacharyya discriminant projection > ### Aliases: batcoord batvarcoord > ### Keywords: multivariate classif > > ### ** Examples > > set.seed(4634) > face <- rFace(600,dMoNo=2,dNoEy=0) > grface <- as.integer(attr(face,"grouping")) > bcf2 <- batcoord(face,grface==2) > plot(bcf2$proj,col=1+(grface==2)) > bcfv2 <- batcoord(face,grface==2,dom="variance") > plot(bcfv2$proj,col=1+(grface==2)) > bcfvv2 <- batvarcoord(face,grface==2) > plot(bcfvv2$proj,col=1+(grface==2)) > > > > cleanEx() > nameEx("bhattacharyya.dist") > ### * bhattacharyya.dist > > flush(stderr()); flush(stdout()) > > ### Name: bhattacharyya.dist > ### Title: Bhattacharyya distance between Gaussian distributions > ### Aliases: bhattacharyya.dist > ### Keywords: multivariate > > ### ** Examples > > round(bhattacharyya.dist(c(1,1),c(2,5),diag(2),diag(2)),digits=2) modulus 2.12 > > > > cleanEx() > nameEx("bhattacharyya.matrix") > ### * bhattacharyya.matrix > > flush(stderr()); flush(stdout()) > > ### Name: bhattacharyya.matrix > ### Title: Matrix of pairwise Bhattacharyya distances > ### Aliases: bhattacharyya.matrix > ### Keywords: cluster multivariate > > ### ** Examples > > muarray <-cbind(c(0,0),c(0,0.1),c(10,10)) > sigmaarray <- array(c(diag(2),diag(2),diag(2)),dim=c(2,2,3)) > bhattacharyya.matrix(muarray,sigmaarray,ipairs=list(c(1,2),c(2,3))) [,1] [,2] [,3] [1,] NA 9.987508e-01 NA [2,] 0.9987508 NA 1.78102e-11 [3,] NA 1.781020e-11 NA > > > > > cleanEx() > nameEx("calinhara") > ### * calinhara > > flush(stderr()); flush(stdout()) > > ### Name: calinhara > ### Title: Calinski-Harabasz index > ### Aliases: calinhara > ### Keywords: cluster > > ### ** Examples > > set.seed(98765) > iriss <- iris[sample(150,20),-5] > km <- kmeans(iriss,3) > round(calinhara(iriss,km$cluster),digits=2) [1] 91.75 > > > > cleanEx() > nameEx("can") > ### * can > > flush(stderr()); flush(stdout()) > > ### Name: can > ### Title: Generation of the tuning constant for regression fixed point > ### clusters > ### Aliases: can > ### Keywords: arith > > ### ** Examples > > can(429,3) [1] 8.806634 > > > > cleanEx() > nameEx("cat2bin") > ### * cat2bin > > flush(stderr()); flush(stdout()) > > ### Name: cat2bin > ### Title: Recode nominal variables to binary variables > ### Aliases: cat2bin > ### Keywords: manip > > ### ** Examples > > set.seed(776655) > v1 <- rnorm(20) > v2 <- rnorm(20) > d1 <- sample(1:5,20,replace=TRUE) > d2 <- sample(1:4,20,replace=TRUE) > ldata <-cbind(v1,v2,d1,d2) > lc <- cat2bin(ldata,categorical=3:4) > > > > cleanEx() > nameEx("cdbw") > ### * cdbw > > flush(stderr()); flush(stdout()) > > ### Name: cdbw > ### Title: CDbw-index for cluster validation > ### Aliases: cdbw > ### Keywords: cluster > > ### ** Examples > > options(digits=3) > iriss <- as.matrix(iris[c(1:5,51:55,101:105),-5]) > irisc <- as.numeric(iris[c(1:5,51:55,101:105),5]) > cdbw(iriss,irisc) $cdbw [1] 2.35 $cohesion [1] 1.28 $compactness [1] 1.33 $sep [1] 1.38 > > > > cleanEx() > nameEx("cgrestandard") > ### * cgrestandard > > flush(stderr()); flush(stdout()) > > ### Name: cgrestandard > ### Title: Standardise cluster validation statistics by random clustering > ### results > ### Aliases: cgrestandard > ### Keywords: multivariate cluster > > ### ** Examples > > > set.seed(20000) > options(digits=3) > face <- rFace(10,dMoNo=2,dNoEy=0,p=2) > dif <- dist(face) > clusum <- list() > clusum[[2]] <- list() > cl12 <- kmeansCBI(face,2) > cl13 <- 
kmeansCBI(face,3) > cl22 <- claraCBI(face,2) > cl23 <- claraCBI(face,2) > ccl12 <- clustatsum(dif,cl12$partition) > ccl13 <- clustatsum(dif,cl13$partition) > ccl22 <- clustatsum(dif,cl22$partition) > ccl23 <- clustatsum(dif,cl23$partition) > clusum[[1]] <- list() > clusum[[1]][[2]] <- ccl12 > clusum[[1]][[3]] <- ccl13 > clusum[[2]][[2]] <- ccl22 > clusum[[2]][[3]] <- ccl23 > clusum$maxG <- 3 > clusum$minG <- 2 > clusum$method <- c("kmeansCBI","claraCBI") > clusum$name <- c("kmeansCBI","claraCBI") > clusim <- randomclustersim(dist(face),G=2:3,nnruns=1,kmruns=1, + fnruns=1,avenruns=1,monitor=FALSE) > cgr <- cgrestandard(clusum,clusim,2:3) > cgr2 <- cgrestandard(clusum,clusim,2:3,useallg=TRUE) > cgr3 <- cgrestandard(clusum,clusim,2:3,percentage=TRUE) > print(str(cgr)) List of 6 $ :List of 3 ..$ : NULL ..$ :List of 16 .. ..$ avewithin : num 0.378 .. ..$ mnnd : num 2.46 .. ..$ cvnnd : num -21.3 .. ..$ maxdiameter : num 0.879 .. ..$ widestgap : num -0.799 .. ..$ sindex : num -0.125 .. ..$ minsep : num -0.125 .. ..$ asw : num 0.369 .. ..$ dindex : num 0.5 .. ..$ denscut : num 0.5 .. ..$ highdgap : num -0.576 .. ..$ pearsongamma: num 0.574 .. ..$ withinss : num 0.572 .. ..$ entropy : num 0.294 .. ..$ pamc : num 0.798 .. ..$ dmode : num 0.231 ..$ :List of 16 .. ..$ avewithin : num 0.964 .. ..$ mnnd : num NA .. ..$ cvnnd : num NA .. ..$ maxdiameter : num 0.998 .. ..$ widestgap : num NaN .. ..$ sindex : num 0.85 .. ..$ minsep : num 0.797 .. ..$ asw : num 0.939 .. ..$ dindex : num NaN .. ..$ denscut : num 0.538 .. ..$ highdgap : num NaN .. ..$ pearsongamma: num 1.01 .. ..$ withinss : num 0.882 .. ..$ entropy : num 0.359 .. ..$ pamc : num 0.824 .. ..$ dmode : num NaN $ :List of 3 ..$ : NULL ..$ :List of 16 .. ..$ avewithin : num 0.749 .. ..$ mnnd : num 3.81 .. ..$ cvnnd : num 9.29 .. ..$ maxdiameter : num 0.826 .. ..$ widestgap : num 0.865 .. ..$ sindex : num 0.746 .. ..$ minsep : num 0.746 .. ..$ asw : num 0.743 .. ..$ dindex : num 0.5 .. ..$ denscut : num 0.5 .. ..$ highdgap : num 0.845 .. ..$ pearsongamma: num 0.732 .. ..$ withinss : num 0.744 .. ..$ entropy : num 0.294 .. ..$ pamc : num 0.707 .. ..$ dmode : num 0.586 ..$ :List of 16 .. ..$ avewithin : num -0.782 .. ..$ mnnd : num NA .. ..$ cvnnd : num NA .. ..$ maxdiameter : num -0.861 .. ..$ widestgap : num NaN .. ..$ sindex : num 1.82 .. ..$ minsep : num 1.74 .. ..$ asw : num 1.24 .. ..$ dindex : num NaN .. ..$ denscut : num 0.538 .. ..$ highdgap : num NaN .. ..$ pearsongamma: num 1.52 .. ..$ withinss : num -1.25 .. ..$ entropy : num 0.496 .. ..$ pamc : num -1.42 .. ..$ dmode : num NaN $ maxG : num 3 $ minG : num 2 $ method: chr [1:2] "kmeansCBI" "claraCBI" $ name : chr [1:2] "kmeansCBI" "claraCBI" - attr(*, "class")= chr "valstat" NULL > print(str(cgr2)) List of 6 $ :List of 3 ..$ : NULL ..$ :List of 16 .. ..$ avewithin : num -0.174 .. ..$ mnnd : num 2.46 .. ..$ cvnnd : num -23.2 .. ..$ maxdiameter : num 0.217 .. ..$ widestgap : num -1.54 .. ..$ sindex : num 0.0591 .. ..$ minsep : num 0.305 .. ..$ asw : num 0.452 .. ..$ dindex : num 0.354 .. ..$ denscut : num 0.399 .. ..$ highdgap : num -1.25 .. ..$ pearsongamma: num 0.562 .. ..$ withinss : num 0.0414 .. ..$ entropy : num 0.431 .. ..$ pamc : num -0.0345 .. ..$ dmode : num -0.0485 ..$ :List of 16 .. ..$ avewithin : num 1.03 .. ..$ mnnd : num 5.09 .. ..$ cvnnd : num 10.1 .. ..$ maxdiameter : num 1.19 .. ..$ widestgap : num 0.54 .. ..$ sindex : num 0.429 .. ..$ minsep : num 0.305 .. ..$ asw : num 0.73 .. ..$ dindex : num 0.354 .. ..$ denscut : num 0.399 .. ..$ highdgap : num 0.531 .. 
..$ pearsongamma: num 0.611 .. ..$ withinss : num 0.866 .. ..$ entropy : num 0.26 .. ..$ pamc : num 1.17 .. ..$ dmode : num 0.398 $ :List of 3 ..$ : NULL ..$ :List of 16 .. ..$ avewithin : num 0.233 .. ..$ mnnd : num 3.91 .. ..$ cvnnd : num 10.3 .. ..$ maxdiameter : num 0.165 .. ..$ widestgap : num 0.54 .. ..$ sindex : num 1.17 .. ..$ minsep : num 1.17 .. ..$ asw : num 0.953 .. ..$ dindex : num 0.354 .. ..$ denscut : num 0.399 .. ..$ highdgap : num 0.531 .. ..$ pearsongamma: num 0.787 .. ..$ withinss : num 0.235 .. ..$ entropy : num 0.431 .. ..$ pamc : num -0.111 .. ..$ dmode : num 0.398 ..$ :List of 16 .. ..$ avewithin : num 0.233 .. ..$ mnnd : num 3.91 .. ..$ cvnnd : num 10.3 .. ..$ maxdiameter : num 0.165 .. ..$ widestgap : num 0.54 .. ..$ sindex : num 1.17 .. ..$ minsep : num 1.17 .. ..$ asw : num 0.953 .. ..$ dindex : num 0.354 .. ..$ denscut : num 0.399 .. ..$ highdgap : num 0.531 .. ..$ pearsongamma: num 0.787 .. ..$ withinss : num 0.235 .. ..$ entropy : num 0.431 .. ..$ pamc : num -0.111 .. ..$ dmode : num 0.398 $ maxG : num 3 $ minG : num 2 $ method: chr [1:2] "kmeansCBI" "claraCBI" $ name : chr [1:2] "kmeansCBI" "claraCBI" - attr(*, "class")= chr "valstat" NULL > print(cgr3[[1]][[2]]) $avewithin [1] 0.6 $mnnd [1] 1 $cvnnd [1] 0.2 $maxdiameter [1] 1 $widestgap [1] 0.4 $sindex [1] 0.4 $minsep [1] 0.4 $asw [1] 0.6 $dindex [1] 0.4 $denscut [1] 0.4 $highdgap [1] 0.4 $pearsongamma [1] 0.6 $withinss [1] 0.6 $entropy [1] 0.4 $pamc [1] 1 $dmode [1] 0.4 > > > > cleanEx() > nameEx("classifdist") > ### * classifdist > > flush(stderr()); flush(stdout()) > > ### Name: classifdist > ### Title: Classification of unclustered points > ### Aliases: classifdist classifnp > ### Keywords: cluster multivariate > > ### ** Examples > > set.seed(20000) > x1 <- rnorm(50) > y <- rnorm(100) > x2 <- rnorm(40,mean=20) > x3 <- rnorm(10,mean=25,sd=100) > x <-cbind(c(x1,x2,x3),y) > truec <- c(rep(1,50),rep(2,40),rep(3,10)) > topredict <- c(1,2,51,52,91) > clumin <- truec > clumin[topredict] <- -1 > > classifnp(x,clumin, method="averagedist") [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 3 3 3 3 3 3 3 3 3 > classifnp(x,clumin, method="qda") [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 > classifdist(dist(x),clumin, centroids=c(3,53,93),method="centroid") [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 3 3 3 3 3 3 3 3 3 > classifdist(dist(x),clumin,method="knn") [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 > > > > > cleanEx() > nameEx("clucols") > ### * clucols > > flush(stderr()); flush(stdout()) > > ### Name: clucols > ### Title: Sets of colours and symbols for cluster plotting > ### Aliases: clucols clugrey clusym > ### Keywords: cluster > > ### ** Examples > > set.seed(112233) > require(MASS) Loading required package: MASS > require(flexmix) Loading required package: flexmix Loading required package: lattice > data(Cars93) > Cars934 <- Cars93[,c(3,5,8,10)] > cc <- + 
discrete.recode(Cars934,xvarsorted=FALSE,continuous=c(2,3),discrete=c(1,4)) > fcc <- flexmix(cc$data~1,k=3, + model=lcmixed(continuous=2,discrete=2,ppdim=c(6,3),diagonal=TRUE)) > plot(Cars934[,c(2,3)],col=clucols(3)[fcc@cluster],pch=clusym[fcc@cluster]) > > > > cleanEx() detaching ‘package:flexmix’, ‘package:lattice’, ‘package:MASS’ > nameEx("clujaccard") > ### * clujaccard > > flush(stderr()); flush(stdout()) > > ### Name: clujaccard > ### Title: Jaccard similarity between logical vectors > ### Aliases: clujaccard > ### Keywords: cluster > > ### ** Examples > > c1 <- rep(TRUE,10) > c2 <- c(FALSE,rep(TRUE,9)) > clujaccard(c1,c2) [1] 0.9 > > > > cleanEx() > nameEx("clusexpect") > ### * clusexpect > > flush(stderr()); flush(stdout()) > > ### Name: clusexpect > ### Title: Expected value of the number of times a fixed point cluster is > ### found > ### Aliases: clusexpect > ### Keywords: univar cluster > > ### ** Examples > > round(clusexpect(500,4,150,2000),digits=2) [1] 1.36 > > > > cleanEx() > nameEx("clustatsum") > ### * clustatsum > > flush(stderr()); flush(stdout()) > > ### Name: clustatsum > ### Title: Compute and format cluster validation statistics > ### Aliases: clustatsum > ### Keywords: cluster multivariate > > ### ** Examples > > set.seed(20000) > options(digits=3) > face <- rFace(20,dMoNo=2,dNoEy=0,p=2) > dface <- dist(face) > complete3 <- cutree(hclust(dface),3) > clustatsum(dface,complete3) $avewithin [1] 0.875 $mnnd [1] 0.63 $cvnnd [1] 0.88 $maxdiameter [1] 0.696 $widestgap [1] 0.719 $sindex [1] 0.287 $minsep [1] 0.274 $asw [1] 0.683 $dindex [1] 0.984 $denscut [1] 1 $highdgap [1] 0.904 $pearsongamma [1] 0.895 $withinss [1] 0.916 $entropy [1] 0.908 $pamc [1] 0.919 > > > > > cleanEx() > nameEx("cluster.magazine") > ### * cluster.magazine > > flush(stderr()); flush(stdout()) > > ### Name: cluster.magazine > ### Title: Run many clustering methods on many numbers of clusters > ### Aliases: cluster.magazine > ### Keywords: multivariate cluster > > ### ** Examples > > > set.seed(20000) > options(digits=3) > face <- rFace(10,dMoNo=2,dNoEy=0,p=2) > clustermethod=c("kmeansCBI","hclustCBI","hclustCBI") > # A clustering method can be used more than once, with different > # parameters > clustermethodpars <- list() > clustermethodpars[[2]] <- clustermethodpars[[3]] <- list() > clustermethodpars[[2]]$method <- "complete" > clustermethodpars[[3]]$method <- "average" > cmf <- cluster.magazine(face,G=2:3,clustermethod=clustermethod, + distmethod=rep(FALSE,3),clustermethodpars=clustermethodpars) [1] "kmeansCBI" [1] "hclustCBI" [1] "hclustCBI" > print(str(cmf)) List of 4 $ output :List of 3 ..$ :List of 3 .. ..$ : logi NA .. ..$ :List of 4 .. .. ..$ result :List of 11 .. .. .. ..$ cluster : int [1:10] 2 2 2 2 2 1 1 2 1 1 .. .. .. ..$ centers : num [1:2, 1:2] -0.711 0.474 0.852 -0.568 .. .. .. .. ..- attr(*, "dimnames")=List of 2 .. .. .. .. .. ..$ : chr [1:2] "1" "2" .. .. .. .. .. ..$ : NULL .. .. .. ..$ totss : num 18 .. .. .. ..$ withinss : num [1:2] 3.44 6.35 .. .. .. ..$ tot.withinss: num 9.79 .. .. .. ..$ betweenss : num 8.21 .. .. .. ..$ size : int [1:2] 4 6 .. .. .. ..$ iter : int 1 .. .. .. ..$ ifault : int 0 .. .. .. ..$ crit : num [1:2] 0 6.7 .. .. .. ..$ bestk : int 2 .. .. .. ..- attr(*, "class")= chr "kmeans" .. .. ..$ nc : int 2 .. .. ..$ partition : int [1:10] 2 2 2 2 2 1 1 2 1 1 .. .. ..$ clustermethod: chr "kmeans" .. ..$ :List of 4 .. .. ..$ result :List of 11 .. .. .. ..$ cluster : int [1:10] 1 1 1 1 3 2 2 3 2 2 .. .. .. 
..$ centers : num [1:3, 1:2] 0.0512 -0.7113 1.3201 -1.0511 0.8515 ... .. .. .. .. ..- attr(*, "dimnames")=List of 2 .. .. .. .. .. ..$ : chr [1:3] "1" "2" "3" .. .. .. .. .. ..$ : NULL .. .. .. ..$ totss : num 18 .. .. .. ..$ withinss : num [1:3] 0.828 3.439 0.576 .. .. .. ..$ tot.withinss: num 4.84 .. .. .. ..$ betweenss : num 13.2 .. .. .. ..$ size : int [1:3] 4 4 2 .. .. .. ..$ iter : int 2 .. .. .. ..$ ifault : int 0 .. .. .. ..$ crit : num [1:3] 0 0 9.51 .. .. .. ..$ bestk : int 3 .. .. .. ..- attr(*, "class")= chr "kmeans" .. .. ..$ nc : int 3 .. .. ..$ partition : int [1:10] 1 1 1 1 3 2 2 3 2 2 .. .. ..$ clustermethod: chr "kmeans" ..$ :List of 3 .. ..$ : logi NA .. ..$ :List of 5 .. .. ..$ result :List of 7 .. .. .. ..$ merge : int [1:9, 1:2] -9 -2 -4 -1 -6 4 -8 6 -7 -10 ... .. .. .. ..$ height : num [1:9] 0 0.158 0.226 1.033 1.26 ... .. .. .. ..$ order : int [1:10] 7 1 5 6 4 2 3 8 9 10 .. .. .. ..$ labels : NULL .. .. .. ..$ method : chr "complete" .. .. .. ..$ call : language hclust(d = dist(sdata), method = method) .. .. .. ..$ dist.method: chr "euclidean" .. .. .. ..- attr(*, "class")= chr "hclust" .. .. ..$ noise : logi FALSE .. .. ..$ nc : int 2 .. .. ..$ partition : int [1:10] 1 1 1 1 1 1 2 1 1 1 .. .. ..$ clustermethod: chr "hclust/cutree" .. ..$ :List of 5 .. .. ..$ result :List of 7 .. .. .. ..$ merge : int [1:9, 1:2] -9 -2 -4 -1 -6 4 -8 6 -7 -10 ... .. .. .. ..$ height : num [1:9] 0 0.158 0.226 1.033 1.26 ... .. .. .. ..$ order : int [1:10] 7 1 5 6 4 2 3 8 9 10 .. .. .. ..$ labels : NULL .. .. .. ..$ method : chr "complete" .. .. .. ..$ call : language hclust(d = dist(sdata), method = method) .. .. .. ..$ dist.method: chr "euclidean" .. .. .. ..- attr(*, "class")= chr "hclust" .. .. ..$ noise : logi FALSE .. .. ..$ nc : int 3 .. .. ..$ partition : int [1:10] 1 1 1 1 1 1 2 3 3 3 .. .. ..$ clustermethod: chr "hclust/cutree" ..$ :List of 3 .. ..$ : logi NA .. ..$ :List of 5 .. .. ..$ result :List of 7 .. .. .. ..$ merge : int [1:9, 1:2] -9 -2 -4 -1 -5 -6 1 6 -7 -10 ... .. .. .. ..$ height : num [1:9] 0 0.158 0.195 1.032 1.073 ... .. .. .. ..$ order : int [1:10] 7 6 1 4 2 3 9 10 5 8 .. .. .. ..$ labels : NULL .. .. .. ..$ method : chr "average" .. .. .. ..$ call : language hclust(d = dist(sdata), method = method) .. .. .. ..$ dist.method: chr "euclidean" .. .. .. ..- attr(*, "class")= chr "hclust" .. .. ..$ noise : logi FALSE .. .. ..$ nc : int 2 .. .. ..$ partition : int [1:10] 1 1 1 1 1 1 2 1 1 1 .. .. ..$ clustermethod: chr "hclust/cutree" .. ..$ :List of 5 .. .. ..$ result :List of 7 .. .. .. ..$ merge : int [1:9, 1:2] -9 -2 -4 -1 -5 -6 1 6 -7 -10 ... .. .. .. ..$ height : num [1:9] 0 0.158 0.195 1.032 1.073 ... .. .. .. ..$ order : int [1:10] 7 6 1 4 2 3 9 10 5 8 .. .. .. ..$ labels : NULL .. .. .. ..$ method : chr "average" .. .. .. ..$ call : language hclust(d = dist(sdata), method = method) .. .. .. ..$ dist.method: chr "euclidean" .. .. .. ..- attr(*, "class")= chr "hclust" .. .. ..$ noise : logi FALSE .. .. ..$ nc : int 3 .. .. ..$ partition : int [1:10] 1 1 1 1 2 1 3 2 2 2 .. .. ..$ clustermethod: chr "hclust/cutree" $ clustering:List of 3 ..$ :List of 3 .. ..$ : logi NA .. ..$ : int [1:10] 2 2 2 2 2 1 1 2 1 1 .. ..$ : int [1:10] 1 1 1 1 3 2 2 3 2 2 ..$ :List of 3 .. ..$ : logi NA .. ..$ : int [1:10] 1 1 1 1 1 1 2 1 1 1 .. ..$ : int [1:10] 1 1 1 1 1 1 2 3 3 3 ..$ :List of 3 .. ..$ : logi NA .. ..$ : int [1:10] 1 1 1 1 1 1 2 1 1 1 .. ..$ : int [1:10] 1 1 1 1 2 1 3 2 2 2 $ noise :List of 3 ..$ :List of 3 .. ..$ : logi NA .. ..$ : logi FALSE .. 
..$ : logi FALSE ..$ :List of 3 .. ..$ : logi NA .. ..$ : logi FALSE .. ..$ : logi FALSE ..$ :List of 3 .. ..$ : logi NA .. ..$ : logi FALSE .. ..$ : logi FALSE $ othernc : list() NULL > > > > > cleanEx() > nameEx("cluster.stats") > ### * cluster.stats > > flush(stderr()); flush(stdout()) > > ### Name: cluster.stats > ### Title: Cluster validation statistics > ### Aliases: cluster.stats > ### Keywords: cluster multivariate > > ### ** Examples > > set.seed(20000) > options(digits=3) > face <- rFace(200,dMoNo=2,dNoEy=0,p=2) > dface <- dist(face) > complete3 <- cutree(hclust(dface),3) > cluster.stats(dface,complete3, + alt.clustering=as.integer(attr(face,"grouping"))) $n [1] 200 $cluster.number [1] 3 $cluster.size [1] 136 60 4 $min.cluster.size [1] 4 $noisen [1] 0 $diameter [1] 10.80 5.76 9.00 $average.distance [1] 3.03 2.21 7.05 $median.distance [1] 2.84 1.48 8.32 $separation [1] 5.87 5.87 7.22 $average.toother [1] 13.8 13.0 20.8 $separation.matrix [,1] [,2] [,3] [1,] 0.00 5.87 14.98 [2,] 5.87 0.00 7.22 [3,] 14.98 7.22 0.00 $ave.between.matrix [,1] [,2] [,3] [1,] 0.0 13.1 24.5 [2,] 13.1 0.0 12.2 [3,] 24.5 12.2 0.0 $average.between [1] 13.7 $average.within [1] 2.86 $n.between [1] 8944 $n.within [1] 10956 $max.diameter [1] 10.8 $min.separation [1] 5.87 $within.cluster.ss [1] 1198 $clus.avg.silwidths 1 2 3 0.752 0.818 0.355 $avg.silwidth [1] 0.764 $g2 NULL $g3 NULL $pearsongamma [1] 0.883 $dunn [1] 0.544 $dunn2 [1] 1.73 $entropy [1] 0.702 $wb.ratio [1] 0.209 $ch [1] 699 $cwidegap [1] 1.81 1.21 8.32 $widestgap [1] 8.32 $sindex [1] 6.19 $corrected.rand [1] 0.345 $vi [1] 0.97 > > > > > cleanEx() > nameEx("cluster.varstats") > ### * cluster.varstats > > flush(stderr()); flush(stdout()) > > ### Name: cluster.varstats > ### Title: Variablewise statistics for clusters > ### Aliases: cluster.varstats print.varwisetables > ### Keywords: cluster > > ### ** Examples > > set.seed(112233) > options(digits=3) > require(MASS) Loading required package: MASS > require(flexmix) Loading required package: flexmix Loading required package: lattice > data(Cars93) > Cars934 <- Cars93[,c(3,5,8,10)] > cc <- + discrete.recode(Cars934,xvarsorted=FALSE,continuous=c(2,3),discrete=c(1,4)) > fcc <- flexmix(cc$data~1,k=2, + model=lcmixed(continuous=2,discrete=2,ppdim=c(6,3),diagonal=TRUE)) > cv <- + cluster.varstats(fcc@cluster,Cars934, contdata=Cars934[,c(2,3)], + tablevar=c(1,4),catvar=c(2,3),quantvar=c(2,3),projmethod="awc", + ask=FALSE) Cluster 1 42 out of 93 points. Cluster 1 Type Type In cluster 1 Compact Large Midsize Small Sporty Van FALSE 12 0 2 21 7 9 TRUE 4 11 20 0 7 0 Cluster 1 Price Mean= 26.5 all obs.= 19.5 Standard deviation= 9.83 all obs.= 9.66 0% 25% 50% 75% 100% 13.9 18.9 25.1 32.4 61.9 [1] "All obs.:" 0% 25% 50% 75% 100% 7.4 12.2 17.7 23.3 61.9 Cluster 1 MPG.highway Mean= 26.5 all obs.= 29.1 Standard deviation= 1.86 all obs.= 5.33 0% 25% 50% 75% 100% 22 25 26 28 30 [1] "All obs.:" 0% 25% 50% 75% 100% 20 26 28 31 50 Cluster 1 DriveTrain DriveTrain In cluster 1 4WD Front Rear FALSE 9 42 0 TRUE 1 25 16 Cluster 2 51 out of 93 points. 
Cluster 2 Type Type In cluster 2 Compact Large Midsize Small Sporty Van FALSE 4 11 20 0 7 0 TRUE 12 0 2 21 7 9 Cluster 2 Price Mean= 13.7 all obs.= 19.5 Standard deviation= 4.09 all obs.= 9.66 0% 25% 50% 75% 100% 7.4 10.2 13.3 16.6 22.7 [1] "All obs.:" 0% 25% 50% 75% 100% 7.4 12.2 17.7 23.3 61.9 Cluster 2 MPG.highway Mean= 31.2 all obs.= 29.1 Standard deviation= 6.26 all obs.= 5.33 0% 25% 50% 75% 100% 20.0 28.5 31.0 34.0 50.0 [1] "All obs.:" 0% 25% 50% 75% 100% 20 26 28 31 50 Cluster 2 DriveTrain DriveTrain In cluster 2 4WD Front Rear FALSE 1 25 16 TRUE 9 42 0 > print(cv) Type Cluster Compact Large Midsize Small Sporty Van 1 4 11 20 0 7 0 2 12 0 2 21 7 9 Sum 16 11 22 21 14 9 Categorised Price Cluster 1 2 3 4 5 6 7 8 9 10 1 0 0 0 4 3 5 3 8 9 10 2 10 9 9 5 7 4 6 1 0 0 Sum 10 9 9 9 10 9 9 9 9 10 Categorised MPG.highway Cluster 1 2 3 4 5 6 7 8 9 10 1 2 11 10 4 9 6 0 0 0 0 2 8 1 1 2 1 9 7 8 6 8 Sum 10 12 11 6 10 15 7 8 6 8 DriveTrain Cluster 4WD Front Rear 1 1 25 16 2 9 42 0 Sum 10 67 16 > > > > cleanEx() detaching ‘package:flexmix’, ‘package:lattice’, ‘package:MASS’ > nameEx("clusterbenchstats") > ### * clusterbenchstats > > flush(stderr()); flush(stdout()) > > ### Name: clusterbenchstats > ### Title: Run and validate many clusterings > ### Aliases: clusterbenchstats print.clusterbenchstats > ### Keywords: multivariate cluster > > ### ** Examples > > > set.seed(20000) > options(digits=3) > face <- rFace(10,dMoNo=2,dNoEy=0,p=2) > clustermethod=c("kmeansCBI","hclustCBI") > # A clustering method can be used more than once, with different > # parameters > clustermethodpars <- list() > clustermethodpars[[2]] <- list() > clustermethodpars[[2]]$method <- "average" > # Last element of clustermethodpars needs to have an entry! > methodname <- c("kmeans","average") > cbs <- clusterbenchstats(face,G=2:3,clustermethod=clustermethod, + methodname=methodname,distmethod=rep(FALSE,2), + clustermethodpars=clustermethodpars,nnruns=1,kmruns=1,fnruns=1,avenruns=1) [1] "kmeansCBI" [1] "hclustCBI" [1] "Computation of validity statistics" comsum 1 comsum 2 [1] "Simulation" 2 clusters; nn run 1 2 clusters; fn run 1 2 clusters; aven run 1 2 clusters; km run 1 3 clusters; nn run 1 3 clusters; fn run 1 3 clusters; aven run 1 3 clusters; km run 1 [1] "Simulation quantile re-standardisation" [1] "Simulation sd re-standardisation" > print(cbs) Output object of clusterbenchstats. Clustering methods: kmeans average Cluster validation statistics: avewithin mnnd cvnnd maxdiameter widestgap sindex minsep asw dindex denscut highdgap pearsongamma withinss entropy pamc dmode Numbers of clusters minimum: 2 maximum: 3 Output components are cm, stat, sim, qstat, sstat.stat, qstat, and sstat are valstat-objects.Use plot.valstat and print.valstat on these to get more information. 
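> # A hedged aside (not run): the valstat objects in cbs can be plotted in the
> # same way as cbsboot$stat is plotted at the end of this example, e.g.
> # plot(cbs$stat, cbs$sim, statistic="asw")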
> print(cbs$qstat,aggregate=TRUE,weights=c(1,0,0,0,0,1,0,1,0,1,0,1,0,0,1,1)) avewithin method 2 3 1 kmeans 0.43 0.57 2 average 0.14 0.14 mnnd method 2 3 1 kmeans 0.29 0.71 2 average 0.86 0.14 cvnnd method 2 3 1 kmeans 0.14 0.29 2 average 0.43 0.14 maxdiameter method 2 3 1 kmeans 0.43 0.14 2 average 0.14 0.57 widestgap method 2 3 1 kmeans 0.14 0.29 2 average 0.43 0.14 sindex method 2 3 1 kmeans 0.29 0.57 2 average 0.57 0.29 minsep method 2 3 1 kmeans 0.29 0.43 2 average 0.57 0.43 asw method 2 3 1 kmeans 0.43 0.57 2 average 0.14 0.14 dindex method 2 3 1 kmeans 0.43 0.14 2 average 0.14 0.14 denscut method 2 3 1 kmeans 0.29 0.43 2 average 0.29 0.43 highdgap method 2 3 1 kmeans 0.43 0.29 2 average 0.14 0.14 pearsongamma method 2 3 1 kmeans 0.43 0.57 2 average 0.14 0.29 withinss method 2 3 1 kmeans 0.43 0.57 2 average 0.14 0.14 entropy method 2 3 1 kmeans 0.43 0.43 2 average 0.14 0.29 pamc method 2 3 1 kmeans 0.43 0.57 2 average 0.14 0.29 dmode method 2 3 1 kmeans 0.43 0.18 2 average 0.14 0.14 aggregate method 2 3 1 kmeans 0.39 0.49 2 average 0.22 0.24 > # The weights are weights for the validation statistics ordered as in > # cbs$qstat$statistics for computation of an aggregated index, see > # ?print.valstat. > > # Now using bootstrap stability assessment as in Akhanli and Hennig (2020): > bootclassif <- c("centroid","averagedist") > cbsboot <- clusterbenchstats(face,G=2:3,clustermethod=clustermethod, + methodname=methodname,distmethod=rep(FALSE,2), + clustermethodpars=clustermethodpars, + useboot=TRUE,bootclassif=bootclassif,bootmethod="nselectboot", + bootruns=2,nnruns=1,kmruns=1,fnruns=1,avenruns=1,useallg=TRUE) [1] "kmeansCBI" [1] "hclustCBI" [1] "Computation of validity statistics" comsum 1 comsum 2 [1] "Simulation" 2 clusters; nn run 1 2 clusters; fn run 1 2 clusters; aven run 1 2 clusters; km run 1 3 clusters; nn run 1 3 clusters; fn run 1 3 clusters; aven run 1 3 clusters; km run 1 [1] "Simulation quantile re-standardisation" [1] "Simulation sd re-standardisation" > print(cbsboot) Output object of clusterbenchstats. Clustering methods: kmeans average Cluster validation statistics: avewithin mnnd cvnnd maxdiameter widestgap sindex minsep asw dindex denscut highdgap pearsongamma withinss entropy pamc boot dmode Numbers of clusters minimum: 2 maximum: 3 Output components are cm, stat, sim, qstat, sstat.stat, qstat, and sstat are valstat-objects.Use plot.valstat and print.valstat on these to get more information. 
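> # Note that with useboot=TRUE the statistic "boot" from nselectboot is added to
> # the list printed above, so the weights vectors in the (not run) calls below
> # have 17 entries instead of the 16 used for cbs$qstat.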
> ## Not run: > ##D # Index A1 in Akhanli and Hennig (2020) (need these weights choices): > ##D print(cbsboot$sstat,aggregate=TRUE,weights=c(1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0)) > ##D # Index A2 in Akhanli and Hennig (2020) (need these weights choices): > ##D print(cbsboot$sstat,aggregate=TRUE,weights=c(0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0)) > ## End(Not run) > > # Results from nselectboot: > plot(cbsboot$stat,cbsboot$sim,statistic="boot") > > > > cleanEx() > nameEx("clusterboot") > ### * clusterboot > > flush(stderr()); flush(stdout()) > > ### Name: clusterboot > ### Title: Clusterwise cluster stability assessment by resampling > ### Aliases: clusterboot print.clboot plot.clboot > ### Keywords: cluster multivariate > > ### ** Examples > > options(digits=3) > set.seed(20000) > face <- rFace(50,dMoNo=2,dNoEy=0,p=2) > cf1 <- clusterboot(face,B=3,bootmethod= + c("boot","noise","jitter"),clustermethod=kmeansCBI, + krange=5,seed=15555) boot 1 boot 2 boot 3 noise 1 noise 2 noise 3 jitter 1 jitter 2 jitter 3 > print(cf1) * Cluster stability assessment * Cluster method: kmeans Full clustering results are given as parameter result of the clusterboot object, which also provides further statistics of the resampling results. Number of resampling runs: 3 Number of clusters found in data: 5 Clusterwise Jaccard bootstrap (omitting multiple points) mean: [1] 0.792 0.743 0.778 0.500 0.398 dissolved: [1] 1 1 1 1 2 recovered: [1] 2 2 2 0 1 Clusterwise Jaccard replacement by noise mean: [1] 0.880 0.874 0.902 0.630 0.120 dissolved: [1] 0 0 0 1 3 recovered: [1] 3 2 3 1 0 Clusterwise Jaccard jittering mean: [1] 0.926 0.929 1.000 0.833 0.407 dissolved: [1] 0 0 0 1 2 recovered: [1] 3 3 3 2 1 > plot(cf1) > cf2 <- clusterboot(dist(face),B=3,bootmethod= + "subset",clustermethod=disthclustCBI, + k=5, cut="number", method="average", showplots=TRUE, seed=15555) subset 1 subset 2 subset 3 > print(cf2) * Cluster stability assessment * Cluster method: hclust Full clustering results are given as parameter result of the clusterboot object, which also provides further statistics of the resampling results. Number of resampling runs: 3 Number of clusters found in data: 5 Clusterwise Jaccard subsetting mean: [1] 0.630 0.889 0.667 0.000 0.667 dissolved: [1] 1 0 1 3 1 recovered: [1] 1 2 2 0 2 > d1 <- c("a","b","a","c") > d2 <- c("a","a","a","b") > dx <- as.data.frame(cbind(d1,d2)) > cpx <- clusterboot(dx,k=2,B=10,clustermethod=claraCBI, + multipleboot=TRUE,usepam=TRUE,datatomatrix=FALSE) boot 1 boot 2 boot 3 boot 4 boot 5 boot 6 boot 7 boot 8 boot 9 boot 10 > print(cpx) * Cluster stability assessment * Cluster method: clara/pam Full clustering results are given as parameter result of the clusterboot object, which also provides further statistics of the resampling results. Number of resampling runs: 10 Number of clusters found in data: 2 Clusterwise Jaccard bootstrap mean: [1] 0.9 0.6 dissolved: [1] 0 4 recovered: [1] 6 6 > > > > cleanEx() > nameEx("cmahal") > ### * cmahal > > flush(stderr()); flush(stdout()) > > ### Name: cmahal > ### Title: Generation of tuning constant for Mahalanobis fixed point > ### clusters. 
> ### Aliases: cmahal > ### Keywords: cluster > > ### ** Examples > > plot(1:100,cmahal(100,3,nmin=5,cmin=qchisq(0.99,3),nc1=90), + xlab="FPC size", ylab="cmahal") > > > > cleanEx() > nameEx("concomp") > ### * concomp > > flush(stderr()); flush(stdout()) > > ### Name: con.comp > ### Title: Connectivity components of an undirected graph > ### Aliases: con.comp > ### Keywords: array cluster > > ### ** Examples > > set.seed(1000) > x <- rnorm(20) > m <- matrix(0,nrow=20,ncol=20) > for(i in 1:20) + for(j in 1:20) + m[i,j] <- abs(x[i]-x[j]) > d <- m<0.2 > cc <- con.comp(d) > max(cc) # number of connectivity components [1] 6 > plot(x,cc) > # The same should be produced by > # cutree(hclust(as.dist(m),method="single"),h=0.2). > > > > cleanEx() > nameEx("confusion") > ### * confusion > > flush(stderr()); flush(stdout()) > > ### Name: confusion > ### Title: Misclassification probabilities in mixtures > ### Aliases: confusion > ### Keywords: cluster multivariate > > ### ** Examples > > set.seed(12345) > m <- rpois(20,lambda=5) > dim(m) <- c(5,4) > pro <- apply(m,2,sum) > pro <- pro/sum(pro) > m <- m/apply(m,1,sum) > round(confusion(m,pro,1,2),digits=2) [1] 0.7 > > > > cleanEx() > nameEx("cov.wml") > ### * cov.wml > > flush(stderr()); flush(stdout()) > > ### Name: cov.wml > ### Title: Weighted Covariance Matrices (Maximum Likelihood) > ### Aliases: cov.wml > ### Keywords: multivariate > > ### ** Examples > > x <- c(1,2,3,4,5,6,7,8,9,10) > y <- c(1,2,3,8,7,6,5,8,9,10) > cov.wml(cbind(x,y),wt=c(0,0,0,1,1,1,1,1,0,0)) $cov x y x 2.0 -0.40 y -0.4 1.36 $center x y 6.0 6.8 $n.obs [1] 10 $wt [1] 0.0 0.0 0.0 0.2 0.2 0.2 0.2 0.2 0.0 0.0 > cov.wt(cbind(x,y),wt=c(0,0,0,1,1,1,1,1,0,0)) $cov x y x 2.5 -0.5 y -0.5 1.7 $center x y 6.0 6.8 $n.obs [1] 10 $wt [1] 0.0 0.0 0.0 0.2 0.2 0.2 0.2 0.2 0.0 0.0 > > > > cleanEx() > nameEx("cqcluster.stats") > ### * cqcluster.stats > > flush(stderr()); flush(stdout()) > > ### Name: cqcluster.stats > ### Title: Cluster validation statistics (version for use with > ### clusterbenchstats > ### Aliases: cqcluster.stats summary.cquality print.summary.cquality > ### Keywords: cluster multivariate > > ### ** Examples > > set.seed(20000) > options(digits=3) > face <- rFace(200,dMoNo=2,dNoEy=0,p=2) > dface <- dist(face) > complete3 <- cutree(hclust(dface),3) > cqcluster.stats(dface,complete3, + alt.clustering=as.integer(attr(face,"grouping"))) $n [1] 200 $cluster.number [1] 3 $cluster.size [1] 136 60 4 $min.cluster.size [1] 4 $noisen [1] 0 $diameter [1] 10.80 5.76 9.00 $average.distance [1] 3.03 2.21 7.05 $median.distance [1] 2.84 1.48 8.32 $separation [1] 5.87 5.87 7.22 $average.toother [1] 13.8 13.0 20.8 $separation.matrix [,1] [,2] [,3] [1,] 0.00 5.87 14.98 [2,] 5.87 0.00 7.22 [3,] 14.98 7.22 0.00 $ave.between.matrix [,1] [,2] [,3] [1,] 0.0 13.1 24.5 [2,] 13.1 0.0 12.2 [3,] 24.5 12.2 0.0 $avebetween [1] 0.428 $avewithin [1] 0.0893 $n.between [1] 8944 $n.within [1] 10956 $maxdiameter [1] 0.336 $minsep [1] 0.183 $withinss [1] 0.124 $clus.avg.silwidths 1 2 3 0.752 0.818 0.355 $asw [1] 0.764 $g2 NULL $g3 NULL $pearsongamma [1] 0.942 $dunn [1] 0.544 $dunn2 [1] 1.73 $entropy [1] 0.639 $wb.ratio [1] 0.209 $ch [1] 699 $cwidegap [1] 1.81 1.21 8.32 $widestgap [1] 0.259 $corrected.rand [1] 0.345 $vi [1] 0.907 $sindex [1] 0.232 $svec [1] 8.55 8.81 7.98 7.82 7.90 8.50 7.30 8.99 7.96 8.78 7.29 8.60 5.87 8.69 6.13 [16] 5.90 5.98 6.07 5.87 6.04 7.22 $psep [1] 11.19 13.98 8.55 13.20 9.06 16.34 16.54 8.81 10.85 10.76 16.39 13.27 [13] 16.24 9.28 16.10 10.32 16.54 9.50 15.07 16.28 7.98 7.82 14.54 15.59 
[25] 10.30 16.50 10.43 11.52 15.06 14.21 10.35 7.90 15.48 8.50 15.84 7.30 [37] 15.20 16.41 13.82 13.92 13.24 13.44 13.28 13.14 13.31 13.02 13.23 13.45 [49] 13.40 13.64 13.59 13.26 13.00 13.50 13.11 13.48 13.40 13.08 13.18 13.69 [61] 13.17 13.66 13.21 12.81 13.26 13.16 13.57 13.34 13.47 13.41 13.69 13.15 [73] 13.63 12.99 13.29 13.69 13.48 13.37 13.29 13.15 13.13 13.36 13.66 12.89 [85] 13.57 13.45 13.69 11.91 13.15 13.30 13.23 13.27 13.80 13.46 13.48 13.47 [97] 13.08 13.48 10.14 11.32 10.07 11.18 9.08 11.08 11.00 9.74 8.99 9.97 [109] 9.00 7.96 9.29 11.34 9.62 9.64 8.78 7.29 10.45 8.60 9.55 5.87 [121] 11.09 11.22 11.14 10.31 11.28 11.44 8.69 10.74 10.40 11.03 9.59 6.30 [133] 10.72 11.00 9.41 9.75 11.32 6.62 7.01 6.55 6.95 6.53 6.88 6.69 [145] 6.67 6.77 6.63 6.47 7.01 6.72 7.08 7.04 6.23 6.67 6.15 6.31 [157] 7.12 7.10 6.69 7.35 6.13 6.15 5.90 6.62 6.73 6.52 6.37 6.66 [169] 7.02 6.75 6.90 6.62 6.48 7.02 7.08 6.86 6.28 6.43 5.98 6.14 [181] 7.44 7.43 6.67 6.07 6.40 6.19 7.22 7.19 7.22 6.59 5.87 6.56 [193] 7.26 6.57 6.04 6.57 7.22 7.63 14.08 14.08 $stan [1] 32.1 $nnk [1] 2 $mnnd [1] 0.0533 $pamc [1] 0.0624 $pamcentroids [1] 58 138 200 $dindex [1] 0.0238 $denscut [1] 0 $highdgap [1] 0.0149 $npenalty [1] 0 0 0 $dpenalty [1] 0.0991 0.0141 0.0000 $withindensp [1] 1.41 1.99 1.31 1.89 1.72 4.82 5.58 1.94 1.99 1.97 5.78 1.89 [13] 3.26 1.36 2.89 3.23 4.96 1.41 3.23 3.94 3.10 3.10 1.86 2.17 [25] 3.19 5.73 1.55 1.07 3.27 1.00 3.24 3.20 1.47 2.20 1.85 1.60 [37] 3.10 5.85 1.78 9.35 23.85 28.50 26.48 9.94 12.53 5.39 23.69 28.80 [49] 27.65 25.24 25.86 14.12 14.61 16.27 4.95 27.05 27.70 18.54 12.35 21.58 [61] 7.53 21.89 21.41 4.12 25.77 17.50 27.21 26.93 28.50 26.16 19.59 15.00 [73] 24.67 4.87 21.68 23.44 28.26 26.71 16.37 10.33 20.66 25.89 23.12 5.53 [85] 26.05 16.05 17.50 1.00 20.91 26.39 19.53 26.42 12.42 24.00 23.77 17.19 [97] 14.77 28.71 3.70 8.63 5.83 7.25 5.20 7.92 9.09 7.53 4.98 6.53 [109] 4.12 1.30 6.59 8.09 7.58 6.16 4.16 1.10 5.18 3.57 7.38 1.00 [121] 9.38 9.08 7.10 4.74 7.77 6.60 4.10 3.14 4.27 8.93 7.12 1.00 [133] 6.41 8.07 5.54 7.48 7.51 14.53 13.64 11.49 14.52 13.47 14.42 15.70 [145] 14.74 14.97 15.34 10.01 4.16 15.79 12.36 12.01 6.42 3.52 3.73 7.03 [157] 9.81 11.82 7.03 5.49 4.85 4.47 2.95 12.47 13.23 13.89 4.41 14.65 [169] 8.97 3.77 12.11 5.00 12.14 5.11 12.34 5.84 2.08 4.83 4.17 1.85 [181] 1.51 3.13 3.80 4.75 4.56 4.96 3.44 2.68 1.67 3.11 3.70 3.31 [193] 3.44 4.06 2.58 3.73 1.00 1.00 2.00 2.00 $densoc [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $pdistto $pdistto[[1]] [1] NA 48 69 48 48 42 49 68 98 67 50 50 51 90 41 47 89 63 [19] 63 66 76 73 85 56 70 82 78 70 94 73 60 62 97 66 79 52 [37] 79 78 65 92 65 81 71 83 59 82 91 72 80 80 45 96 61 74 [55] 87 44 46 46 55 12 61 39 2 23 19 29 37 24 15 13 17 7 [73] 26 11 11 6 20 35 12 84 88 28 9 31 16 25 14 8 34 21 [91] 32 22 88 126 112 112 100 122 121 105 130 104 123 102 121 105 133 117 [109] 124 124 108 136 106 113 113 131 119 131 108 101 111 109 115 127 135 103 [127] 134 118 110 116 123 1 10 27 18 5 $pdistto[[2]] [1] NA 150 144 150 146 145 147 166 142 173 140 164 147 138 143 141 139 158 143 [20] 171 139 175 151 157 138 148 156 153 169 174 176 161 140 162 155 167 172 170 [39] 
170 159 132 180 179 191 184 186 178 185 186 194 194 192 190 186 183 187 193 [58] 187 188 182 $pdistto[[3]] [1] NA 199 199 199 $pclosetomode $pclosetomode[[1]] [1] 48 69 77 42 98 49 68 90 67 50 76 51 83 41 47 89 63 75 [19] 66 97 73 85 56 70 82 78 57 94 95 60 62 71 53 79 52 59 [37] 86 65 92 43 81 58 93 87 61 91 72 80 44 45 96 54 74 64 [55] 40 46 55 84 12 4 39 2 23 19 29 37 24 15 13 17 7 26 [73] 11 38 6 20 35 33 30 88 28 9 31 16 25 14 8 34 21 32 [91] 22 36 126 112 100 125 122 121 105 130 104 123 102 137 134 133 117 124 [109] 129 108 136 106 113 119 131 114 135 111 101 99 109 115 127 118 103 107 [127] 128 110 116 120 1 10 27 18 5 3 $pclosetomode[[2]] [1] 150 144 147 146 145 168 166 142 173 140 164 165 138 143 141 139 158 152 171 [20] 169 175 151 157 160 148 156 153 161 174 176 159 163 162 155 167 172 170 154 [39] 149 132 180 179 191 184 186 178 185 196 194 183 192 190 177 195 187 193 182 [58] 188 181 189 $pclosetomode[[3]] [1] 199 200 197 198 $distto $distto[[1]] [1] NA -0.010308 -0.008199 -0.010265 -0.002805 -0.029572 -0.024898 [8] -0.018713 -0.052255 -0.068266 -0.062747 0.021523 -0.095210 -0.088147 [15] -0.005542 -0.096747 0.017529 0.009342 -0.135760 -0.094820 0.042933 [22] 0.047641 0.034916 -0.030965 -0.009467 0.028707 0.034230 -0.075005 [29] -0.008034 -0.107555 0.010907 -0.079852 -0.005589 -0.039372 -0.078150 [36] -0.061560 -0.010983 -0.032677 0.022470 0.002038 -0.177657 -0.073619 [43] -0.248917 -0.195095 -0.167149 -0.220591 -0.157537 -0.162211 -0.013300 [50] 0.076384 0.161927 -0.032004 -0.092537 -0.025939 -0.283240 -0.158031 [57] -0.015482 0.004635 -0.106143 0.000000 -0.199966 0.007360 -0.004461 [64] 0.047700 0.001219 -0.005897 -0.032149 0.024814 0.012786 0.059112 [71] 0.021739 0.004992 0.001735 0.002452 -0.033090 -0.030655 -0.072734 [78] -0.013223 -0.030961 -0.157221 0.002537 0.031912 0.043207 -0.000288 [85] -0.001281 -0.063523 0.020193 0.009040 0.031200 0.003498 -0.003548 [92] -0.051979 0.194308 0.051962 0.018810 -0.011259 0.015496 0.010479 [99] -0.010186 -0.005647 -0.034853 -0.028384 0.005062 0.008929 -0.045612 [106] -0.093128 -0.042588 -0.015108 -0.016329 0.062131 0.032728 0.001954 [113] 0.001515 -0.006843 -0.015786 -0.033391 -0.063940 -0.018370 -0.024519 [120] -0.073963 -0.085706 0.001138 -0.001867 -0.018489 -0.011797 -0.007750 [127] -0.171206 -0.079010 -0.006788 -0.003495 -0.197571 0.019128 -0.014412 [134] -0.005023 0.010807 -0.014106 $distto[[2]] [1] NA -3.06e-03 -1.24e-02 -2.83e-02 -7.96e-03 -3.20e-03 -5.05e-02 [8] -1.46e-02 -4.60e-02 -2.26e-02 3.38e-02 2.64e-02 -2.83e-02 -3.64e-03 [15] 3.30e-03 -3.05e-02 -6.32e-02 6.65e-03 -8.03e-02 -1.09e-01 -4.52e-02 [22] 9.18e-04 -8.87e-02 -1.50e-01 -1.57e-01 -1.04e-01 -2.11e-02 -5.46e-02 [29] -1.34e-01 2.51e-02 4.14e-02 -6.58e-02 -2.44e-01 -2.56e-02 2.35e-02 [36] 2.05e-02 -4.26e-02 -8.61e-03 1.35e-02 -2.09e-01 2.94e-02 8.06e-02 [43] -1.62e-02 3.64e-02 7.30e-03 -4.62e-03 -9.46e-03 -2.86e-02 -3.15e-02 [50] -8.85e-03 -2.57e-02 -7.18e-03 -3.57e-02 -8.27e-02 -1.24e-02 -9.93e-05 [57] -1.09e-02 -2.65e-02 -4.08e-02 -5.05e-02 $distto[[3]] [1] NA 0.0000 -0.0347 -0.0347 $percwdens [1] 0.0491 0.0690 0.0455 0.0657 0.0596 0.1675 0.1939 0.0675 0.0692 0.0683 [11] 0.2006 0.0657 0.1131 0.0473 0.1003 0.1121 0.1722 0.0488 0.1123 0.1369 [21] 0.1077 0.1077 0.0646 0.0755 0.1108 0.1989 0.0539 0.0373 0.1135 0.0347 [31] 0.1124 0.1112 0.0509 0.0765 0.0642 0.0557 0.1076 0.2031 0.0617 0.3247 [41] 0.8284 0.9897 0.9195 0.3453 0.4350 0.1873 0.8229 1.0000 0.9602 0.8767 [51] 0.8982 0.4904 0.5075 0.5650 0.1718 0.9394 0.9619 0.6437 0.4288 0.7493 [61] 0.2617 0.7602 0.7436 0.1432 
0.8950 0.6079 0.9449 0.9353 0.9897 0.9085 [71] 0.6804 0.5209 0.8569 0.1691 0.7530 0.8139 0.9815 0.9277 0.5685 0.3586 [81] 0.7174 0.8990 0.8030 0.1919 0.9045 0.5575 0.6079 0.0347 0.7261 0.9166 [91] 0.6784 0.9175 0.4314 0.8334 0.8254 0.5970 0.5131 0.9972 0.1284 0.2998 [101] 0.2024 0.2518 0.1805 0.2751 0.3156 0.2616 0.1728 0.2269 0.1432 0.0450 [111] 0.2289 0.2810 0.2631 0.2139 0.1444 0.0382 0.1799 0.1240 0.2563 0.0347 [121] 0.3258 0.3153 0.2467 0.1648 0.2697 0.2290 0.1425 0.1090 0.1484 0.3099 [131] 0.2473 0.0347 0.2225 0.2802 0.1923 0.2596 0.2607 0.5045 0.4737 0.3991 [141] 0.5041 0.4677 0.5009 0.5451 0.5119 0.5199 0.5328 0.3477 0.1445 0.5482 [151] 0.4294 0.4171 0.2229 0.1224 0.1295 0.2440 0.3406 0.4104 0.2441 0.1906 [161] 0.1683 0.1551 0.1025 0.4330 0.4593 0.4823 0.1530 0.5087 0.3115 0.1310 [171] 0.4206 0.1736 0.4217 0.1776 0.4284 0.2027 0.0722 0.1677 0.1448 0.0641 [181] 0.0523 0.1086 0.1320 0.1650 0.1582 0.1723 0.1196 0.0931 0.0580 0.1079 [191] 0.1286 0.1151 0.1195 0.1408 0.0896 0.1296 0.0347 0.0347 0.0695 0.0695 $percdensoc [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $parsimony [1] 0.3 $cvnnd [1] 0.067 $cvnndc [1] 1.070 0.733 0.000 attr(,"class") [1] "cquality" > > > > > cleanEx() > nameEx("cvnn") > ### * cvnn > > flush(stderr()); flush(stdout()) > > ### Name: cvnn > ### Title: Cluster validation based on nearest neighbours > ### Aliases: cvnn > ### Keywords: cluster > > ### ** Examples > > options(digits=3) > iriss <- as.matrix(iris[c(1:10,51:55,101:105),-5]) > irisc <- as.numeric(iris[c(1:10,51:55,101:105),5]) > print(cvnn(dist(iriss),list(irisc,rep(1:4,5)))) $cvnnindex [1] 0.616 2.000 $sep [1] 0.36 0.92 $comp [1] 0.674 2.998 > > > > cleanEx() > nameEx("cweight") > ### * cweight > > flush(stderr()); flush(stdout()) > > ### Name: cweight > ### Title: Weight function for AWC > ### Aliases: cweight > ### Keywords: arith > > ### ** Examples > > cweight(4,1) [1] 0.25 > > > > cleanEx() > nameEx("dbscan") > ### * dbscan > > flush(stderr()); flush(stdout()) > > ### Name: dbscan > ### Title: DBSCAN density reachability and connectivity clustering > ### Aliases: dbscan print.dbscan plot.dbscan predict.dbscan > ### Keywords: multivariate cluster > > ### ** Examples > > set.seed(665544) > n <- 600 > x <- cbind(runif(10, 0, 10)+rnorm(n, sd=0.2), runif(10, 0, 10)+rnorm(n, + sd=0.2)) > par(bg="grey40") > ds <- dbscan(x, 0.2) > # run with showplot=1 to see how dbscan works. 
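> # eps=0.2 is the reachability distance; MinPts is left at its default of 5,
> # as the printout of ds below confirms.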
> ds dbscan Pts=600 MinPts=5 eps=0.2 0 1 2 3 4 5 6 7 8 9 10 11 border 28 4 4 8 5 3 3 4 3 4 6 4 seed 0 50 53 51 52 51 54 54 54 53 51 1 total 28 54 57 59 57 54 57 58 57 57 57 5 > plot(ds, x) > > x2 <- matrix(0,nrow=4,ncol=2) > x2[1,] <- c(5,2) > x2[2,] <- c(8,3) > x2[3,] <- c(4,4) > x2[4,] <- c(9,9) > predict(ds, x, x2) [1] 4 9 0 0 > > n <- 600 > x <- cbind((1:3)+rnorm(n, sd=0.2), (1:3)+rnorm(n, sd=0.2)) > > # Not run, but results from my machine are 0.105 - 0.068 - 0.255: > # system.time(ds <- dbscan(x, 0.3, countmode=NULL, method="raw"))[3] > # system.time(dsb <- dbscan(x, 0.3, countmode=NULL, method="hybrid"))[3] > # system.time(dsc <- dbscan(dist(x), 0.3, countmode=NULL, > # method="dist"))[3] > > > > graphics::par(get("par.postscript", pos = 'CheckExEnv')) > cleanEx() > nameEx("dipp.tantrum") > ### * dipp.tantrum > > flush(stderr()); flush(stdout()) > > ### Name: dipp.tantrum > ### Title: Simulates p-value for dip test > ### Aliases: dipp.tantrum > ### Keywords: cluster > > ### ** Examples > > # not run, requires package diptest > # x <- runif(100) > # d <- dip(x) > # dt <- dipp.tantrum(x,d,M=10) > > > > cleanEx() > nameEx("diptest.multi") > ### * diptest.multi > > flush(stderr()); flush(stdout()) > > ### Name: diptest.multi > ### Title: Diptest for discriminant coordinate projection > ### Aliases: diptest.multi > ### Keywords: cluster multivariate > > ### ** Examples > > require(diptest) Loading required package: diptest > x <- cbind(runif(100),runif(100)) > partition <- 1+(x[,1]<0.5) > d1 <- diptest.multi(x,partition) > d2 <- diptest.multi(x,partition,pvalue="tantrum",M=10) > > > > cleanEx() detaching ‘package:diptest’ > nameEx("discrcoord") > ### * discrcoord > > flush(stderr()); flush(stdout()) > > ### Name: discrcoord > ### Title: Discriminant coordinates/canonical variates > ### Aliases: discrcoord > ### Keywords: multivariate classif > > ### ** Examples > > set.seed(4634) > face <- rFace(600,dMoNo=2,dNoEy=0) > grface <- as.integer(attr(face,"grouping")) > dcf <- discrcoord(face,grface) > plot(dcf$proj,col=grface) > # ...done in one step by function plotcluster. > > > > cleanEx() > nameEx("discrete.recode") > ### * discrete.recode > > flush(stderr()); flush(stdout()) > > ### Name: discrete.recode > ### Title: Recodes mixed variables dataset > ### Aliases: discrete.recode > ### Keywords: manip > > ### ** Examples > > set.seed(776655) > v1 <- rnorm(20) > v2 <- rnorm(20) > d1 <- sample(c(2,4,6,8),20,replace=TRUE) > d2 <- sample(1:4,20,replace=TRUE) > ldata <- cbind(v1,d1,v2,d2) > lc <- + discrete.recode(ldata,xvarsorted=FALSE,continuous=c(1,3),discrete=c(2,4)) > require(MASS) Loading required package: MASS > data(Cars93) > Cars934 <- Cars93[,c(3,5,8,10)] > cc <- discrete.recode(Cars934,xvarsorted=FALSE,continuous=c(2,3),discrete=c(1,4)) > > > > cleanEx() detaching ‘package:MASS’ > nameEx("discrproj") > ### * discrproj > > flush(stderr()); flush(stdout()) > > ### Name: discrproj > ### Title: Linear dimension reduction for classification > ### Aliases: discrproj > ### Keywords: multivariate classif > > ### ** Examples > > set.seed(4634) > face <- rFace(300,dMoNo=2,dNoEy=0,p=3) > grface <- as.integer(attr(face,"grouping")) > > # The abs in the following is there to unify the output, > # because eigenvectors are defined only up to their sign. > # Statistically it doesn't make sense to compute absolute values. 
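> # The three calls below compare the projection methods "nc", "wnc" and "arc" on
> # the same grouping; see ?discrproj for the other available methods.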
> round(abs(discrproj(face,grface, method="nc")$units),digits=2) [,1] [,2] [,3] [1,] 0.84 1.15 0.02 [2,] 0.34 0.28 0.01 [3,] 0.07 0.04 1.00 > round(abs(discrproj(face,grface, method="wnc")$units),digits=2) [,1] [,2] [,3] [1,] 0.07 1.42 0.04 [2,] 0.44 0.00 0.01 [3,] 0.04 0.00 1.01 > round(abs(discrproj(face,grface, clnum=1, method="arc")$units),digits=2) [,1] [,2] [,3] [1,] 1.30 0.50 0.34 [2,] 0.65 0.05 0.08 [3,] 0.08 0.66 0.47 > > > > cleanEx() > nameEx("distancefactor") > ### * distancefactor > > flush(stderr()); flush(stdout()) > > ### Name: distancefactor > ### Title: Factor for dissimilarity of mixed type data > ### Aliases: distancefactor > ### Keywords: cluster > > ### ** Examples > > set.seed(776655) > d1 <- sample(1:5,20,replace=TRUE) > d2 <- sample(1:4,20,replace=TRUE) > ldata <- cbind(d1,d2) > lc <- cat2bin(ldata,categorical=1)$data > lc[,1:5] <- lc[,1:5]*distancefactor(5,20,type="categorical") > lc[,6] <- lc[,6]*distancefactor(4,20,type="ordinal") > > > > cleanEx() > nameEx("distcritmulti") > ### * distcritmulti > > flush(stderr()); flush(stdout()) > > ### Name: distcritmulti > ### Title: Distance based validity criteria for large data sets > ### Aliases: distcritmulti > ### Keywords: cluster > > ### ** Examples > > set.seed(20000) > options(digits=3) > face <- rFace(50,dMoNo=2,dNoEy=0,p=2) > clustering <- as.integer(attr(face,"grouping")) > distcritmulti(face,clustering,ns=3,seed=100000,criterion="pearsongamma") $crit.overall [1] 0.469 $crit.sub [1] 0.512 0.475 0.424 $crit.sd [1] 0.0443 $subsets $subsets[[1]] [1] 42 12 45 43 29 19 49 4 18 14 28 40 23 5 36 10 $subsets[[2]] [1] 21 47 31 7 6 34 46 9 1 35 22 30 17 37 16 39 $subsets[[3]] [1] 3 8 32 48 50 13 44 38 24 20 27 15 11 2 26 41 25 33 > > > > cleanEx() > nameEx("distrsimilarity") > ### * distrsimilarity > > flush(stderr()); flush(stdout()) > > ### Name: distrsimilarity > ### Title: Similarity of within-cluster distributions to normal and uniform > ### Aliases: distrsimilarity > ### Keywords: multivariate classif cluster > > ### ** Examples > > set.seed(20000) > options(digits=3) > face <- rFace(200,dMoNo=2,dNoEy=0,p=2) > km3 <- kmeans(face,3) > distrsimilarity(face,km3$cluster) $kdnorm [1] 0.194 $kdunif [1] 0.631 $kdnormc [1] 0.213 0.114 0.240 $kdunifc [1] 0.536 0.470 0.891 $xmahal [1] 3.5129 4.7799 6.3495 7.8442 5.0433 5.6147 6.0117 2.8951 3.4951 [10] 3.3331 5.4346 7.4691 4.9576 2.5102 4.4414 2.7912 5.8919 4.1839 [19] 3.7049 5.8845 4.0592 4.3852 3.6482 3.8863 2.8131 5.8701 3.3964 [28] 4.5995 3.4337 6.0519 2.8604 4.2351 5.5251 3.2639 5.5791 5.6654 [37] 3.3749 5.3881 5.1841 0.1951 0.5543 0.2182 0.3423 1.3870 1.0588 [46] 2.2913 0.5611 0.1924 0.2815 0.0719 0.1155 1.2668 1.0872 0.6318 [55] 2.4902 0.1747 0.2197 0.5771 1.5045 0.0861 2.2272 0.1307 0.7233 [64] 3.2335 0.3505 1.0175 0.1018 0.3543 0.1838 0.2504 0.1653 0.8485 [73] 0.0698 3.1368 0.6831 0.0465 0.1856 0.2635 1.0392 1.3378 0.5079 [82] 0.2897 0.0961 1.9485 0.1066 0.8219 0.2738 7.4076 0.7187 0.3910 [91] 0.5886 0.3360 0.3379 0.2821 0.2710 0.6091 1.2201 0.1610 0.2435 [100] 1.3166 0.1236 1.1852 0.6172 1.0057 0.7931 0.0836 0.6910 0.0241 [109] 0.4402 2.3749 0.2522 1.3834 0.1406 0.0374 0.7562 4.3898 0.2228 [118] 1.1830 0.1869 11.0909 0.9184 1.1382 1.1336 0.1277 1.2978 1.5902 [127] 0.9341 0.6224 0.1983 0.8693 0.0827 0.1414 0.4996 0.8275 0.3648 [136] 0.0871 1.3740 0.3438 0.3971 0.6755 0.3984 0.5457 0.3319 0.4708 [145] 0.5582 0.5483 0.4577 0.2446 1.0912 0.4667 0.5092 0.3227 0.2626 [154] 1.4643 1.0198 0.2357 0.6785 0.3872 0.1366 0.7052 0.2668 0.8800 [163] 0.4571 0.6615 0.6485 0.4795 
1.1714 0.5612 0.2161 1.3731 0.2612 [172] 1.1953 0.5939 0.1136 0.5080 0.1114 3.2163 1.4314 1.1823 0.7213 [181] 1.0050 1.9337 2.0463 1.4902 1.1999 1.6324 1.8385 1.2525 2.5465 [190] 2.8546 1.3114 2.7367 1.8662 2.0296 2.2878 1.1876 11.6174 10.4323 [199] 23.1443 23.1443 $xdknn [1] 0.7869 0.5996 0.9758 0.8746 0.5241 0.2066 0.1396 0.4856 0.5316 0.4451 [11] 0.1041 0.8472 0.4166 0.8038 0.5286 0.0330 0.2158 0.9412 0.1836 0.3729 [21] 0.1554 0.1554 0.5434 0.5559 0.0445 0.1041 0.7869 1.2058 0.1624 1.0412 [31] 0.0445 0.0792 0.9497 0.5262 0.5460 0.6086 0.1836 0.1201 0.7696 0.3148 [41] 0.0925 0.0373 0.0686 0.1787 0.1787 0.2043 0.0850 0.0368 0.0634 0.0647 [51] 0.0735 0.0877 0.1696 0.2180 0.3033 0.0847 0.0817 0.1949 0.1573 0.0905 [61] 0.2285 0.0610 0.0810 0.3732 0.0686 0.1248 0.0735 0.0634 0.0352 0.0602 [71] 0.0905 0.2093 0.1010 0.2285 0.1312 0.0931 0.0428 0.0517 0.1569 0.1692 [81] 0.1518 0.0522 0.0858 0.2576 0.0975 0.2108 0.2287 1.2165 0.0850 0.1060 [91] 0.2131 0.0365 0.2383 0.0987 0.1149 0.2074 0.1481 0.0503 0.4643 0.1035 [101] 0.3037 0.1217 0.3723 0.1217 0.1360 0.1248 0.4015 0.2825 0.3543 0.6843 [111] 0.3329 0.0968 0.1248 0.2612 0.2589 1.3065 0.2769 0.3047 0.1963 2.1923 [121] 0.1477 0.1416 0.1031 0.2388 0.1412 0.1595 0.1747 0.4555 0.2769 0.1722 [131] 0.1812 1.2096 0.3164 0.2369 0.2848 0.1386 0.2088 0.2163 0.0919 0.1223 [141] 0.1260 0.0983 0.1396 0.0609 0.1027 0.1113 0.0916 0.2354 0.4479 0.0916 [151] 0.1720 0.1345 0.1015 0.2424 0.2546 0.1630 0.2245 0.1167 0.3315 0.3251 [161] 0.1807 0.3613 0.4538 0.1241 0.1310 0.1222 0.2546 0.1113 0.2096 0.1844 [171] 0.1576 0.2424 0.1223 0.2617 0.1708 0.1843 0.4485 0.2678 0.2942 0.5822 [181] 0.8005 0.2150 0.4959 0.2568 0.2056 0.2958 0.2150 0.4788 0.6237 0.4241 [191] 0.2568 0.4485 0.1713 0.4629 0.5524 0.2678 7.5721 7.8539 8.3217 8.3217 > > > > cleanEx() > nameEx("dridgeline") > ### * dridgeline > > flush(stderr()); flush(stdout()) > > ### Name: dridgeline > ### Title: Density along the ridgeline > ### Aliases: dridgeline > ### Keywords: cluster multivariate > > ### ** Examples > > q <- dridgeline(seq(0,1,0.1),0.5,c(1,1),c(2,5),diag(2),diag(2)) > > > > cleanEx() > nameEx("dudahart2") > ### * dudahart2 > > flush(stderr()); flush(stdout()) > > ### Name: dudahart2 > ### Title: Duda-Hart test for splitting > ### Aliases: dudahart2 > ### Keywords: cluster > > ### ** Examples > > options(digits=2) > set.seed(98765) > iriss <- iris[sample(150,20),-5] > km <- kmeans(iriss,2) > dudahart2(iriss,km$cluster) $p.value [1] 2.2e-05 $dh [1] 0.26 $compare [1] 0.4 $cluster1 [1] FALSE $alpha [1] 0.001 $z [1] 3.1 > > > > cleanEx() > nameEx("extract.mixturepars") > ### * extract.mixturepars > > flush(stderr()); flush(stdout()) > > ### Name: extract.mixturepars > ### Title: Extract parameters for certain components from mclust > ### Aliases: extract.mixturepars > ### Keywords: cluster multivariate > > ### ** Examples > > set.seed(98765) > require(mclust) Loading required package: mclust Package 'mclust' version 6.0.1 Type 'citation("mclust")' for citing this R package in publications. 
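> # Fit a single 5-component "VEV" mixture to a 20-point subsample of iris and
> # extract the parameters of mixture component 2: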
> iriss <- iris[sample(150,20),-5] > irisBIC <- mclustBIC(iriss,G=5,modelNames="VEV") > siris <- summary(irisBIC,iriss) > emp <- extract.mixturepars(siris,2) > emp$pro [1] 1 > round(emp$mean,digits=1) Sepal.Length Sepal.Width Petal.Length Petal.Width 7.6 2.8 6.7 2.2 > emp$variance$modelName [1] "VEV" > round(emp$variance$scale,digits=2) [1] 0.00 0.00 0.02 0.00 0.03 > > > > > cleanEx() detaching ‘package:mclust’ > nameEx("findrep") > ### * findrep > > flush(stderr()); flush(stdout()) > > ### Name: findrep > ### Title: Finding representatives for cluster border > ### Aliases: findrep > ### Keywords: cluster > > ### ** Examples > > options(digits=3) > iriss <- as.matrix(iris[c(1:5,51:55,101:105),-5]) > irisc <- as.numeric(iris[c(1:5,51:55,101:105),5]) > findrep(iriss,colMeans(iriss),irisc,cluster=1,r=2) $repc [1] 3 5 $repx [1] 3 5 $maxr [1] 2 $wvar [1] 10.6 > > > > cleanEx() > nameEx("fixmahal") > ### * fixmahal > > flush(stderr()); flush(stdout()) > > ### Name: fixmahal > ### Title: Mahalanobis Fixed Point Clusters > ### Aliases: fixmahal summary.mfpc plot.mfpc fpclusters.mfpc > ### print.summary.mfpc print.mfpc fpmi > ### Keywords: cluster multivariate robust > > ### ** Examples > > options(digits=2) > set.seed(20000) > face <- rFace(400,dMoNo=2,dNoEy=0, p=3) > # The first example uses grouping information via init.group. > initg <- list() > grface <- as.integer(attr(face,"grouping")) > for (i in 1:5) initg[[i]] <- (grface==i) > ff0 <- fixmahal(face, pointit=FALSE, init.group=initg) > summary(ff0) * Mahalanobis Fixed Point Clusters * Often a clear cluster in the data leads to several similar FPCs. The summary shows the representative FPCs of groups of similar FPCs. Method fuzzy was used. Number of representative FPCs: 5 FPCs with less than 10 points were skipped. 0 iteration runs led to 0 skipped clusters. Weight 1 for r^2<= 7.8 weight 0 for r^2> 13 Constant ca= 7.8 corresponding to alpha= 0.95 FPC 1 Times found (group members): 1 Mean: [1] -2.1 17.1 1.2 Covariance matrix: [,1] [,2] [,3] [1,] 0.1420 -0.0053 -0.041 [2,] -0.0053 0.1518 0.063 [3,] -0.0413 0.0632 1.057 Number of points (sum of weights): 39 FPC 2 Times found (group members): 1 Mean: [1] 2.0 17.0 1.2 Covariance matrix: [,1] [,2] [,3] [1,] 0.1544 0.0038 0.043 [2,] 0.0038 0.1159 0.057 [3,] 0.0427 0.0567 1.296 Number of points (sum of weights): 76 FPC 3 Times found (group members): 1 Mean: [1] -0.0043 3.0912 0.5582 Covariance matrix: [,1] [,2] [,3] [1,] 0.1866 0.0017 0.0175 [2,] 0.0017 0.0438 0.0047 [3,] 0.0175 0.0047 0.2097 Number of points (sum of weights): 96 FPC 4 Times found (group members): 2 Mean: [1] 0.013 3.880 0.615 Covariance matrix: [,1] [,2] [,3] [1,] 0.2064 0.0067 0.017 [2,] 0.0067 4.8776 0.193 [3,] 0.0173 0.1934 0.262 Number of points (sum of weights): 197 FPC 5 Times found (group members): 1 Mean: [1] 0.11 7.58 0.63 Covariance matrix: [,1] [,2] [,3] [1,] 1.830 1.769 0.037 [2,] 1.769 36.251 0.074 [3,] 0.037 0.074 0.270 Number of points (sum of weights): 328 Number of points (rounded weights) in intersection of representative FPCs [,1] [,2] [,3] [,4] [,5] [1,] 39 0 0 0 27 [2,] 0 76 0 0 58 [3,] 0 0 96 96 96 [4,] 0 0 96 197 197 [5,] 27 58 96 197 328 > cff0 <- fpclusters(ff0) > plot(face, col=1+cff0[[1]]) > plot(face, col=1+cff0[[4]]) # Why does this come out as a cluster? > plot(ff0, face, 4) # A bit clearer... 
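> # fpclusters() extracts membership indicator vectors for the representative
> # FPCs, which is why 1+cff0[[i]] can be used as plotting colours above.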
> # Without grouping information, examples need more time: > # ff1 <- fixmahal(face) > # summary(ff1) > # cff1 <- fpclusters(ff1) > # plot(face, col=1+cff1[[1]]) > # plot(face, col=1+cff1[[6]]) # Why does this come out as a cluster? > # plot(ff1, face, 6) # A bit clearer... > # ff2 <- fixmahal(face,method="ml") > # summary(ff2) > # ff3 <- fixmahal(face,method="ml",calpha=0.95,subset=50) > # summary(ff3) > ## ...fast, but lots of clusters. mer=0.3 may be useful here. > # set.seed(3000) > # face2 <- rFace(400,dMoNo=2,dNoEy=0) > # ff5 <- fixmahal(face2) > # summary(ff5) > ## misses right eye of face data; with p=6, > ## initial configurations are too large for 40 point clusters > # ff6 <- fixmahal(face2, startn=30) > # summary(ff6) > # cff6 <- fpclusters(ff6) > # plot(face2, col=1+cff6[[3]]) > # plot(ff6, face2, 3) > # x <- c(1,2,3,6,6,7,8,120) > # ff8 <- fixmahal(x) > # summary(ff8) > # ...dataset a bit too small for the defaults... > # ff9 <- fixmahal(x, mnc=3, startn=3) > # summary(ff9) > > > > cleanEx() > nameEx("fixreg") > ### * fixreg > > flush(stderr()); flush(stdout()) > > ### Name: fixreg > ### Title: Linear Regression Fixed Point Clusters > ### Aliases: fixreg summary.rfpc plot.rfpc fpclusters.rfpc > ### print.summary.rfpc print.rfpc rfpi > ### Keywords: cluster robust regression > > ### ** Examples > > set.seed(190000) > options(digits=3) > data(tonedata) > attach(tonedata) > tonefix <- fixreg(stretchratio,tuned,mtf=1,ir=20) > summary(tonefix) * Fixed Point Clusters * Often a clear cluster in the data leads to several similar FPCs. The summary shows the representative FPCs of groups of similar FPCs, which were found at least 1 times. Constant ca= 10.1 Number of representative FPCs: 2 FPCs with less than 50 points were skipped. 2 iterations led to skipped FPCs. FPC 1 Times found (group members): 14 Ratio to estimated expectation: 1.31 Regression parameters: Intercept X 1.9051 0.0477 Error variance: 0.00282 Number of points: 122 FPC 2 Times found (group members): 5 Ratio to estimated expectation: 2.13 Regression parameters: Intercept X 0.023 0.991 Error variance: 2e-04 Number of points: 74 Number of points in intersection of representative FPCs [,1] [,2] [1,] 122 57 [2,] 57 74 > # This is designed to have a fast example; default setting would be better. > # If you want to see more (and you have a bit more time), > # try out the following: > ## Not run: > ##D set.seed(1000) > ##D tonefix <- fixreg(stretchratio,tuned) > ##D # Default - good for these data > ##D summary(tonefix) > ##D plot(tonefix,stretchratio,tuned,1) > ##D plot(tonefix,stretchratio,tuned,2) > ##D plot(tonefix,stretchratio,tuned,3,bw=FALSE,pch=5) > ##D toneclus <- fpclusters(tonefix,stretchratio,tuned) > ##D plot(stretchratio,tuned,col=1+toneclus[[2]]) > ##D tonefix2 <- fixreg(stretchratio,tuned,distcut=1,mtf=1,countmode=50) > ##D # Every found fixed point cluster is reported, > ##D # no matter how instable it may be. > ##D summary(tonefix2) > ##D tonefix3 <- fixreg(stretchratio,tuned,ca=7) > ##D # ca defaults to 10.07 for these data. 
> ##D summary(tonefix3) > ##D subset <- c(rep(FALSE,5),rep(TRUE,24),rep(FALSE,121)) > ##D tonefix4 <- fixreg(stretchratio,tuned, > ##D mtf=1,ir=0,init.group=list(subset)) > ##D summary(tonefix4) > ## End(Not run) > > > > cleanEx() detaching ‘tonedata’ > nameEx("flexmixedruns") > ### * flexmixedruns > > flush(stderr()); flush(stdout()) > > ### Name: flexmixedruns > ### Title: Fitting mixed Gaussian/multinomial mixtures with flexmix > ### Aliases: flexmixedruns > ### Keywords: cluster > > ### ** Examples > > options(digits=3) > set.seed(776655) > v1 <- rnorm(100) > v2 <- rnorm(100) > d1 <- sample(1:5,100,replace=TRUE) > d2 <- sample(1:4,100,replace=TRUE) > ldata <- cbind(v1,v2,d1,d2) > fr <- flexmixedruns(ldata, + continuous=2,discrete=2,simruns=2,n.cluster=2:3,allout=FALSE) k= 2 new best fit found in run 1 k= 2 new best fit found in run 2 k= 2 BIC= 1258 k= 3 new best fit found in run 1 Nonoptimal or repeated fit found in run 2 k= 3 BIC= 1298 > print(fr$optimalk) [1] 2 > print(fr$optsummary) Call: flexmix(formula = x ~ 1, k = k, cluster = initial.cluster, model = lcmixed(continuous = continuous, discrete = discrete, ppdim = ppdim, diagonal = diagonal), control = control) prior size post>0 ratio Comp.1 0.511 47 77 0.61 Comp.2 0.489 53 93 0.57 'log Lik.' -576 (df=23) AIC: 1198 BIC: 1258 > print(fr$flexout@cluster) [1] 2 1 1 1 2 2 1 1 1 1 2 1 2 1 2 1 1 2 2 2 1 1 2 1 2 1 1 1 1 1 1 2 2 1 1 1 2 [38] 2 2 1 2 2 2 2 2 1 2 2 1 2 2 1 2 2 1 1 2 2 2 1 2 2 1 2 2 2 1 1 2 2 1 1 1 2 [75] 2 2 1 2 2 1 2 1 2 1 2 2 2 1 2 2 2 2 1 1 1 1 2 2 1 1 > print(fr$flexout@components) $Comp.1 $Comp.1[[1]] $center [1] 0.278 -0.171 $cov [,1] [,2] [1,] 1.37 0.00 [2,] 0.00 1.37 $pp $pp[[1]] [1] 0.136 0.260 0.216 0.121 0.268 $pp[[2]] [1] 2.31e-01 6.25e-07 2.02e-01 5.67e-01 $Comp.2 $Comp.2[[1]] $center [1] -0.1221 0.0365 $cov [,1] [,2] [1,] 0.403 0.00 [2,] 0.000 1.01 $pp $pp[[1]] [1] 0.206 0.219 0.204 0.303 0.068 $pp[[2]] [1] 0.351856 0.470507 0.177369 0.000268 > > > > cleanEx() > nameEx("itnumber") > ### * itnumber > > flush(stderr()); flush(stdout()) > > ### Name: itnumber > ### Title: Number of regression fixed point cluster iterations > ### Aliases: itnumber > ### Keywords: univar cluster > > ### ** Examples > > itnumber(500,4,150,2) [1] 6985 > > > > cleanEx() > nameEx("jittervar") > ### * jittervar > > flush(stderr()); flush(stdout()) > > ### Name: jittervar > ### Title: Jitter variables in a data matrix > ### Aliases: jittervar > ### Keywords: manip > > ### ** Examples > > set.seed(776655) > v1 <- rnorm(20) > v2 <- rnorm(20) > d1 <- sample(1:5,20,replace=TRUE) > d2 <- sample(1:4,20,replace=TRUE) > ldata <- cbind(v1,v2,d1,d2) > jv <- jittervar(ldata,jitterv=3:4) > > > > cleanEx() > nameEx("kmeansCBI") > ### * kmeansCBI > > flush(stderr()); flush(stdout()) > > ### Name: kmeansCBI > ### Title: Interface functions for clustering methods > ### Aliases: kmeansCBI hclustCBI hclusttreeCBI disthclustCBI > ### disthclusttreeCBI noisemclustCBI distnoisemclustCBI claraCBI pamkCBI > ### dbscanCBI mahalCBI mergenormCBI speccCBI tclustCBI pdfclustCBI > ### emskewCBI stupidkcentroidsCBI stupidknnCBI stupidkfnCBI > ### stupidkavenCBI > ### Keywords: cluster multivariate > > ### ** Examples > > options(digits=3) > set.seed(20000) > face <- rFace(50,dMoNo=2,dNoEy=0,p=2) > dbs <- dbscanCBI(face,eps=1.5,MinPts=4) > dhc <- disthclustCBI(dist(face),method="average",k=1.5,noisecut=2) > table(dbs$partition,dhc$partition) 1 1 16 2 9 3 9 4 5 5 11 > dm <- mergenormCBI(face,G=10,modelNames="EEE",nnk=2) > dtc <- tclustCBI(face,6,trim=0.1,restr.fact=500) 
Warning in .tkmeans.warn(O, ret) : Clusters with size <= p found - try reducing k. > table(dm$partition,dtc$partition) 1 2 3 4 5 6 7 1 4 0 0 1 0 0 0 2 10 8 7 2 5 0 0 3 0 1 0 0 0 0 0 4 0 0 2 4 0 1 5 > > > > > cleanEx() > nameEx("kmeansruns") > ### * kmeansruns > > flush(stderr()); flush(stdout()) > > ### Name: kmeansruns > ### Title: k-means with estimating k and initialisations > ### Aliases: kmeansruns > ### Keywords: cluster multivariate > > ### ** Examples > > options(digits=3) > set.seed(20000) > face <- rFace(50,dMoNo=2,dNoEy=0,p=2) > pka <- kmeansruns(face,krange=1:5,critout=TRUE,runs=2,criterion="asw") 2 clusters 0.742 3 clusters 0.544 4 clusters 0.59 5 clusters 0.588 > pkc <- kmeansruns(face,krange=1:5,critout=TRUE,runs=2,criterion="ch") 2 clusters 181 3 clusters 108 4 clusters 231 5 clusters 187 > > > > cleanEx() > nameEx("lcmixed") > ### * lcmixed > > flush(stderr()); flush(stdout()) > > ### Name: lcmixed > ### Title: flexmix method for mixed Gaussian/multinomial mixtures > ### Aliases: lcmixed > ### Keywords: cluster > > ### ** Examples > > set.seed(112233) > options(digits=3) > require(MASS) Loading required package: MASS > require(flexmix) Loading required package: flexmix Loading required package: lattice > data(Cars93) > Cars934 <- Cars93[,c(3,5,8,10)] > cc <- + discrete.recode(Cars934,xvarsorted=FALSE,continuous=c(2,3),discrete=c(1,4)) > fcc <- flexmix(cc$data~1,k=2, + model=lcmixed(continuous=2,discrete=2,ppdim=c(6,3),diagonal=TRUE)) > summary(fcc) Call: flexmix(formula = cc$data ~ 1, k = 2, model = lcmixed(continuous = 2, discrete = 2, ppdim = c(6, 3), diagonal = TRUE)) prior size post>0 ratio Comp.1 0.464 42 61 0.689 Comp.2 0.536 51 66 0.773 'log Lik.' -789 (df=23) AIC: 1625 BIC: 1683 > > > > cleanEx() detaching ‘package:flexmix’, ‘package:lattice’, ‘package:MASS’ > nameEx("localshape") > ### * localshape > > flush(stderr()); flush(stdout()) > > ### Name: localshape > ### Title: Local shape matrix > ### Aliases: localshape > ### Keywords: multivariate > > ### ** Examples > > options(digits=3) > data(iris) > localshape(iris[,-5],mscatter="cov") Sepal.Length Sepal.Width Petal.Length Petal.Width Sepal.Length 631889 309217 265161 63257 Sepal.Width 309217 456220 34488 52444 Petal.Length 265161 34488 386520 116040 Petal.Width 63257 52444 116040 104828 > > > > cleanEx() > nameEx("mahalanodisc") > ### * mahalanodisc > > flush(stderr()); flush(stdout()) > > ### Name: mahalanodisc > ### Title: Mahalanobis for AWC > ### Aliases: mahalanodisc > ### Keywords: multivariate > > ### ** Examples > > options(digits=3) > x <- cbind(rnorm(50),rnorm(50)) > mahalanodisc(x,c(0,0),cov(x)) [1] 0.7135 0.4376 1.1082 4.8732 2.3944 5.0023 0.4707 1.8782 0.8586 0.1586 [11] 9.8187 0.2204 1.0252 7.1017 2.3418 0.0401 3.4794 3.7170 1.0145 5.6731 [21] 1.5074 1.3702 0.4105 6.8458 2.1580 0.0936 0.2516 3.1341 0.3337 0.6000 [31] 2.9447 0.0362 1.7431 2.4890 3.0445 0.3544 1.3904 0.1056 1.9390 0.9399 [41] 0.3619 1.6200 2.2192 1.0100 3.2679 1.0197 1.8863 1.1636 1.6318 1.3235 > mahalanodisc(x,c(0,0),matrix(0,ncol=2,nrow=2)) [1] 5.51e+09 4.08e+09 8.15e+09 3.82e+10 2.16e+10 4.60e+10 3.72e+09 1.64e+10 [9] 6.56e+09 1.12e+09 8.05e+10 1.54e+09 8.62e+09 4.91e+10 1.82e+10 3.77e+08 [17] 3.26e+10 3.04e+10 6.98e+09 5.07e+10 1.07e+10 1.12e+10 3.79e+09 4.83e+10 [25] 1.96e+10 8.81e+08 2.21e+09 2.16e+10 2.34e+09 5.22e+09 2.17e+10 2.88e+08 [33] 1.54e+10 2.32e+10 2.25e+10 2.83e+09 1.29e+10 9.60e+08 1.35e+10 6.54e+09 [41] 3.21e+09 1.52e+10 1.83e+10 8.00e+09 2.99e+10 8.12e+09 1.76e+10 9.19e+09 [49] 1.51e+10 1.00e+10 > > > > cleanEx() 
> nameEx("mahalanofix") > ### * mahalanofix > > flush(stderr()); flush(stdout()) > > ### Name: mahalanofix > ### Title: Mahalanobis distances from center of indexed points > ### Aliases: mahalanofix mahalanofuz > ### Keywords: multivariate > > ### ** Examples > > x <- c(1,2,3,4,5,6,7,8,9,10) > y <- c(1,2,3,8,7,6,5,8,9,10) > mahalanofix(cbind(x,y),gv=c(0,0,0,1,1,1,1,1,0,0)) > mahalanofix(cbind(x,y),gv=c(0,0,0,1,1,1,1,0,0,0)) > mahalanofix(cbind(x,y),gv=c(0,0,0,1,1,1,1,1,0,0),method="mcd") > mahalanofuz(cbind(x,y),gv=c(0,0,0.5,0.5,1,1,1,0.5,0.5,0)) > > > > cleanEx() > nameEx("mahalconf") > ### * mahalconf > > flush(stderr()); flush(stdout()) > > ### Name: mahalconf > ### Title: Mahalanobis fixed point clusters initial configuration > ### Aliases: mahalconf > ### Keywords: multivariate cluster > > ### ** Examples > > set.seed(4634) > face <- rFace(600,dMoNo=2,dNoEy=0,p=2) > mahalconf(face,no=200,startn=20,covall=cov(face),plot="start") [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [121] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [133] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [157] FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [169] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [181] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE [193] TRUE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [229] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [241] TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [253] FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE [265] FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [277] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [289] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [325] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [349] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [361] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [397] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
FALSE FALSE FALSE [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [445] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [469] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [481] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [493] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [505] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [517] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [529] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [541] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [553] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [565] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [577] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [589] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE > > > > cleanEx() > nameEx("mergenormals") > ### * mergenormals > > flush(stderr()); flush(stdout()) > > ### Name: mergenormals > ### Title: Clustering by merging Gaussian mixture components > ### Aliases: mergenormals summary.mergenorm print.summary.mergenorm > ### Keywords: multivariate cluster > > ### ** Examples > > require(mclust) Loading required package: mclust Package 'mclust' version 6.0.1 Type 'citation("mclust")' for citing this R package in publications. > require(MASS) Loading required package: MASS > options(digits=3) > data(crabs) > dc <- crabs[,4:8] > cm <- mclustBIC(crabs[,4:8],G=9,modelNames="EEE") > scm <- summary(cm,crabs[,4:8]) > cmnbhat <- mergenormals(crabs[,4:8],scm,method="bhat") > summary(cmnbhat) * Merging Gaussian mixture components * Method: bhat , cutoff value: 0.1 Original number of components: 9 Number of clusters after merging: 4 Values at which clusters were merged: [,1] [,2] [1,] 8 0.5746 [2,] 7 0.2503 [3,] 6 0.2307 [4,] 5 0.1443 [5,] 4 0.1258 [6,] 3 0.0999 Components assigned to clusters: [,1] [1,] 1 [2,] 1 [3,] 2 [4,] 3 [5,] 2 [6,] 1 [7,] 4 [8,] 4 [9,] 3 > cmndemp <- mergenormals(crabs[,4:8],scm,method="demp") > summary(cmndemp) * Merging Gaussian mixture components * Method: demp , cutoff value: 0.025 Original number of components: 9 Number of clusters after merging: 4 Values at which clusters were merged: [,1] [,2] [1,] 8 0.1902 [2,] 7 0.0714 [3,] 6 0.0694 [4,] 5 0.0351 [5,] 4 0.0266 [6,] 3 0.0220 Components assigned to clusters: [,1] [1,] 1 [2,] 1 [3,] 2 [4,] 3 [5,] 2 [6,] 1 [7,] 4 [8,] 4 [9,] 3 > # Other methods take a bit longer, but try them! > # The values of by and M below are still chosen for reasonably fast execution. 
> # cmnrr <- mergenormals(crabs[,4:8],scm,method="ridge.ratio",by=0.05) > # cmd <- mergenormals(crabs[,4:8],scm,method="dip.tantrum",by=0.05) > # cmp <- mergenormals(crabs[,4:8],scm,method="predictive",M=3) > > > > cleanEx() detaching ‘package:MASS’, ‘package:mclust’ > nameEx("mergeparameters") > ### * mergeparameters > > flush(stderr()); flush(stdout()) > > ### Name: mergeparameters > ### Title: New parameters from merging two Gaussian mixture components > ### Aliases: mergeparameters > ### Keywords: multivariate cluster > > ### ** Examples > > options(digits=3) > set.seed(98765) > require(mclust) Loading required package: mclust Package 'mclust' version 6.0.1 Type 'citation("mclust")' for citing this R package in publications. > iriss <- iris[sample(150,20),-5] > irisBIC <- mclustBIC(iriss) > siris <- summary(irisBIC,iriss) > probs <- siris$parameters$pro > muarray <- siris$parameters$mean > Sigmaarray <- siris$parameters$variance$sigma > z <- siris$z > mpi <- mergeparameters(iriss,1,2,probs,muarray,Sigmaarray,z) > mpi$probs [1] 0.25 0.10 0.30 0.15 0.30 > mpi$muarray [,1] [,2] [,3] [,4] [,5] Sepal.Length 6.98 7.65 5.233 7.03 5.80 Sepal.Width 2.94 2.80 3.583 3.60 2.73 Petal.Length 5.40 6.75 1.383 6.07 4.52 Petal.Width 1.72 2.20 0.217 2.33 1.60 > > > > cleanEx() detaching ‘package:mclust’ > nameEx("minsize") > ### * minsize > > flush(stderr()); flush(stdout()) > > ### Name: minsize > ### Title: Minimum size of regression fixed point cluster > ### Aliases: minsize > ### Keywords: univar cluster > > ### ** Examples > > minsize(500,4,7000,2) [1] 127 > > > > cleanEx() > nameEx("mixdens") > ### * mixdens > > flush(stderr()); flush(stdout()) > > ### Name: mixdens > ### Title: Density of multivariate Gaussian mixture, mclust > ### parameterisation > ### Aliases: mixdens > ### Keywords: cluster multivariate > > ### ** Examples > > set.seed(98765) > require(mclust) Loading required package: mclust Package 'mclust' version 6.0.1 Type 'citation("mclust")' for citing this R package in publications. > iriss <- iris[sample(150,20),-5] > irisBIC <- mclustBIC(iriss) > siris <- summary(irisBIC,iriss) > round(mixdens(siris$modelName,iriss,siris$parameters),digits=2) 59 106 37 22 52 40 149 15 119 76 1136.59 3272.04 8.35 1.02 2381.43 0.23 15.95 9.24 3272.04 258.91 95 112 118 65 50 38 122 110 143 93 0.30 1.47 20.20 0.47 1.87 1.06 1.91 41.67 4.02 0.40 > > > > cleanEx() detaching ‘package:mclust’ > nameEx("mixpredictive") > ### * mixpredictive > > flush(stderr()); flush(stdout()) > > ### Name: mixpredictive > ### Title: Prediction strength of merged Gaussian mixture > ### Aliases: mixpredictive > ### Keywords: cluster multivariate > > ### ** Examples > > set.seed(98765) > iriss <- iris[sample(150,20),-5] > mp <- mixpredictive(iriss,2,2,M=2) > > > > cleanEx() > nameEx("mvdcoord") > ### * mvdcoord > > flush(stderr()); flush(stdout()) > > ### Name: mvdcoord > ### Title: Mean/variance differences discriminant coordinates > ### Aliases: mvdcoord > ### Keywords: multivariate classif > > ### ** Examples > > set.seed(4634) > face <- rFace(300,dMoNo=2,dNoEy=0,p=3) > grface <- as.integer(attr(face,"grouping")) > mcf <- mvdcoord(face,grface) > plot(mcf$proj,col=grface) > # ...done in one step by function plotcluster. 
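## Editor's sketch of the "one step" mentioned in the comment above: the
## projection and the plot can be obtained directly from plotcluster, using
## face and grface as defined in the example. The method code for the
## mean/variance differences coordinates is assumed to be "mvdc" here --
## check ?plotcluster for the exact list of codes.
# plotcluster(face, grface, method="mvdc")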
> > > > cleanEx() > nameEx("ncoord") > ### * ncoord > > flush(stderr()); flush(stdout()) > > ### Name: ncoord > ### Title: Neighborhood based discriminant coordinates > ### Aliases: ncoord > ### Keywords: multivariate classif > > ### ** Examples > > set.seed(4634) > face <- rFace(600,dMoNo=2,dNoEy=0) > grface <- as.integer(attr(face,"grouping")) > ncf <- ncoord(face,grface) > plot(ncf$proj,col=grface) > ncf2 <- ncoord(face,grface,weighted=TRUE) > plot(ncf2$proj,col=grface) > # ...done in one step by function plotcluster. > > > > cleanEx() > nameEx("neginc") > ### * neginc > > flush(stderr()); flush(stdout()) > > ### Name: neginc > ### Title: Neg-entropy normality index for cluster validation > ### Aliases: neginc > ### Keywords: cluster > > ### ** Examples > > options(digits=3) > iriss <- as.matrix(iris[c(1:10,51:55,101:105),-5]) > irisc <- as.numeric(iris[c(1:10,51:55,101:105),5]) > neginc(iriss,irisc) [1] -2.92 > > > > cleanEx() > nameEx("nselectboot") > ### * nselectboot > > flush(stderr()); flush(stdout()) > > ### Name: nselectboot > ### Title: Selection of the number of clusters via bootstrap > ### Aliases: nselectboot > ### Keywords: cluster multivariate > > ### ** Examples > > set.seed(20000) > face <- rFace(50,dMoNo=2,dNoEy=0,p=2) > nselectboot(dist(face),B=2,clustermethod=disthclustCBI, + method="average",krange=5:7) > nselectboot(dist(face),B=2,clustermethod=claraCBI, + classification="centroid",krange=5:7) > nselectboot(face,B=2,clustermethod=kmeansCBI, + classification="centroid",krange=5:7) > # Of course use larger B in a real application. > > > > cleanEx() > nameEx("pamk") > ### * pamk > > flush(stderr()); flush(stdout()) > > ### Name: pamk > ### Title: Partitioning around medoids with estimation of number of > ### clusters > ### Aliases: pamk > ### Keywords: cluster multivariate > > ### ** Examples > > options(digits=3) > set.seed(20000) > face <- rFace(50,dMoNo=2,dNoEy=0,p=2) > pk1 <- pamk(face,krange=1:5,criterion="asw",critout=TRUE) 1 clusters 0 2 clusters 0.742 3 clusters 0.748 4 clusters 0.581 5 clusters 0.544 > pk2 <- pamk(face,krange=1:5,criterion="multiasw",ns=2,critout=TRUE) 1 clusters 0 2 clusters 0.749 3 clusters 0.727 4 clusters 0.584 5 clusters 0.582 > # "multiasw" is better for larger data sets, use larger ns then. 
> pk3 <- pamk(face,krange=1:5,criterion="ch",critout=TRUE) 1 clusters 0 2 clusters 181 3 clusters 210 4 clusters 204 5 clusters 181 > > > > cleanEx() > nameEx("piridge") > ### * piridge > > flush(stderr()); flush(stdout()) > > ### Name: piridge > ### Title: Ridgeline Pi-function > ### Aliases: piridge > ### Keywords: cluster multivariate > > ### ** Examples > > q <- piridge(seq(0,1,0.1),c(1,1),c(2,5),diag(2),diag(2)) > > > > cleanEx() > nameEx("piridge.zeroes") > ### * piridge.zeroes > > flush(stderr()); flush(stdout()) > > ### Name: piridge.zeroes > ### Title: Extrema of two-component Gaussian mixture > ### Aliases: piridge.zeroes > ### Keywords: cluster multivariate > > ### ** Examples > > q <- piridge.zeroes(0.2,c(1,1),c(2,5),diag(2),diag(2),by=0.1) > > > > cleanEx() > nameEx("plot.valstat") > ### * plot.valstat > > flush(stderr()); flush(stdout()) > > ### Name: plot.valstat > ### Title: Simulation-standardised plot and print of cluster validation > ### statistics > ### Aliases: plot.valstat print.valstat > ### Keywords: cluster multivariate > > ### ** Examples > > set.seed(20000) > options(digits=3) > face <- rFace(10,dMoNo=2,dNoEy=0,p=2) > clustermethod=c("kmeansCBI","hclustCBI","hclustCBI") > clustermethodpars <- list() > clustermethodpars[[2]] <- clustermethodpars[[3]] <- list() > clustermethodpars[[2]]$method <- "ward.D2" > clustermethodpars[[3]]$method <- "single" > methodname <- c("kmeans","ward","single") > cbs <- clusterbenchstats(face,G=2:3,clustermethod=clustermethod, + methodname=methodname,distmethod=rep(FALSE,3), + clustermethodpars=clustermethodpars,nnruns=2,kmruns=2,fnruns=2,avenruns=2) [1] "kmeansCBI" [1] "hclustCBI" [1] "hclustCBI" [1] "Computation of validity statistics" comsum 1 comsum 2 comsum 3 [1] "Simulation" 2 clusters; nn run 1 2 clusters; nn run 2 2 clusters; fn run 1 2 clusters; fn run 2 2 clusters; aven run 1 2 clusters; aven run 2 2 clusters; km run 1 2 clusters; km run 2 3 clusters; nn run 1 3 clusters; nn run 2 3 clusters; fn run 1 3 clusters; fn run 2 3 clusters; aven run 1 3 clusters; aven run 2 3 clusters; km run 1 3 clusters; km run 2 [1] "Simulation quantile re-standardisation" [1] "Simulation sd re-standardisation" > plot(cbs$stat,cbs$sim) > plot(cbs$stat,cbs$sim,statistic="dindex") > plot(cbs$stat,cbs$sim,statistic="avewithin") > pcbs <- print(cbs$sstat,aggregate=TRUE,weights=c(1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0)) avewithin method 2 3 1 kmeans -0.56 0.38 2 ward -0.17 -0.89 3 single -2.43 -0.76 mnnd method 2 3 1 kmeans -1.94 -0.15 2 ward 0.31 -2.64 3 single 0.79 0.71 cvnnd method 2 3 1 kmeans -1.36 0.33 2 ward -0.29 -0.61 3 single -0.57 -1.21 maxdiameter method 2 3 1 kmeans -0.6 0.17 2 ward -0.6 -0.73 3 single -1.9 -1.25 widestgap method 2 3 1 kmeans -0.85 0.69 2 ward -0.70 -1.16 3 single -0.70 -1.01 sindex method 2 3 1 kmeans -1.66 -0.18 2 ward -0.13 -0.62 3 single -0.13 0.62 minsep method 2 3 1 kmeans -1.66 -0.37 2 ward -0.13 -0.71 3 single -0.13 0.88 asw method 2 3 1 kmeans -0.61 0.21 2 ward -0.11 -0.91 3 single -2.43 -0.16 dindex method 2 3 1 kmeans 0.3 NaN 2 ward 0.3 NaN 3 single -3.0 NaN denscut method 2 3 1 kmeans NaN 0.60 2 ward NaN 0.47 3 single NaN 0.60 highdgap method 2 3 1 kmeans -0.106 0.69 2 ward -0.042 -1.16 3 single -2.760 -1.01 pearsongamma method 2 3 1 kmeans -0.50 0.34 2 ward -0.12 -1.44 3 single -2.53 -0.94 withinss method 2 3 1 kmeans -0.42 0.51 2 ward -0.15 -0.57 3 single -2.57 -1.25 entropy method 2 3 1 kmeans 0.47 0.85 2 ward -0.13 1.13 3 single -2.90 -1.27 pamc method 2 3 1 kmeans -0.334 0.34 2 ward -0.093 -0.32 3 single -2.691 
-0.91 dmode method 2 3 1 kmeans 0.20 NaN 2 ward 0.22 NaN 3 single -2.95 NaN aggregate method 2 3 1 kmeans -0.91 0.18 2 ward -0.14 -0.98 3 single -1.69 -0.36 > # Some of the values are "NaN" because due to the low number of runs of > # the stupid clustering methods there is no variation. If this happens > # in a real application, nnruns etc. should be chosen higher than 2. > # Also useallg=TRUE in clusterbenchstats may help. > # > # Finding the best aggregated value: > mpcbs <- as.matrix(pcbs[[17]][,-1]) > which(mpcbs==max(mpcbs),arr.ind=TRUE) row col [1,] 1 2 > # row=1 refers to the first clustering method kmeansCBI, > # col=2 refers to the second number of clusters, which is 3 in g=2:3. > > > > cleanEx() > nameEx("plotcluster") > ### * plotcluster > > flush(stderr()); flush(stdout()) > > ### Name: plotcluster > ### Title: Discriminant projection plot. > ### Aliases: plotcluster > ### Keywords: multivariate classif > > ### ** Examples > > set.seed(4634) > face <- rFace(300,dMoNo=2,dNoEy=0) > grface <- as.integer(attr(face,"grouping")) > plotcluster(face,grface) > plotcluster(face,grface==1) > plotcluster(face,grface, clnum=1, method="vbc") [1] "Cluster indicator has more than 2 values" > > > > cleanEx() > nameEx("prediction.strength") > ### * prediction.strength > > flush(stderr()); flush(stdout()) > > ### Name: prediction.strength > ### Title: Prediction strength for estimating number of clusters > ### Aliases: prediction.strength print.predstr > ### Keywords: cluster multivariate > > ### ** Examples > > options(digits=3) > set.seed(98765) > iriss <- iris[sample(150,20),-5] > prediction.strength(iriss,2,3,M=3) Prediction strength Clustering method: kmeans Maximum number of clusters: 3 Resampled data sets: 3 Mean pred.str. for numbers of clusters: 1 1 0.889 Cutoff value: 0.8 Largest number of clusters better than cutoff: 3 > prediction.strength(iriss,2,3,M=3,clustermethod=claraCBI) Prediction strength Clustering method: clara/pam Maximum number of clusters: 3 Resampled data sets: 3 Mean pred.str. for numbers of clusters: 1 1 0.933 Cutoff value: 0.8 Largest number of clusters better than cutoff: 3 > # The examples are fast, but of course M should really be larger. 
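## Editor's sketch (not run, and deliberately slower): in a real analysis M
## should be much larger than 3, e.g. M=50 or more, and Gmax can be raised;
## the returned object prints via print.predstr exactly as in the runs above.
# ps <- prediction.strength(iriss, 2, 5, M=50, clustermethod=claraCBI)
# ps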
> > > > cleanEx() > nameEx("rFace") > ### * rFace > > flush(stderr()); flush(stdout()) > > ### Name: rFace > ### Title: "Face-shaped" clustered benchmark datasets > ### Aliases: rFace > ### Keywords: data > > ### ** Examples > > set.seed(4634) > face <- rFace(600,dMoNo=2,dNoEy=0) > grface <- as.integer(attr(face,"grouping")) > plot(face, col = grface) > # pairs(face, col = grface, main ="rFace(600,dMoNo=2,dNoEy=0)") > > > > cleanEx() > nameEx("randcmatrix") > ### * randcmatrix > > flush(stderr()); flush(stdout()) > > ### Name: randcmatrix > ### Title: Random partition matrix > ### Aliases: randcmatrix > ### Keywords: cluster > > ### ** Examples > > set.seed(111) > randcmatrix(10,2,1) [,1] [,2] [1,] 0 1 [2,] 0 1 [3,] 1 0 [4,] 0 1 [5,] 1 0 [6,] 1 0 [7,] 1 0 [8,] 0 1 [9,] 1 0 [10,] 1 0 > > > > cleanEx() > nameEx("randconf") > ### * randconf > > flush(stderr()); flush(stdout()) > > ### Name: randconf > ### Title: Generate a sample indicator vector > ### Aliases: randconf > ### Keywords: distribution > > ### ** Examples > > randconf(10,3) [1] FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE > > > > cleanEx() > nameEx("randomclustersim") > ### * randomclustersim > > flush(stderr()); flush(stdout()) > > ### Name: randomclustersim > ### Title: Simulation of validity indexes based on random clusterings > ### Aliases: randomclustersim > ### Keywords: multivariate cluster > > ### ** Examples > > set.seed(20000) > options(digits=3) > face <- rFace(10,dMoNo=2,dNoEy=0,p=2) > rmx <- randomclustersim(dist(face),datanp=face,npstats=TRUE,G=2:3, + nnruns=2,kmruns=2, fnruns=1,avenruns=1,nnk=2) 2 clusters; nn run 1 2 clusters; nn run 2 2 clusters; fn run 1 2 clusters; aven run 1 2 clusters; km run 1 2 clusters; km run 2 3 clusters; nn run 1 3 clusters; nn run 2 3 clusters; fn run 1 3 clusters; aven run 1 3 clusters; km run 1 3 clusters; km run 2 > ## Not run: > ##D rmx$km # Produces slightly different but basically identical results on ATLAS > ## End(Not run) > rmx$aven [[1]] NULL [[2]] avewithin mnnd cvnnd maxdiameter widestgap sindex minsep asw dindex 1 0.773 0.317 0.97 0.482 0.715 0.405 0.405 0.648 1 denscut highdgap pearsongamma withinss entropy pamc kdnorm kdunif 1 1 0.862 0.885 0.785 0.971 0.832 0.599 0.541 [[3]] avewithin mnnd cvnnd maxdiameter widestgap sindex minsep asw dindex 1 0.847 0.385 0.969 0.692 0.715 0.317 0.272 0.609 1 denscut highdgap pearsongamma withinss entropy pamc kdnorm kdunif 1 1 0.862 0.867 0.919 0.96 0.912 0.54 0.505 > rmx$fn [[1]] NULL [[2]] avewithin mnnd cvnnd maxdiameter widestgap sindex minsep asw dindex 1 0.773 0.317 0.97 0.482 0.715 0.405 0.405 0.648 1 denscut highdgap pearsongamma withinss entropy pamc kdnorm kdunif 1 1 0.862 0.885 0.785 0.971 0.832 0.599 0.541 [[3]] avewithin mnnd cvnnd maxdiameter widestgap sindex minsep asw dindex 1 0.718 0.358 0.719 0.493 0.595 0.0908 0 0.206 1 denscut highdgap pearsongamma withinss entropy pamc kdnorm kdunif 1 0.906 0.804 0.786 0.76 0.865 0.837 0.484 0.376 > rmx$nn [[1]] NULL [[2]] avewithin mnnd cvnnd maxdiameter widestgap sindex minsep asw dindex 1 0.500 0.332 0.82 0.000 0.595 0.285 0.285 0.000 0.847 2 0.773 0.317 0.97 0.482 0.715 0.405 0.405 0.648 1.000 denscut highdgap pearsongamma withinss entropy pamc kdnorm kdunif 1 1 0.608 0.498 0.082 0.469 0.675 0.608 0.558 2 1 0.862 0.885 0.785 0.971 0.832 0.599 0.541 [[3]] avewithin mnnd cvnnd maxdiameter widestgap sindex minsep asw dindex 1 0.764 0.279 0.980 0.482 0.715 0.190 0.0827 0.403 1 2 0.838 0.285 0.964 0.661 0.715 0.315 0.2846 0.587 1 denscut highdgap pearsongamma withinss 
entropy pamc kdnorm kdunif 1 0.995 0.862 0.801 0.790 0.817 0.843 0.475 0.448 2 1.000 0.862 0.857 0.912 0.991 0.913 0.415 0.562 > > > > > cleanEx() > nameEx("regmix") > ### * regmix > > flush(stderr()); flush(stdout()) > > ### Name: regmix > ### Title: Mixture Model ML for Clusterwise Linear Regression > ### Aliases: regmix regem > ### Keywords: cluster regression > > ### ** Examples > > ## Not run: > ##D # This apparently gives slightly different > ##D # but data-analytically fine results > ##D # on some versions of R. > ##D set.seed(12234) > ##D data(tonedata) > ##D attach(tonedata) > ##D rmt1 <- regmix(stretchratio,tuned,nclust=1:2) > ##D # nclust=1:2 makes the example fast; > ##D # a more serious application would rather use the default. > ##D rmt1$g > ##D round(rmt1$bic,digits=2) > ##D # start with initial parameter values > ##D cln <- 3 > ##D n <- 150 > ##D initcoef <- cbind(c(2,0),c(0,1),c(0,2.5)) > ##D initvar <- c(0.001,0.0001,0.5) > ##D initeps <- c(0.4,0.3,0.3) > ##D # computation of m from initial parameters > ##D m <- matrix(nrow=n, ncol=cln) > ##D stm <- numeric(0) > ##D for (i in 1:cln) > ##D for (j in 1:n){ > ##D m[j,i] <- initeps[i]*dnorm(tuned[j],mean=initcoef[1,i]+ > ##D initcoef[2,i]*stretchratio[j], sd=sqrt(initvar[i])) > ##D } > ##D for (j in 1:n){ > ##D stm[j] <- sum(m[j,]) > ##D for (i in 1:cln) > ##D m[j,i] <- m[j,i]/stm[j] > ##D } > ##D rmt2 <- regem(stretchratio, tuned, m, cln) > ## End(Not run) > > > > cleanEx() > nameEx("ridgeline") > ### * ridgeline > > flush(stderr()); flush(stdout()) > > ### Name: ridgeline > ### Title: Ridgeline computation > ### Aliases: ridgeline > ### Keywords: cluster multivariate > > ### ** Examples > > ridgeline(0.5,c(1,1),c(2,5),diag(2),diag(2)) [,1] [1,] 1.5 [2,] 3.0 > > > > cleanEx() > nameEx("ridgeline.diagnosis") > ### * ridgeline.diagnosis > > flush(stderr()); flush(stdout()) > > ### Name: ridgeline.diagnosis > ### Title: Ridgeline plots, ratios and unimodality > ### Aliases: ridgeline.diagnosis > ### Keywords: cluster multivariate > > ### ** Examples > > muarray <- cbind(c(0,0),c(0,0.1),c(10,10)) > sigmaarray <- array(c(diag(2),diag(2),diag(2)),dim=c(2,2,3)) > rd <- + ridgeline.diagnosis(c(0.5,0.3,0.2),muarray,sigmaarray,ridgelineplot="matrix",by=0.1) > # Much slower but more precise with default by=0.001. 
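## Editor's sketch (not run): with equal spherical covariance matrices the
## ridgeline reduces to the straight line between the two component means,
## so alpha=0 and alpha=1 recover the means themselves (in one order or the
## other) and alpha=0.5 the midpoint (1.5, 3) computed in the ridgeline
## example above.
# ridgeline(0, c(1,1), c(2,5), diag(2), diag(2))
# ridgeline(1, c(1,1), c(2,5), diag(2), diag(2))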
> > > > cleanEx() > nameEx("simmatrix") > ### * simmatrix > > flush(stderr()); flush(stdout()) > > ### Name: simmatrix > ### Title: Extracting intersections between clusters from fpc-object > ### Aliases: simmatrix > ### Keywords: utilities > > ### ** Examples > > set.seed(190000) > data(tonedata) > # Note: If you do not use the installed package, replace this by > # tonedata <- read.table("(path/)tonedata.txt", header=TRUE) > attach(tonedata) > tonefix <- fixreg(stretchratio,tuned,mtf=1,ir=20) > simmatrix(tonefix)[sseg(2,3)] [1] NA > > > > cleanEx() detaching ‘tonedata’ > nameEx("solvecov") > ### * solvecov > > flush(stderr()); flush(stdout()) > > ### Name: solvecov > ### Title: Inversion of (possibly singular) symmetric matrices > ### Aliases: solvecov > ### Keywords: array > > ### ** Examples > > x <- c(1,0,0,1,0,1,0,0,1) > dim(x) <- c(3,3) > solvecov(x) > > > > cleanEx() > nameEx("sseg") > ### * sseg > > flush(stderr()); flush(stdout()) > > ### Name: sseg > ### Title: Position in a similarity vector > ### Aliases: sseg > ### Keywords: utilities > > ### ** Examples > > sseg(3,4) [1] 9 > > > > cleanEx() > nameEx("stupidkaven") > ### * stupidkaven > > flush(stderr()); flush(stdout()) > > ### Name: stupidkaven > ### Title: Stupid average dissimilarity random clustering > ### Aliases: stupidkaven > ### Keywords: multivariate cluster > > ### ** Examples > > set.seed(20000) > options(digits=3) > face <- rFace(200,dMoNo=2,dNoEy=0,p=2) > stupidkaven(dist(face),3) [1] 2 3 2 2 2 3 3 2 2 2 3 2 3 2 3 2 3 2 3 3 2 2 3 3 2 3 2 2 3 2 2 2 1 2 1 2 3 [38] 3 3 1 3 1 1 2 2 2 3 1 1 1 1 3 3 2 2 1 1 1 3 1 3 1 3 3 1 3 1 3 1 1 1 2 1 3 [75] 3 1 1 1 3 2 1 1 1 2 1 3 1 2 3 3 2 1 1 1 1 2 3 1 2 2 2 2 2 2 2 2 2 2 2 2 2 [112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 [149] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 [186] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 > > > > cleanEx() > nameEx("stupidkcentroids") > ### * stupidkcentroids > > flush(stderr()); flush(stdout()) > > ### Name: stupidkcentroids > ### Title: Stupid k-centroids random clustering > ### Aliases: stupidkcentroids > ### Keywords: multivariate cluster > > ### ** Examples > > set.seed(20000) > options(digits=3) > face <- rFace(200,dMoNo=2,dNoEy=0,p=2) > stupidkcentroids(dist(face),3) $partition [1] 3 3 3 2 3 1 1 2 2 3 1 2 1 2 1 2 1 3 1 1 2 2 1 1 2 1 3 2 1 2 2 2 1 2 1 2 1 [38] 1 3 1 3 1 3 2 2 2 3 1 1 1 1 3 3 2 2 1 1 3 3 1 3 1 3 3 1 3 1 3 1 1 1 2 1 3 [75] 3 1 1 1 3 2 3 1 1 2 1 3 1 2 3 3 1 1 1 1 1 2 3 1 3 3 3 3 3 3 3 3 3 3 2 2 3 [112] 3 3 2 2 2 3 3 3 3 3 3 3 3 3 3 2 2 2 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 [149] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 [186] 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 $centroids [1] 51 55 63 $distances [1] TRUE > > > > cleanEx() > nameEx("stupidkfn") > ### * stupidkfn > > flush(stderr()); flush(stdout()) > > ### Name: stupidkfn > ### Title: Stupid farthest neighbour random clustering > ### Aliases: stupidkfn > ### Keywords: multivariate cluster > > ### ** Examples > > set.seed(20000) > options(digits=3) > face <- rFace(200,dMoNo=2,dNoEy=0,p=2) > stupidkfn(dist(face),3) [1] 1 3 1 2 1 3 3 2 2 1 3 2 3 2 3 2 3 1 3 3 1 1 3 3 2 3 1 2 3 2 2 1 3 2 3 1 3 [38] 3 3 3 3 1 3 2 2 2 3 1 3 1 1 3 3 2 2 1 1 1 3 1 3 1 3 3 3 3 1 3 1 1 1 2 1 3 [75] 3 1 1 1 3 2 1 1 1 2 1 3 3 2 3 3 2 3 1 1 1 2 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
1 [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 > > > > cleanEx() > nameEx("stupidknn") > ### * stupidknn > > flush(stderr()); flush(stdout()) > > ### Name: stupidknn > ### Title: Stupid nearest neighbour random clustering > ### Aliases: stupidknn > ### Keywords: multivariate cluster > > ### ** Examples > > set.seed(20000) > options(digits=3) > face <- rFace(200,dMoNo=2,dNoEy=0,p=2) > stupidknn(dist(face),3) [1] 2 3 2 2 2 3 3 2 2 2 3 2 3 2 3 2 3 2 3 3 2 2 3 3 2 3 2 2 3 2 2 2 3 2 3 2 3 [38] 3 3 1 3 1 1 1 1 2 3 1 1 1 1 3 3 1 2 1 1 1 3 1 3 1 3 3 1 3 1 1 1 1 1 1 1 3 [75] 3 1 1 1 3 1 1 1 1 2 1 3 1 2 3 1 1 1 1 1 1 1 3 1 2 2 2 2 2 2 2 2 2 2 2 2 2 [112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 [149] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 [186] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 > > > > cleanEx() > nameEx("tdecomp") > ### * tdecomp > > flush(stderr()); flush(stdout()) > > ### Name: tdecomp > ### Title: Root of singularity-corrected eigenvalue decomposition > ### Aliases: tdecomp > ### Keywords: array > > ### ** Examples > > x <- rnorm(10) > y <- rnorm(10) > z <- cov(cbind(x,y)) > round(tdecomp(z),digits=2) [,1] [,2] [1,] -0.48 1.03 [2,] -0.62 -0.29 > > > > cleanEx() > nameEx("unimodal.ind") > ### * unimodal.ind > > flush(stderr()); flush(stdout()) > > ### Name: unimodal.ind > ### Title: Is a fitted denisity unimodal or not? > ### Aliases: unimodal.ind > ### Keywords: univar > > ### ** Examples > > unimodal.ind(c(1,3,3,4,2,1,0,0)) [1] TRUE > > > > cleanEx() > nameEx("weightplots") > ### * weightplots > > flush(stderr()); flush(stdout()) > > ### Name: weightplots > ### Title: Ordered posterior plots > ### Aliases: weightplots > ### Keywords: multivariate cluster > > ### ** Examples > > require(mclust) Loading required package: mclust Package 'mclust' version 6.0.1 Type 'citation("mclust")' for citing this R package in publications. > require(MASS) Loading required package: MASS > data(crabs) > dc <- crabs[,4:8] > cm <- mclustBIC(crabs[,4:8],G=9,modelNames="EEE") > scm <- summary(cm,crabs[,4:8]) > weightplots(scm$z,clusternumbers=1:3,ask=FALSE) > weightplots(scm$z,clusternumbers=1:3,allcol=1:9, ask=FALSE, + legendposition=c(5,0.7)) > # Remove ask=FALSE to have time to watch the plots. 
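## Editor's sketch (not run): weightplots works on the matrix of posterior
## component membership probabilities; here scm$z has one row per crab and
## one column per mixture component, and each row sums to 1.
# dim(scm$z)               # 200 observations x 9 components
# summary(rowSums(scm$z))  # each row sums (numerically) to 1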
> > > > cleanEx() detaching ‘package:MASS’, ‘package:mclust’ > nameEx("wfu") > ### * wfu > > flush(stderr()); flush(stdout()) > > ### Name: wfu > ### Title: Weight function (for Mahalabobis distances) > ### Aliases: wfu > ### Keywords: arith > > ### ** Examples > > md <- seq(0,10,by=0.1) > round(wfu(md,ca=5,ca2=8),digits=2) [1] 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 [16] 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 [31] 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 [46] 1.00 1.00 1.00 1.00 1.00 1.00 0.97 0.93 0.90 0.87 0.83 0.80 0.77 0.73 0.70 [61] 0.67 0.63 0.60 0.57 0.53 0.50 0.47 0.43 0.40 0.37 0.33 0.30 0.27 0.23 0.20 [76] 0.17 0.13 0.10 0.07 0.03 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 [91] 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 > > > > cleanEx() > nameEx("xtable") > ### * xtable > > flush(stderr()); flush(stdout()) > > ### Name: xtable > ### Title: Partition crosstable with empty clusters > ### Aliases: xtable > ### Keywords: array > > ### ** Examples > > c1 <- 1:3 > c2 <- c(1,1,2) > xtable(c1,c2,3) [,1] [,2] [,3] [1,] 1 0 0 [2,] 1 0 0 [3,] 0 1 0 > > > > cleanEx() > nameEx("zmisclassification.matrix") > ### * zmisclassification.matrix > > flush(stderr()); flush(stdout()) > > ### Name: zmisclassification.matrix > ### Title: Matrix of misclassification probabilities between mixture > ### components > ### Aliases: zmisclassification.matrix > ### Keywords: cluster multivariate > > ### ** Examples > > set.seed(12345) > m <- rpois(20,lambda=5) > dim(m) <- c(5,4) > m <- m/apply(m,1,sum) > round(zmisclassification.matrix(m,symmetric=FALSE),digits=2) [,1] [,2] [,3] [,4] [1,] 0.00 0.73 1.00 0.75 [2,] 0.10 0.00 0.56 0.73 [3,] 0.16 0.17 0.00 0.16 [4,] 0.10 0.30 0.56 0.00 > > > > ### *