==> trimcluster/NAMESPACE <==
# Default NAMESPACE created by R
# Remove the previous line if you edit this file

# Export all names
exportPattern(".")

S3method("plot", "tkm")
S3method("print", "tkm")

importFrom("graphics", "lines", "plot", "points")

==> trimcluster/man/trimkmeans.Rd <==
\name{trimkmeans}
\alias{trimkmeans}
\alias{print.tkm}
\alias{plot.tkm}
\title{Trimmed k-means clustering}
\description{
  The trimmed k-means clustering method of Cuesta-Albertos, Gordaliza
  and Matran (1997). It optimizes the k-means criterion while trimming
  a specified proportion of the points.
}
\usage{
trimkmeans(data, k, trim=0.1, scaling=FALSE, runs=100, points=NULL,
           countmode=runs+1, printcrit=FALSE,
           maxit=2*nrow(as.matrix(data)))

\method{print}{tkm}(x, ...)
\method{plot}{tkm}(x, data, ...)
}
\arguments{
  \item{data}{matrix or data frame with raw data.}
  \item{k}{integer. Number of clusters.}
  \item{trim}{numeric between 0 and 1. Proportion of points to be
    trimmed.}
  \item{scaling}{logical. If \code{TRUE}, the variables are centered at
    their means and scaled to unit variance before execution.}
  \item{points}{\code{NULL}, or a matrix of k vectors used as means to
    initialize the algorithm. If initial mean vectors are specified,
    \code{runs} should be 1 (otherwise the same initial means are used
    for all runs).}
  \item{runs}{integer. Number of algorithm runs from initial means
    (randomly chosen from the data points).}
  \item{countmode}{optional positive integer. Every \code{countmode}
    algorithm runs, \code{trimkmeans} shows a progress message.}
  \item{printcrit}{logical. If \code{TRUE}, the criterion values (mean
    squares) of all algorithm runs are printed.}
  \item{maxit}{integer. Maximum number of iterations within an
    algorithm run. Each iteration determines all points that are closer
    to a different cluster center than the one to which they are
    currently assigned. The algorithm terminates if no more points have
    to be reassigned, or if \code{maxit} is reached.}
  \item{x}{object of class \code{tkm}.}
  \item{...}{further arguments to be passed on to \code{plot} or
    \code{\link[fpc]{plotcluster}}.}
}
\details{
  \code{plot.tkm} calls \code{\link[fpc]{plotcluster}} if the
  dimensionality of the data \code{p} is 1, shows a scatterplot with
  non-trimmed regions if \code{p=2}, and shows discriminant coordinates
  computed from the clusters (ignoring the trimmed points) if
  \code{p>2}.
}
\value{
  An object of class \code{"tkm"}, which is a list with components
  \item{classification}{integer vector coding cluster membership, with
    trimmed observations coded as \code{k+1}.}
  \item{means}{numerical matrix giving the mean vectors of the k
    classes.}
  \item{criterion}{mean of the squared Euclidean distances
    \code{disttom} over the non-trimmed points, i.e., the optimized
    criterion value.}
  \item{disttom}{vector of squared Euclidean distances of all points to
    the closest mean.}
  \item{ropt}{maximum value of \code{disttom} such that the
    corresponding point is not trimmed.}
  \item{k}{see above.}
  \item{trim}{see above.}
  \item{runs}{see above.}
  \item{scaling}{see above.}
}
\references{
  Cuesta-Albertos, J. A., Gordaliza, A., and Matran, C. (1997) Trimmed
  k-Means: An Attempt to Robustify Quantizers, Annals of Statistics,
  25, 553-576.
}
\author{Christian Hennig
  \email{chrish@stats.ucl.ac.uk}
  \url{http://www.homepages.ucl.ac.uk/~ucakche/}
}
\seealso{
  \code{\link[fpc]{plotcluster}}
}
\examples{
set.seed(10001)
n1 <- 60
n2 <- 60
n3 <- 70
n0 <- 10
nn <- n1+n2+n3+n0
pp <- 2
X <- matrix(rep(0,nn*pp),nrow=nn)
ii <- 0
for (i in 1:n1){
  ii <- ii+1
  X[ii,] <- c(5,-5)+rnorm(2)
}
for (i in 1:n2){
  ii <- ii+1
  X[ii,] <- c(5,5)+rnorm(2)*0.75
}
for (i in 1:n3){
  ii <- ii+1
  X[ii,] <- c(-5,-5)+rnorm(2)*0.75
}
for (i in 1:n0){
  ii <- ii+1
  X[ii,] <- rnorm(2)*8
}
tkm1 <- trimkmeans(X,k=3,trim=0.1,runs=3)
# runs=3 is used to save computing time.
print(tkm1)
plot(tkm1,X)
}
\keyword{multivariate}
\keyword{cluster}
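The help page above documents the interaction of the `points` and
`runs` arguments: with user-supplied initial means, runs = 1 is the
sensible choice, since every run would otherwise start from the same
configuration. A minimal sketch (not a file from the package; the data
and starting means are made up for illustration):

## Sketch: deterministic start of trimkmeans from fixed initial means.
suppressPackageStartupMessages(library(trimcluster))

set.seed(42)
X2 <- rbind(matrix(rnorm(100, mean = 0), ncol = 2),   # 50 points near (0,0)
            matrix(rnorm(100, mean = 6), ncol = 2))   # 50 points near (6,6)
init <- rbind(c(0, 0),    # hand-picked starting means, one row per cluster
              c(6, 6))
tkm0 <- trimkmeans(X2, k = 2, trim = 0.05, points = init, runs = 1)
print(tkm0)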
==> trimcluster/DESCRIPTION <==
Package: trimcluster
Title: Cluster Analysis with Trimming
Version: 0.1-5
Date: 2020-02-08
VersionNote: Released 0.1-2.1 on 2018-07-20 on CRAN
Author: Christian Hennig
Depends: R (>= 1.9.0)
Suggests: fpc
Description: Trimmed k-means clustering. The method is described in
        Cuesta-Albertos et al. (1997).
Maintainer: Valentin Todorov
License: GPL
URL: http://www.homepages.ucl.ac.uk/~ucakche/
Packaged: 2020-02-08 18:29:38 UTC; Share
Repository: CRAN
Date/Publication: 2020-02-09 12:10:02 UTC
NeedsCompilation: no

==> trimcluster/tests/trtest.R <==
## VT::15.01.2020 - this will render the output independent
## of the version of the package
suppressPackageStartupMessages(library(trimcluster))

set.seed(10001)
n1 <-60
n2 <-60
n3 <-70
n0 <-10
nn <- n1+n2+n3+n0
pp <- 2
X <- matrix(rep(0,nn*pp),nrow=nn)
ii <-0
for (i in 1:n1){
  ii <-ii+1
  X[ii,] <- c(5,-5)+rnorm(2)
}
for (i in 1:n2){
  ii <- ii+1
  X[ii,] <- c(5,5)+rnorm(2)*0.75
}
for (i in 1:n3){
  ii <- ii+1
  X[ii,] <- c(-5,-5)+rnorm(2)*0.75
}
for (i in 1:n0){
  ii <- ii+1
  X[ii,] <- rnorm(2)*8
}
tkm1 <- trimkmeans(X,k=3,trim=0.1,runs=3)
# runs=3 is used to save computing time.
print(tkm1)
plot(tkm1,X)

==> trimcluster/tests/trtest.Rout.save <==
R Under development (unstable) (2020-01-08 r77640) -- "Unsuffered Consequences"
Copyright (C) 2020 The R Foundation for Statistical Computing
Platform: x86_64-w64-mingw32/x64 (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> ## VT::15.01.2020 - this will render the output independent
> ## of the version of the package
> suppressPackageStartupMessages(library(trimcluster))
> 
> set.seed(10001)
> n1 <-60
> n2 <-60
> n3 <-70
> n0 <-10
> nn <- n1+n2+n3+n0
> pp <- 2
> X <- matrix(rep(0,nn*pp),nrow=nn)
> ii <-0
> for (i in 1:n1){
+   ii <-ii+1
+   X[ii,] <- c(5,-5)+rnorm(2)
+ }
> for (i in 1:n2){
+   ii <- ii+1
+   X[ii,] <- c(5,5)+rnorm(2)*0.75
+ }
> for (i in 1:n3){
+   ii <- ii+1
+   X[ii,] <- c(-5,-5)+rnorm(2)*0.75
+ }
> for (i in 1:n0){
+   ii <- ii+1
+   X[ii,] <- rnorm(2)*8
+ }
> tkm1 <- trimkmeans(X,k=3,trim=0.1,runs=3)
> # runs=3 is used to save computing time.
> print(tkm1)
* trimmed k-means *
trim= 0.1 , k= 3 
Classification (trimmed points are indicated by  4 ):
  [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 3 3 3 3 3 3 3 4 3 3 3 3 3 3 3 3 3 3
 [38] 3 3 3 4 3 3 3 3 3 3 3 3 4 3 3 3 3 3 3 3 3 3 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 [75] 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 2 2 2 2 2 2 2 2 2 2
[112] 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[186] 1 1 1 1 1 4 4 4 4 4 4 4 4 4 4
Means:
          [,1]      [,2]
[1,] -4.948673 -4.978344
[2,]  5.125058  5.040808
[3,]  5.139695 -5.083246
Trimmed mean squares:  1.214552 
> plot(tkm1,X)
> 
> proc.time()
   user  system elapsed 
   0.31    0.07    0.39 
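For illustration (again, not part of the package files), the trimmed
observations in the test run above can be read directly off the fitted
object: with trim = 0.1 and n = 200, exactly 200 - ceiling(0.9 * 200)
= 20 points are trimmed, and the classification printed by the test
shows that all 10 simulated noise points (rows 191 to 200 of X) are
among them.

## Sketch, continuing from the test script above: inspect the trimmed
## points via the k+1 coding of the classification vector.
trimmed <- which(tkm1$classification == tkm1$k + 1)
length(trimmed)              # 20 = n - ceiling((1 - 0.1) * 200)
all(191:200 %in% trimmed)    # TRUE: every noise point was trimmed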
==> trimcluster/R/trimkmeans.R <==
trimkmeans <- function(data, k, trim=0.1, scaling=FALSE, runs=100,
                       points=NULL, countmode=runs+1, printcrit=FALSE,
                       maxit=2*nrow(as.matrix(data))){
  data <- as.matrix(data)
  n <- nrow(data)
  # Number of points that are kept (not trimmed).
  nin <- ceiling((1-trim)*n)
  if (scaling) data <- scale(data)
  crit <- Inf
  oldclass <- iclass <- optclass <- rep(0,n)
  disttom <- optdisttom <- rep(0,n)
  for (i in 1:runs){
    if ((i/countmode)==round(i/countmode))
      cat("Iteration ",i,"\n")
    # Initialize the means: random data points unless given via `points`.
    if (is.null(points))
      means <- data[sample(n,k),,drop=FALSE]
    else
      means <- points
    wend <- FALSE
    itcounter <- 0
    while(!wend){
      itcounter <- itcounter+1
      # Assignment step: find the closest mean (squared Euclidean
      # distance) for every point.
      for (j in 1:n){
        dj <- rep(0,k)
        for (l in 1:k)
          dj[l] <- sum((data[j,]-means[l,])^2)
        iclass[j] <- which.min(dj)
        disttom[j] <- min(dj)
      }
      # Trimming step: the n-nin points farthest from their closest
      # mean are trimmed (coded 0 internally).
      iclass[order(disttom)[(nin+1):n]] <- 0
      if (itcounter>=maxit || identical(oldclass,iclass)) wend <- TRUE
      else{
        # Update step: recompute each mean from its non-trimmed
        # members; an empty cluster is re-seeded with a trimmed point.
        for (l in 1:k){
          if (sum(iclass==l)==0)
            means[l,] <- data[iclass==0,,drop=FALSE][1,]
          else{
            if (sum(iclass==l)>1){
              if (dim(means)[2]==1)
                means[l,] <- mean(data[iclass==l,])
              else
                means[l,] <- colMeans(data[iclass==l,])
            }
            else means[l,] <- data[iclass==l,]
          }
        }
        oldclass <- iclass
      }
    }
    # Criterion of this run: sum of squared distances over kept points.
    newcrit <- sum(disttom[iclass>0])
    if (printcrit)
      cat("Iteration ",i," criterion value ",newcrit/nin,"\n")
    if (newcrit<=crit){
      optclass <- iclass
      crit <- newcrit
      optmeans <- means
      # Keep the distances of the best run so that the returned disttom
      # and ropt match the returned classification. (In the original
      # source these were taken from the last run instead.)
      optdisttom <- disttom
    }
  }
  # Trimmed points are coded k+1 in the output.
  optclass[optclass==0] <- k+1
  out <- list(classification=optclass,means=optmeans,
              criterion=crit/nin,disttom=optdisttom,
              ropt=sort(optdisttom)[nin],
              k=k,trim=trim,runs=runs,scaling=scaling)
  class(out) <- "tkm"
  out
}

print.tkm <- function(x,...){
  cat("* trimmed k-means *\n")
  cat("trim=",x$trim,", k=",x$k,"\n")
  cat("Classification (trimmed points are indicated by ",x$k+1,"):\n")
  print(x$classification)
  cat("Means:\n")
  print(x$means)
  cat("Trimmed mean squares: ",x$criterion,"\n")
  invisible(x)
}

plot.tkm <- function(x,data,...){
  p <- dim(as.matrix(data))[2]
  if (p==1){
    fpc::plotcluster(data,x$classification, ...)
  }
  if (p==2){
    # Scatterplot with the non-trimmed regions: circles of radius
    # sqrt(ropt) around the cluster means; trimmed points as crosses.
    plot(data,type="n",main=paste("k = ", x$k, " and trim = ",x$trim))
    points(data[x$classification==x$k+1,],col=1,pch=3)
    t <- seq(0,2*pi,length=1000)
    j <- 0
    for (i in 1:x$k){
      j <- j+1
      points(data[x$classification==j,],col=j+1)
      circ <- sqrt(x$ropt)*cbind(cos(t),sin(t))
      lines(rep(1,1000)%*%t(x$means[j,])+circ,col=1)
    }
  }
  if (p>2){
    cv <- x$classification
    # The original file is truncated at this point in this dump. The
    # following completion is a reconstruction (an assumption) based on
    # the documented behaviour: discriminant coordinates are computed
    # from the clusters, ignoring the trimmed points, which are recoded
    # as class 0 for fpc::plotcluster.
    cv[cv==x$k+1] <- 0
    fpc::plotcluster(data, cv, ignorepoints=TRUE, ignorenum=0, ...)
  }
}
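The double loop over points and means in trimkmeans above is the
clearest statement of the assignment step, but it can be vectorized.
The following is a sketch of an equivalent computation (illustration
only, not part of the package; `sqdist` is a made-up helper name):

## Squared Euclidean distances between all rows of X (n x p) and all
## rows of M (k x p), via ||x - m||^2 = ||x||^2 + ||m||^2 - 2 x'm.
sqdist <- function(X, M) {
  outer(rowSums(X^2), rowSums(M^2), "+") - 2 * tcrossprod(X, M)
}

## Tiny demo, equivalent to the inner loops of trimkmeans (up to
## rounding error, and up to tie-breaking: which.min takes the first
## minimum, max.col breaks ties at random by default).
X <- matrix(rnorm(20), ncol = 2)   # 10 points
M <- matrix(rnorm(4),  ncol = 2)   # 2 means
D <- sqdist(X, M)                  # n x k distance matrix
iclass  <- max.col(-D)                        # closest mean per point
disttom <- D[cbind(seq_len(nrow(D)), iclass)] # distance to that mean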