flashClust/0000755000176000001440000000000012014751057012414 5ustar ripleyusersflashClust/MD50000644000176000001440000000056212014751057012727 0ustar ripleyusersd047ea58afd696a26c1283983c7672cf *Changelog 3e1008758ddfbac277cb7bdeef1a24e4 *DESCRIPTION bb5920e348331ca7bf4468a4c025d38b *NAMESPACE 0e93374868d312a7211bbf4902683634 *R/murtagh.R 799103cabce5323e1efce7634490ca1e *inst/CITATION 0f8bdc3d7b60180aae70166beb63bebc *man/flashClust.Rd e61dcc01d34889d0d9ee2f489e3e7c5b *src/hc.f 54a2d4ec7d6a4124f69dec1d567eb9b6 *src/hcass2.f flashClust/src/0000755000176000001440000000000012014741346013203 5ustar ripleyusersflashClust/src/hcass2.f0000644000176000001440000000703112014741346014536 0ustar ripleyusersC+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++C C C C Given a HIERARCHIC CLUSTERING, described as a sequence of C C agglomerations, prepare the seq. of aggloms. and "horiz." C C order of objects for plotting the dendrogram using S routine C C 'plclust'. C C C C Parameters: C C C C IA, IB: vectors of dimension N defining the agglomer- C C ations. C C IIA, IIB: used to store IA and IB values differently C C (in form needed for S command 'plclust' C C IORDER: "horiz." order of objects for dendrogram C C C C F. Murtagh, ESA/ESO/STECF, Garching, June 1991 C C C C HISTORY C C C C Adapted from routine HCASS, which additionally determines C C cluster assignments at all levels, at extra comput. expense C C C C This routine copied by Peter Langfelder from the source C C of R package stats. C C C C---------------------------------------------------------------C SUBROUTINE HCASS2(N,IA,IB,IORDER,IIA,IIB) c Args INTEGER N,IA(N),IB(N),IORDER(N),IIA(N),IIB(N) c Var INTEGER I, J, K, K1, K2, LOC C C Following bit is to get seq. of merges into format acceptable to plclust C I coded clusters as lowest seq. no. of constituents; S's 'hclust' codes C singletons as -ve numbers, and non-singletons with their seq. nos. C do I=1,N IIA(I)=IA(I) IIB(I)=IB(I) end do do I=1,N-2 C In the following, smallest (+ve or -ve) seq. no. wanted K=MIN(IA(I),IB(I)) do J=I+1, N-1 IF(IA(J).EQ.K) IIA(J)=-I IF(IB(J).EQ.K) IIB(J)=-I end do end do do I=1,N-1 IIA(I)=-IIA(I) IIB(I)=-IIB(I) end do do I=1,N-1 IF (IIA(I).GT.0 .AND. IIB(I).LT.0) THEN K = IIA(I) IIA(I) = IIB(I) IIB(I) = K ENDIF IF (IIA(I).GT.0 .AND. IIB(I).GT.0) THEN K1 = MIN(IIA(I),IIB(I)) K2 = MAX(IIA(I),IIB(I)) IIA(I) = K1 IIB(I) = K2 ENDIF end do C C C NEW PART FOR 'ORDER' C IORDER(1) = IIA(N-1) IORDER(2) = IIB(N-1) LOC=2 DO I=N-2,1,-1 DO J=1,LOC IF(IORDER(J).EQ.I) THEN C REPLACE IORDER(J) WITH IIA(I) AND IIB(I) IORDER(J)=IIA(I) IF (J.EQ.LOC) THEN LOC=LOC+1 IORDER(LOC)=IIB(I) else LOC=LOC+1 do K=LOC,J+2,-1 IORDER(K)=IORDER(K-1) end do IORDER(J+1)=IIB(I) end if GOTO 171 ENDIF end do C SHOULD NEVER REACH HERE 171 CONTINUE end do C C do I=1,N IORDER(I) = -IORDER(I) end do C C RETURN END flashClust/src/hc.f0000644000176000001440000001716212014741346013753 0ustar ripleyusersC++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++C C C C HIERARCHICAL CLUSTERING using (user-specified) criterion. C C C C Parameters: C C C C DATA(N,M) input data matrix, C C DISS(LEN) dissimilarities in lower half diagonal C C storage; LEN = N.N-1/2, C C IOPT clustering criterion to be used, C C IA, IB, CRIT history of agglomerations; dimensions C C N, first N-1 locations only used, C C MEMBR, NN, DISNN vectors of length N, used to store C C cluster cardinalities, current nearest C C neighbour, and the dissimilarity assoc. C C with the latter. 
C C FLAG boolean indicator of agglomerable obj./ C C clusters. C C C C F. Murtagh, ESA/ESO/STECF, Garching, February 1986. C C Modified by Peter Langfelder, implemented bug fix C by Chi Ming Yau C C C------------------------------------------------------------C SUBROUTINE HC(N,LEN,IOPT,IA,IB,CRIT,MEMBR,NN,DISNN, X FLAG,DISS) IMPLICIT DOUBLE PRECISION (A-H, O-Z) DOUBLE PRECISION MEMBR(N),DISS(LEN) INTEGER IA(N),IB(N) DOUBLE PRECISION CRIT(N) DIMENSION NN(N),DISNN(N) LOGICAL FLAG(N) DOUBLE PRECISION INF c was 1D+20 DATA INF/1.D+300/ c c unnecessary initialization of im jj jm to keep g77 -Wall happy c IM = 0 JJ = 0 JM = 0 C C Initializations C DO I=1,N c MEMBR(I)=1. FLAG(I)=.TRUE. ENDDO NCL=N C C Construct dissimilarity matrix C C DO I=1,N-1 C DO J=I+1,N C IND=IOFFSET(N,I,J) C DISS(IND)=0. C DO K=1,M C DISS(IND)=DISS(IND)+(DATA(I,K)-DATA(J,K))**2 C ENDDO C IF (IOPT.EQ.1) DISS(IND)=DISS(IND)/2. C (Above is done for the case of the min. var. method C where merging criteria are defined in terms of variances C rather than distances.) C ENDDO C ENDDO C C Carry out an agglomeration - first create list of NNs C DO I=1,N-1 DMIN=INF DO J=I+1,N IND=IOFFSET(N,I,J) IF (DISS(IND).GE.DMIN) GOTO 500 DMIN=DISS(IND) JM=J 500 CONTINUE ENDDO NN(I)=JM DISNN(I)=DMIN ENDDO C 400 CONTINUE C Next, determine least diss. using list of NNs DMIN=INF DO I=1,N-1 IF (.NOT.FLAG(I)) GOTO 600 IF (DISNN(I).GE.DMIN) GOTO 600 DMIN=DISNN(I) IM=I JM=NN(I) 600 CONTINUE ENDDO NCL=NCL-1 C C This allows an agglomeration to be carried out. C I2=MIN0(IM,JM) J2=MAX0(IM,JM) IA(N-NCL)=I2 IB(N-NCL)=J2 CRIT(N-NCL)=DMIN C C Update dissimilarities from new cluster. C FLAG(J2)=.FALSE. DMIN=INF DO K=1,N IF (.NOT.FLAG(K)) GOTO 800 IF (K.EQ.I2) GOTO 800 X=MEMBR(I2)+MEMBR(J2)+MEMBR(K) IF (I2.LT.K) THEN IND1=IOFFSET(N,I2,K) ELSE IND1=IOFFSET(N,K,I2) ENDIF IF (J2.LT.K) THEN IND2=IOFFSET(N,J2,K) ELSE IND2=IOFFSET(N,K,J2) ENDIF IND3=IOFFSET(N,I2,J2) XX=DISS(IND3) C C WARD'S MINIMUM VARIANCE METHOD - IOPT=1. C IF (IOPT.EQ.1) THEN DISS(IND1)=(MEMBR(I2)+MEMBR(K))*DISS(IND1)+ X (MEMBR(J2)+MEMBR(K))*DISS(IND2)- X MEMBR(K)*XX DISS(IND1)=DISS(IND1)/X ENDIF C C SINGLE LINK METHOD - IOPT=2. C IF (IOPT.EQ.2) THEN DISS(IND1)=MIN(DISS(IND1),DISS(IND2)) ENDIF C C COMPLETE LINK METHOD - IOPT=3. C IF (IOPT.EQ.3) THEN DISS(IND1)=MAX(DISS(IND1),DISS(IND2)) ENDIF C C AVERAGE LINK (OR GROUP AVERAGE) METHOD - IOPT=4. C IF (IOPT.EQ.4) THEN DISS(IND1)=(MEMBR(I2)*DISS(IND1)+MEMBR(J2)*DISS(IND2))/ X (MEMBR(I2)+MEMBR(J2)) ENDIF C C MCQUITTY'S METHOD - IOPT=5. C IF (IOPT.EQ.5) THEN DISS(IND1)=0.5*DISS(IND1)+0.5*DISS(IND2) ENDIF C C MEDIAN (GOWER'S) METHOD - IOPT=6. C IF (IOPT.EQ.6) THEN DISS(IND1)=0.5*DISS(IND1)+0.5*DISS(IND2)-0.25*XX ENDIF C C CENTROID METHOD - IOPT=7. C IF (IOPT.EQ.7) THEN DISS(IND1)=(MEMBR(I2)*DISS(IND1)+MEMBR(J2)*DISS(IND2)- X MEMBR(I2)*MEMBR(J2)*XX/(MEMBR(I2)+MEMBR(J2)))/ X (MEMBR(I2)+MEMBR(J2)) ENDIF C IF (I2.GT.K) GOTO 800 IF (DISS(IND1).GE.DMIN) GOTO 800 DMIN=DISS(IND1) JJ=K 800 CONTINUE ENDDO MEMBR(I2)=MEMBR(I2)+MEMBR(J2) DISNN(I2)=DMIN NN(I2)=JJ C C Update list of NNs insofar as this is required. C This part modified by Chi Ming Yau and PL. For methods IOPT=6 and 7 C use modified updating of nearest neighbors that is a bit slower but C necessary. 
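C The reason is that the median and centroid criteria are not monotone
C under the Lance-Williams update: a merged cluster can move CLOSER to
C an object K than K's stored nearest neighbour was, so the cached pair
C (NN(K), DISNN(K)) can become stale even when NN(K) is neither I2 nor
C J2. Hence every active object must be compared against the new
C cluster I2, as done below.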
IF (IOPT.GT.5) THEN DO I=1,N-1 IF (.NOT.FLAG(I)) GOTO 900 IF (I.EQ.I2) GOTO 850 IF (NN(I).EQ.I2) GOTO 850 IF (NN(I).EQ.J2) GOTO 850 C Compare DISNN(I) with updated DISS between I and I2 IF (I2.LT.I) THEN IND=IOFFSET(N,I2,I) ELSE IND=IOFFSET(N,I,I2) ENDIF DMIN=DISS(IND) IF (DMIN.GE.DISNN(I)) GOTO 900 DISNN(I)=DMIN NN(I)=I2 GOTO 900 850 CONTINUE C (Redetermine NN of I:) DMIN=INF DO J=I+1,N IND=IOFFSET(N,I,J) IF (.NOT.FLAG(J)) GOTO 870 IF (I.EQ.J) GOTO 870 IF (DISS(IND).GE.DMIN) GOTO 870 DMIN=DISS(IND) JJ=J 870 CONTINUE ENDDO NN(I)=JJ DISNN(I)=DMIN 900 CONTINUE ENDDO ELSE C For methods IOPT<6 use the original fast update. DO I=1,N-1 IF (.NOT.FLAG(I)) GOTO 901 IF (NN(I).EQ.I2) GOTO 851 IF (NN(I).EQ.J2) GOTO 851 GOTO 901 851 CONTINUE C (Redetermine NN of I:) DMIN=INF DO J=I+1,N IND=IOFFSET(N,I,J) IF (.NOT.FLAG(J)) GOTO 871 IF (I.EQ.J) GOTO 871 IF (DISS(IND).GE.DMIN) GOTO 871 DMIN=DISS(IND) JJ=J 871 CONTINUE ENDDO NN(I)=JJ DISNN(I)=DMIN 901 CONTINUE ENDDO ENDIF C C Repeat previous steps until N-1 agglomerations carried out. C IF (NCL.GT.1) GOTO 400 C C RETURN END C C FUNCTION IOFFSET(N,I,J) C Map row I and column J of upper half diagonal symmetric matrix C onto vector. IMPLICIT DOUBLE PRECISION (A-H, O-Z) C Convert integer I to a double C This hopefully prevents overflow errors when I^2 is greater than C 2^31. IF (N.GT.32768) THEN XI = DBLE(I) IOFFSET=J+NINT( (XI-1)*N - (XI*(XI+1))/2) ELSE IOFFSET=J+(I-1)*N-(I*(I+1))/2 ENDIF RETURN END flashClust/man/0000755000176000001440000000000012014737207013170 5ustar ripleyusersflashClust/man/flashClust.Rd0000644000176000001440000001221111724275033015565 0ustar ripleyusers\name{flashClust} \alias{flashClust} \alias{hclust} \title{ Faster alternative to hclust } \description{ This function implements optimal hierarchical clustering with the same interface as \code{\link{hclust}}. } \usage{ hclust(d, method = "complete", members=NULL) flashClust(d, method = "complete", members=NULL) } \arguments{ \item{d}{ a dissimilarity structure as produced by 'dist'.} \item{method}{ the agglomeration method to be used. This should be (an unambiguous abbreviation of) one of \code{"ward"}, \code{"single"}, \code{"complete"}, \code{"average"}, \code{"mcquitty"}, \code{"median"} or \code{"centroid"}. } \item{members}{\code{NULL} or a vector with length size of \code{d}. See the \sQuote{Details} section.} } \details{ See the description of \code{\link{hclust}} for details on available clustering methods. If \code{members!=NULL}, then \code{d} is taken to be a dissimilarity matrix between clusters instead of dissimilarities between singletons and \code{members} gives the number of observations per cluster. This way the hierarchical cluster algorithm can be \sQuote{started in the middle of the dendrogram}, e.g., in order to reconstruct the part of the tree above a cut (see examples). Dissimilarities between clusters can be efficiently computed (i.e., without \code{hclust} itself) only for a limited number of distance/linkage combinations, the simplest one being squared Euclidean distance and centroid linkage. In this case the dissimilarities between the clusters are the squared Euclidean distances between cluster means. \code{flashClust} is a wrapper for compatibility with older code. } \value{ Returned value is the same as that of \code{\link{hclust}}: An object of class \bold{hclust} which describes the tree produced by the clustering process. The object is a list with components: \item{merge}{an \eqn{n-1} by 2 matrix. 
Row \eqn{i} of \code{merge} describes the merging of clusters at step
\eqn{i} of the clustering. If an element \eqn{j} in the row is negative,
then observation \eqn{-j} was merged at this stage. If \eqn{j} is positive
then the merge was with the cluster formed at the (earlier) stage \eqn{j}
of the algorithm. Thus negative entries in \code{merge} indicate
agglomerations of singletons, and positive entries indicate agglomerations
of non-singletons.}

\item{height}{a set of \eqn{n-1} non-decreasing real values. The
clustering \emph{height}: that is, the value of the criterion associated
with the clustering \code{method} for the particular agglomeration.}

\item{order}{a vector giving the permutation of the original observations
suitable for plotting, in the sense that a cluster plot using this
ordering and matrix \code{merge} will not have crossings of the branches.}

\item{labels}{labels for each of the objects being clustered.}

\item{call}{the call which produced the result.}

\item{method}{the cluster method that has been used.}

\item{dist.method}{the distance that has been used to create \code{d}
(only returned if the distance object has a \code{"method"} attribute).}
}
\references{
This implementation is mentioned in

Peter Langfelder, Steve Horvath (2012). Fast R Functions for Robust
Correlations and Hierarchical Clustering. \emph{Journal of Statistical
Software}, \bold{46}(11), 1--17. \url{http://www.jstatsoft.org/v46/i11/}

F. Murtagh's software web site:
\url{http://www.classification-society.org/csna/mda-sw/}, section 6.

Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988). \emph{The New S
Language}. Wadsworth \& Brooks/Cole. (S version.)

Everitt, B. (1974). \emph{Cluster Analysis}. London: Heinemann Educ. Books.

Hartigan, J. A. (1975). \emph{Clustering Algorithms}. New York: Wiley.

Sneath, P. H. A. and R. R. Sokal (1973). \emph{Numerical Taxonomy}. San
Francisco: Freeman.

Anderberg, M. R. (1973). \emph{Cluster Analysis for Applications}. New
York: Academic Press.

Gordon, A. D. (1999). \emph{Classification}. Second Edition. London:
Chapman and Hall / CRC.

Murtagh, F. (1985). \dQuote{Multidimensional Clustering Algorithms}, in
\emph{COMPSTAT Lectures 4}. Wuerzburg: Physica-Verlag (for algorithmic
details of algorithms used).

McQuitty, L. L. (1966). Similarity Analysis by Reciprocal Pairs for
Discrete and Continuous Data. \emph{Educational and Psychological
Measurement}, \bold{26}, 825--831.
}
\author{ Fionn Murtagh, adapted and packaged by Peter Langfelder }
\seealso{ \code{\link{hclust}} }
\examples{
# Generate some data to cluster
set.seed(1)
nNodes = 2000

# Random "distance" matrix
dst = matrix(runif(n = nNodes^2, min = 0, max = 1), nNodes, nNodes)

# Time the flashClust clustering
system.time({ h1 = hclust(as.dist(dst), method = "average") })

# Time the standard R clustering
system.time({ h2 = stats::hclust(as.dist(dst), method = "average") })

all.equal(h1, h2)
# What is different:
h1[[6]]
h2[[6]]
# Everything but the 'call' component is the same; in particular, the
# trees are exactly equal.
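# The 'members' argument ("Details" above) lets the clustering be restarted
# in the middle of the dendrogram. A sketch adapted from the example in the
# standard 'hclust' help page, using squared Euclidean distance with
# centroid linkage: cut the tree at 10 clusters, then rebuild its top from
# the distances between cluster means.
hc <- hclust(dist(USArrests)^2, method = "centroid")
memb <- cutree(hc, k = 10)
cent <- NULL
for (k in 1:10)
  cent <- rbind(cent, colMeans(USArrests[memb == k, , drop = FALSE]))
hc1 <- hclust(dist(cent)^2, method = "centroid", members = table(memb))
# The 9 heights in hc1 should match the last 9 merge heights in hc.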
}
\keyword{multivariate}
\keyword{cluster}
flashClust/NAMESPACE0000644000176000001440000000006011142441276013627 0ustar ripleyusersuseDynLib(flashClust)
exportPattern("^[^\\.]")
flashClust/DESCRIPTION0000644000176000001440000000075612014751057014132 0ustar ripleyusersPackage: flashClust
Version: 1.01-2
Date: 2012-08-21
Title: Implementation of optimal hierarchical clustering
Author: code by Fionn Murtagh and R development team, modifications and
        packaging by Peter Langfelder
Maintainer: Peter Langfelder
Depends: R (>= 2.3.0)
ZipData: no
License: GPL (>= 2)
Description: Fast implementation of hierarchical clustering
Packaged: 2012-08-21 17:18:30 UTC; plangfelder
Repository: CRAN
Date/Publication: 2012-08-21 18:23:43
flashClust/Changelog0000644000176000001440000000174312014737162014234 0ustar ripleyusers2012-08-21: 1.01-2
  . Added the stats routine hcass2 to the package to avoid calling an
    undocumented internal function in the base distribution of R.
2012-03-02: 1.01-1
  . Added citation.
2011-03-20: 1.01
  . Fixed incorrect clustering with methods "centroid" and "median". Thanks
    to Chi Ming Yau and Daniel Müllner for pointing out the error and
    suggesting a fix.
  . Speed is now for the most part back at the level of version 1.00-2,
    while remaining capable of working with larger data sets.
2011-02-06: 1.00-3
  . Fixed overflow errors that occurred when the number of clustered objects
    exceeded roughly 45000. The code should now be able to cluster as many
    as 65000 objects, given enough memory.
2010-07-19: 1.00-2
  . Added a timing example in the help file.
2010-02-19: 1.00
  . Added a wrapper called hclust so that the fast version of hclust is used
    automatically after loading the package.
2009-11-13: 0.10-1
  . flashClust now checks that the input distance structure actually
    contains some data.
flashClust/inst/0000755000176000001440000000000011724274253013376 5ustar ripleyusersflashClust/inst/CITATION0000644000176000001440000000137411724274240014534 0ustar ripleyuserscitHeader("To cite flashClust in publications use:")

citEntry(entry = "Article",
  title = "Fast {R} Functions for Robust Correlations and Hierarchical Clustering",
  author = personList(as.person("Peter Langfelder"),
                      as.person("Steve Horvath")),
  journal = "Journal of Statistical Software",
  year = "2012",
  volume = "46",
  number = "11",
  pages = "1--17",
  url = "http://www.jstatsoft.org/v46/i11/",
  textVersion = paste("Peter Langfelder, Steve Horvath (2012).",
    "Fast R Functions for Robust Correlations and Hierarchical Clustering.",
    "Journal of Statistical Software, 46(11), 1-17.",
    "URL http://www.jstatsoft.org/v46/i11/.")
)
flashClust/R/0000755000176000001440000000000012014741163012612 5ustar ripleyusersflashClust/R/murtagh.R0000644000176000001440000000515112014737620014411 0ustar ripleyusers# Code by F. Murtagh, http://astro.u-strasbg.fr/~fmurtagh/mda-sw/splus
# modified by Peter Langfelder to make it compatible with R's standard hclust

flashClust <- function(d, method = "complete", members = NULL)
{
  hclust(d, method, members)
}

hclust <- function(d, method = "complete", members = NULL)
{
  # Hierarchical clustering on a precomputed dissimilarity structure.
  # A range of criteria are supported; also there is a storage-economic option.
  # Author: F. Murtagh, May 1992
  METHODS <- c("ward", "single", "complete", "average", "mcquitty",
               "median", "centroid")
  method <- pmatch(method, METHODS)
  if (is.na(method))
    stop("Invalid clustering method")
  if (method == -1)
    stop("Ambiguous clustering method")

  n = attr(d, "Size")
  len = length(d)
  if (len != (n * (n - 1) / 2))
    stop("Distance structure appears invalid.")
  if (n == 1 || len == 0)
    stop("The distance structure is empty.")

  if (is.null(members))
  {
    members <- rep(1, n)
  } else if (length(members) != n)
    stop("invalid length of members")

  # We choose the general routine, `hc', which caters for 7 criteria using a
  # half dissimilarity matrix. (BTW, this uses the very efficient nearest
  # neighbor chain algorithm, which makes this algorithm of O(n^2)
  # computational time, and differentiates it from the less efficient -- i.e.
  # O(n^3) -- implementations in all commercial statistical packages -- as
  # far as I am aware -- except Clustan.)
  hcl <- .Fortran("hc",
                  n = as.integer(n),
                  len = as.integer(len),
                  method = as.integer(method),
                  ia = integer(n),
                  ib = integer(n),
                  crit = double(n),
                  membr = as.double(members),
                  nn = integer(n),
                  disnn = double(n),
                  flag = logical(n),
                  diss = as.double(d),
                  PACKAGE = "flashClust")

  # Second step: interpret the sequence of agglomerations returned by `hc' as
  # the merge, height, and order components of an 'hclust' object.

  #PL: not clear what this iclass is supposed to be for.
  #iclass <- matrix(0.0, n, n)
  #storage.mode(iclass) <- "integer"

  hcass <- .Fortran("hcass2",
                    n = as.integer(n),
                    ia = as.integer(hcl$ia),
                    ib = as.integer(hcl$ib),
                    order = integer(n),
                    iia = integer(n),
                    iib = integer(n),
                    PACKAGE = "flashClust")

  # Note the parentheses in 1:(n-1): the original 1:n-1 parses as (1:n)-1 and
  # gives the same result only because an index of 0 is silently dropped.
  merge <- cbind(hcass$iia[1:(n - 1)], hcass$iib[1:(n - 1)])

  hhh <- list(merge = merge,
              height = hcl$crit[1:(n - 1)],
              order = hcass$order,
              labels = attr(d, "Labels"),
              method = METHODS[method],
              call = match.call(),
              dist.method = attr(d, "method"))
  class(hhh) = "hclust"
  hhh
}
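
# A hypothetical R transcription (for illustration only; not used by the
# package) of the Fortran IOFFSET function's small-n branch: the pair (i, j)
# with i < j of the n-object dissimilarity matrix is stored at position
# j + (i-1)*n - i*(i+1)/2 of the packed half-matrix vector -- the same
# layout as R's 'dist' objects, which is why 'd' can be passed straight
# through to the Fortran routine above.
ioffset <- function(n, i, j) j + (i - 1) * n - i * (i + 1) / 2
d <- dist(matrix(rnorm(20), 5))        # n = 5 objects, 10 pairs
m <- as.matrix(d)
c(d[ioffset(5, 1, 2)] == m[1, 2],
  d[ioffset(5, 2, 5)] == m[2, 5],
  d[ioffset(5, 4, 5)] == m[4, 5])      # all TRUE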