fastcluster/0000755000176000001440000000000012147576001012644 5ustar ripleyusersfastcluster/MD50000644000176000001440000000232012147576001013151 0ustar ripleyusersb0e1b52b6fa51a802059e03f170f8027 *DESCRIPTION 37dbb4755f02aba9b95c112a93152d25 *INSTALL eaea37b33dd51bceb21b161ea85fe747 *LICENSE da8e9d68585993250a9c29c3e9bff50b *NAMESPACE dd1cb51855132adca6e9b8ab5bd565c3 *NEWS 65782671c8354dc2ac12129fe8b7d689 *R/fastcluster.R 43b78dcbb252cd4d3d5e5fe641b41aab *README 459081fd7078ab4eadf2e3ce7e45bab1 *inst/CITATION a1289792d2e116da849a72bc13964124 *inst/doc/fastcluster.Rtex 2ce84ff899c99412a838d58cd1970a18 *inst/doc/fastcluster.pdf b83c0e89648a60eec214604e4fb6b041 *man/fastcluster.Rd 78eb0a01c2f2900d76ca1443877e434a *man/hclust.Rd 6217d92d1ce743dbce72210f115d4588 *man/hclust.vector.Rd 2123c50ed57a3c92ffcc659df6b189a0 *python/fastcluster.py 40cbae9d01c1c7e45672ecd6f4adba63 *python/setup.py 4cc703cbf06f22141f03299e48ab750b *python/test/nantest.py eeff082e57d8bca82a53c4223db6b7e4 *python/test/test.py 576863482602b23291cd44e1c079491b *python/test/vectortest.py 97bb0f9bf046e498c47423129fc3691a *src/Makevars 7b8a328733afe582986d5292e9c91278 *src/Makevars.win 08f32ff087008133c2dce9f9755be365 *src/fastcluster.cpp 197262bac0e3bee80ea8516cc1fdafa4 *src/fastcluster_R.cpp 086b8e1b7593e514f7d8aa7cbb5ebe38 *src/fastcluster_python.cpp 7372e601478ed07613127d22232b32b7 *tests/test_fastcluster.R fastcluster/tests/0000755000176000001440000000000012147324442014006 5ustar ripleyusersfastcluster/tests/test_fastcluster.R0000644000176000001440000001546611727666564017564 0ustar ripleyusers# fastcluster: Fast hierarchical clustering routines for R and Python # # Copyright © 2011 Daniel Müllner # # # Test script for the R interface seed = as.integer(runif(1, 0, 1e9)) set.seed(seed) cat(sprintf("Random seed: %d\n",seed)) print_seed <- function() { return(sprintf(' Please send a report to the author of the \'fastcluster\' package, Daniel Müllner. For contact details, see . 
To make the error reproducible, you must include the following number (the random seed value) in your error report: %d.\n\n', seed)) } # Compare two dendrograms and check whether they are equal, except that # ties may be resolved differently. compare <- function(dg1, dg2) { h1 <- dg1$height h2 <- dg2$height # "height" vectors may have small numerical errors. rdiffs <- abs(h1-h2)/pmax(abs(h1),abs(h2)) rdiffs = rdiffs[complete.cases(rdiffs)] rel_error <- max(rdiffs) # We allow a relative error of 1e-13. if (rel_error>1e-13) { print(h1) print(h2) cat(sprintf('Height vectors differ! The maximum relative error is %e.\n', rel_error)) return(FALSE) } # Filter the indices where consecutive merging distances are distinct. d = diff(dg1$height) b = (c(d,1)!=0 & c(1,d)!=0) #cat(sprintf("Percentage of indices where we can test: %g.\n",100.0*length(b[b])/length(b))) if (any(b)) { m1 = dg1$merge[b,] m2 = dg2$merge[b,] r = function(i) { if (i<0) { return(1) } else { return(b[i]) } } f = sapply(m1,r) fm1 = m1*f fm2 = m2*f # The "merge" matrices must be identical whereever indices are not ambiguous # due to ties. if (!identical(fm1,fm2)) { cat('Merge matrices differ!\n') return(FALSE) } # Compare the "order" vectors only if all merging distances were distinct. 
if (all(b) && !identical(dg1$order,dg2$order)) { cat('Order vectors differ!\n') return(FALSE) } } return(TRUE) } # Generate uniformly distributed random data generate.uniform <- function() { n = sample(10:1000,1) range_exp = runif(1,min=-10, max=10) cat(sprintf("Number of sample points: %d\n",n)) cat(sprintf("Dissimilarity range: [0,%g]\n",10^range_exp)) d = runif(n*(n-1)/2, min=0, max=10^range_exp) # Fake a compressed distance matrix attributes(d) <- NULL attr(d,"Size") <- n attr(d, "call") <- 'N/A' class(d) <- "dist" return(d) } # Generate normally distributed random data generate.normal <- function() { n = sample(10:1000,1) dim = sample(2:20,1) cat (sprintf("Number of sample points: %d\n",n)) cat (sprintf("Dimension: %d\n",dim)) pcd = matrix(rnorm(n*dim), c(n,dim)) d = dist(pcd) return(d) } # Test the clustering functions when a distance matrix is given. test.dm <- function(d) { d2 = d for (method in c('single','complete','average','mcquitty','ward','centroid','median') ) { cat(paste('Method :', method, '\n')) dg_fastcluster = fastcluster::hclust(d, method=method) dg_stats = stats::hclust(d, method=method) if (!identical(d,d2)) { cat('Input array was corrupted!\n') stop(print_seed()) } if (!compare(dg_stats, dg_fastcluster)) { stop(print_seed()) } } cat('Passed.\n') } # Test the clustering functions for vector input in Euclidean space. 
test.vector <- function() { # generate test data n = sample(10:1000,1) dim = sample(2:20,1) cat (sprintf("Number of sample points: %d\n",n)) cat (sprintf("Dimension: %d\n",dim)) range_exp = runif(1,min=-10, max=10) pcd = matrix(rnorm(n*dim, sd=10^range_exp), c(n,dim)) pcd2 = pcd # test method='single' cat(paste('Method:', method, '\n')) for (metric in c('euclidean', 'maximum', 'manhattan', 'canberra', 'minkowski')) { cat(paste(' Metric:', metric, '\n')) if (metric=='minkowski') { p = runif(1, min=1.0, max=10.0) cat (sprintf(" p: %g\n",p)); dg_fastcluster = fastcluster::hclust.vector(pcd, method=method, metric=metric, p=p) d = dist(pcd, method=metric, p=p) } else { dg_fastcluster = fastcluster::hclust.vector(pcd, method=method, metric=metric) d = dist(pcd, method=metric) } d2 = d dg_fastcluster_dist = fastcluster::hclust(d, method=method) if (!identical(d,d2) || !identical(pcd,pcd2)) { cat('Input array was corrupted!\n') stop(print_seed()) } if (!compare(dg_fastcluster_dist, dg_fastcluster)) { stop(print_seed()) } } for (method in c('ward','centroid','median') ) { cat(paste('Method:', method, '\n')) dg_fastcluster = fastcluster::hclust.vector(pcd, method=method) if (!identical(pcd,pcd2)) { cat('Input array was corrupted!\n') stop(print_seed()) } d = dist(pcd) # Workaround: fastcluster::hclust expects _squared_ euclidean distances. d = d^2 d2 = d dg_fastcluster_dist = fastcluster::hclust(d, method=method) if (!identical(d,d2)) { cat('Input array was corrupted!\n') stop(print_seed()) } dg_fastcluster_dist$height = sqrt(dg_fastcluster_dist$height) # The Euclidean methods may have small numerical errors due to squaring/ # taking the root in the Euclidean distances. 
if (!compare(dg_fastcluster_dist, dg_fastcluster)) { stop(print_seed()) } } cat('Passed.\n') } # Test the single linkage function with the "binary" metric test.vector.binary <- function() { # generate test data cat (sprintf("Uniform sampling for the 'binary' metric:\n")) n = sample(10:400,1) dim = sample(n:(2*n),1) cat (sprintf("Number of sample points: %d\n",n)) cat (sprintf("Dimension: %d\n",dim)) pcd = matrix(sample(-1:2, n*dim, replace=T), c(n,dim)) pcd2 = pcd # test method='single' metric='binary' cat(paste('Method:', method, '\n')) cat(paste(' Metric:', metric, '\n')) dg_fastcluster = fastcluster::hclust.vector(pcd, method=method, metric=metric) d = dist(pcd, method=metric) d2 = d dg_fastcluster_dist = fastcluster::hclust(d, method=method) if (!identical(d,d2) || !identical(d,d2)) { cat('Input array was corrupted!\n') stop(print_seed()) } if (!compare(dg_fastcluster_dist, dg_fastcluster)) { stop(print_seed()) } cat('Passed.\n') } N = 15 for (i in (1:N)) { if (i%%2==1) { cat(sprintf('Random test %d of %d (uniform distribution of distances):\n',i,2*N)) d = generate.uniform() } else { cat(sprintf('Random test %d of %d (Gaussian density):\n',i,2*N)) d = generate.normal() } test.dm(d) } for (i in (N+1:N)) { cat(sprintf('Random test %d of %d (Gaussian density):\n',i,2*N)) test.vector() test.vector.binary() } cat('Done.\n') fastcluster/src/0000755000176000001440000000000012147443476013444 5ustar ripleyusersfastcluster/src/fastcluster_python.cpp0000644000176000001440000010614012147521216020077 0ustar ripleyusers/* fastcluster: Fast hierarchical clustering routines for R and Python Copyright © 2011 Daniel Müllner */ // for INT32_MAX in fastcluster.cpp // This must be defined here since Python.h loads the header file pyport.h, // and from this stdint.h. INT32_MAX is defined in stdint.h, but only if // __STDC_LIMIT_MACROS is defined. 
#define __STDC_LIMIT_MACROS #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6)) #define HAVE_DIAGNOSTIC 1 #endif #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wswitch-default" #pragma GCC diagnostic ignored "-Wpadded" #pragma GCC diagnostic ignored "-Wlong-long" #pragma GCC diagnostic ignored "-Wformat" #endif #include #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wlong-long" #pragma GCC diagnostic ignored "-Wpedantic" #pragma GCC diagnostic ignored "-Wpadded" #pragma GCC diagnostic ignored "-Wcast-qual" #endif #include #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif /* It's complicated, but if I do not include the C++ math headers, GCC will complain about conversions from 'double' to 'float', whenever 'isnan' is called in a templated function (but not outside templates). The '#include ' seems to cure the problem. */ //#include #define fc_isnan(X) ((X)!=(X)) // There is Py_IS_NAN but it is so much slower on my x86_64 system with GCC! #include // for std::ptrdiff_t #include // for std::numeric_limits<...>::infinity() #include // for std::stable_sort #include // for std::bad_alloc #include // for std::exception #include "fastcluster.cpp" // backwards compatibility #ifndef NPY_ARRAY_CARRAY_RO #define NPY_ARRAY_CARRAY_RO NPY_CARRAY_RO #endif /* Since the public interface is given by the Python respectively R interface, * we do not want other symbols than the interface initalization routines to be * visible in the shared object file. The "visibility" switch is a GCC concept. * Hiding symbols keeps the relocation table small and decreases startup time. * See http://gcc.gnu.org/wiki/Visibility */ #if HAVE_VISIBILITY #pragma GCC visibility push(hidden) #endif /* Convenience class for the output array: automatic counter. 
*/ class linkage_output { private: t_float * Z; public: linkage_output(t_float * const Z_) : Z(Z_) {} void append(const t_index node1, const t_index node2, const t_float dist, const t_float size) { if (node1(node1); *(Z++) = static_cast(node2); } else { *(Z++) = static_cast(node2); *(Z++) = static_cast(node1); } *(Z++) = dist; *(Z++) = size; } }; /* Generate the SciPy-specific output format for a dendrogram from the clustering output. The list of merging steps can be sorted or unsorted. */ // The size of a node is either 1 (a single point) or is looked up from // one of the clusters. #define size_(r_) ( ((r_ static void generate_SciPy_dendrogram(t_float * const Z, cluster_result & Z2, const t_index N) { // The array "nodes" is a union-find data structure for the cluster // identities (only needed for unsorted cluster_result input). union_find nodes(sorted ? 0 : N); if (!sorted) { std::stable_sort(Z2[0], Z2[N-1]); } linkage_output output(Z); t_index node1, node2; for (node const * NN=Z2[0]; NN!=Z2[N-1]; ++NN) { // Get two data points whose clusters are merged in step i. if (sorted) { node1 = NN->node1; node2 = NN->node2; } else { // Find the cluster identifiers for these points. node1 = nodes.Find(NN->node1); node2 = nodes.Find(NN->node2); // Merge the nodes in the union-find data structure by making them // children of a new node. nodes.Union(node1, node2); } output.append(node1, node2, NN->dist, size_(node1)+size_(node2)); } } /* Python interface code */ static PyObject * linkage_wrap(PyObject * const self, PyObject * const args); static PyObject * linkage_vector_wrap(PyObject * const self, PyObject * const args); // List the C++ methods that this extension provides. static PyMethodDef _fastclusterWrapMethods[] = { {"linkage_wrap", linkage_wrap, METH_VARARGS, NULL}, {"linkage_vector_wrap", linkage_vector_wrap, METH_VARARGS, NULL}, {NULL, NULL, 0, NULL} /* Sentinel - marks the end of this structure */ }; /* Tell Python about these methods. 
Python 2.x and 3.x differ in their C APIs for this part. */ #if PY_VERSION_HEX >= 0x03000000 static struct PyModuleDef fastclustermodule = { PyModuleDef_HEAD_INIT, "_fastcluster", NULL, // no module documentation -1, /* size of per-interpreter state of the module, or -1 if the module keeps state in global variables. */ _fastclusterWrapMethods, NULL, NULL, NULL, NULL }; /* Make the interface initalization routines visible in the shared object * file. */ #if HAVE_VISIBILITY #pragma GCC visibility push(default) #endif PyMODINIT_FUNC PyInit__fastcluster(void) { PyObject * m; m = PyModule_Create(&fastclustermodule); if (!m) { return NULL; } import_array(); // Must be present for NumPy. Called first after above line. return m; } #if HAVE_VISIBILITY #pragma GCC visibility pop #endif # else // Python 2.x #if HAVE_VISIBILITY #pragma GCC visibility push(default) #endif PyMODINIT_FUNC init_fastcluster(void) { (void) Py_InitModule("_fastcluster", _fastclusterWrapMethods); import_array(); // Must be present for NumPy. Called first after above line. } #if HAVE_VISIBILITY #pragma GCC visibility pop #endif #endif // PY_VERSION class GIL_release { private: // noncopyable GIL_release(GIL_release const &); GIL_release & operator=(GIL_release const &); public: inline GIL_release(bool really = true) : _save(really ? PyEval_SaveThread() : NULL) { } inline ~GIL_release() { if (_save) PyEval_RestoreThread(_save); } private: PyThreadState * _save; }; /* Interface to Python, part 1: The input is a dissimilarity matrix. 
*/ static PyObject *linkage_wrap(PyObject * const, PyObject * const args) { PyArrayObject * D, * Z; long int N_ = 0; unsigned char method; try{ #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif // Parse the input arguments if (!PyArg_ParseTuple(args, "lO!O!b", &N_, // signed long integer &PyArray_Type, &D, // NumPy array &PyArray_Type, &Z, // NumPy array &method)) { // unsigned char return NULL; // Error if the arguments have the wrong type. } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (N_ < 1 ) { // N must be at least 1. PyErr_SetString(PyExc_ValueError, "At least one element is needed for clustering."); return NULL; } /* (1) The biggest index used below is 4*(N-2)+3, as an index to Z. This must fit into the data type used for indices. (2) The largest representable integer, without loss of precision, by a floating point number of type t_float is 2^T_FLOAT_MANT_DIG. Here, we make sure that all cluster labels from 0 to 2N-2 in the output can be accurately represented by a floating point number. Conversion of N to 64 bits below is not really necessary but it prevents a warning ("shift count >= width of type") on systems where "long int" is 32 bits wide. */ if (N_ > MAX_INDEX/4 || static_cast(N_-1)>>(T_FLOAT_MANT_DIG-1) > 0) { PyErr_SetString(PyExc_ValueError, "Data is too big, index overflow."); return NULL; } t_index N = static_cast(N_); if (method>METHOD_METR_MEDIAN) { PyErr_SetString(PyExc_IndexError, "Invalid method index."); return NULL; } // Allow threads! GIL_release G; t_float * const D_ = reinterpret_cast(PyArray_DATA(D)); cluster_result Z2(N-1); auto_array_ptr members; // For these methods, the distance update formula needs the number of // data points in a cluster. if (method==METHOD_METR_AVERAGE || method==METHOD_METR_WARD || method==METHOD_METR_CENTROID) { members.init(N, 1); } // Operate on squared distances for these methods. 
if (method==METHOD_METR_WARD || method==METHOD_METR_CENTROID || method==METHOD_METR_MEDIAN) { for (t_float * DD = D_; DD!=D_+static_cast(N)*(N-1)/2; ++DD) *DD *= *DD; } switch (method) { case METHOD_METR_SINGLE: MST_linkage_core(N, D_, Z2); break; case METHOD_METR_COMPLETE: NN_chain_core(N, D_, NULL, Z2); break; case METHOD_METR_AVERAGE: NN_chain_core(N, D_, members, Z2); break; case METHOD_METR_WEIGHTED: NN_chain_core(N, D_, NULL, Z2); break; case METHOD_METR_WARD: NN_chain_core(N, D_, members, Z2); break; case METHOD_METR_CENTROID: generic_linkage(N, D_, members, Z2); break; default: // case METHOD_METR_MEDIAN generic_linkage(N, D_, NULL, Z2); } if (method==METHOD_METR_WARD || method==METHOD_METR_CENTROID || method==METHOD_METR_MEDIAN) { Z2.sqrt(); } t_float * const Z_ = reinterpret_cast(PyArray_DATA(Z)); if (method==METHOD_METR_CENTROID || method==METHOD_METR_MEDIAN) { generate_SciPy_dendrogram(Z_, Z2, N); } else { generate_SciPy_dendrogram(Z_, Z2, N); } } // try catch (const std::bad_alloc&) { return PyErr_NoMemory(); } catch(const std::exception& e){ PyErr_SetString(PyExc_EnvironmentError, e.what()); return NULL; } catch(const nan_error&){ PyErr_SetString(PyExc_FloatingPointError, "NaN dissimilarity value."); return NULL; } #ifdef FE_INVALID catch(const fenv_error&){ PyErr_SetString(PyExc_FloatingPointError, "NaN dissimilarity value in intermediate results."); return NULL; } #endif catch(...){ PyErr_SetString(PyExc_EnvironmentError, "C++ exception (unknown reason). 
Please send a bug report."); return NULL; } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif Py_RETURN_NONE; #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } /* Part 2: Clustering on vector data */ enum { // metrics METRIC_EUCLIDEAN = 0, METRIC_MINKOWSKI = 1, METRIC_CITYBLOCK = 2, METRIC_SEUCLIDEAN = 3, METRIC_SQEUCLIDEAN = 4, METRIC_COSINE = 5, METRIC_HAMMING = 6, METRIC_JACCARD = 7, METRIC_CHEBYCHEV = 8, METRIC_CANBERRA = 9, METRIC_BRAYCURTIS = 10, METRIC_MAHALANOBIS = 11, METRIC_YULE = 12, METRIC_MATCHING = 13, METRIC_DICE = 14, METRIC_ROGERSTANIMOTO = 15, METRIC_RUSSELLRAO = 16, METRIC_SOKALSNEATH = 17, METRIC_KULSINSKI = 18, METRIC_USER = 19, METRIC_INVALID = 20, // sentinel METRIC_JACCARD_BOOL = 21, // separate function for Jaccard metric on }; // Boolean input data /* Helper class: Throw this if calling the Python interpreter from within C returned an error. */ class pythonerror {}; /* This class handles all the information about the dissimilarity computation. */ class python_dissimilarity { private: t_float * Xa; std::ptrdiff_t dim; // size_t saves many statis_cast<> in products t_index N; auto_array_ptr Xnew; t_index * members; void (cluster_result::*postprocessfn) (const t_float) const; t_float postprocessarg; t_float (python_dissimilarity::*distfn) (const t_index, const t_index) const; // for user-defined metrics PyObject * X_Python; PyObject * userfn; auto_array_ptr precomputed; t_float * precomputed2; PyArrayObject * V; const t_float * V_data; // noncopyable python_dissimilarity(); python_dissimilarity(python_dissimilarity const &); python_dissimilarity & operator=(python_dissimilarity const &); public: // Ignore warning about uninitialized member variables. I know what I am // doing here, and some member variables are only used for certain metrics. 
#if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Weffc++" #endif python_dissimilarity (PyArrayObject * const Xarg, t_index * const members_, const unsigned char method, const unsigned char metric, PyObject * const extraarg, bool temp_point_array) : Xa(reinterpret_cast(PyArray_DATA(Xarg))), dim(PyArray_DIM(Xarg, 1)), N(static_cast(PyArray_DIM(Xarg, 0))), Xnew(temp_point_array ? (N-1)*dim : 0), members(members_), postprocessfn(NULL), V(NULL) { switch (method) { case METHOD_METR_SINGLE: postprocessfn = NULL; // default switch (metric) { case METRIC_EUCLIDEAN: set_euclidean(); break; case METRIC_SEUCLIDEAN: if (extraarg==NULL) { PyErr_SetString(PyExc_TypeError, "The 'seuclidean' metric needs a variance parameter."); throw pythonerror(); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif V = reinterpret_cast(PyArray_FromAny(extraarg, PyArray_DescrFromType(NPY_DOUBLE), 1, 1, NPY_ARRAY_CARRAY_RO, NULL)); #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (PyErr_Occurred()) { throw pythonerror(); } if (PyArray_DIM(V, 0)!=dim) { PyErr_SetString(PyExc_ValueError, "The variance vector must have the same dimensionality as the data."); throw pythonerror(); } V_data = reinterpret_cast(PyArray_DATA(V)); distfn = &python_dissimilarity::seuclidean; postprocessfn = &cluster_result::sqrt; break; case METRIC_SQEUCLIDEAN: distfn = &python_dissimilarity::sqeuclidean; break; case METRIC_CITYBLOCK: set_cityblock(); break; case METRIC_CHEBYCHEV: set_chebychev(); break; case METRIC_MINKOWSKI: set_minkowski(extraarg); break; case METRIC_COSINE: distfn = &python_dissimilarity::cosine; postprocessfn = &cluster_result::plusone; // precompute norms precomputed.init(N); for (t_index i=0; i(dim); break; case METRIC_JACCARD: distfn = &python_dissimilarity::jaccard; break; case METRIC_CANBERRA: distfn = &python_dissimilarity::canberra; break; case METRIC_BRAYCURTIS: distfn = 
&python_dissimilarity::braycurtis; break; case METRIC_MAHALANOBIS: if (extraarg==NULL) { PyErr_SetString(PyExc_TypeError, "The 'mahalanobis' metric needs a parameter for the inverse covariance."); throw pythonerror(); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif V = reinterpret_cast(PyArray_FromAny(extraarg, PyArray_DescrFromType(NPY_DOUBLE), 2, 2, NPY_ARRAY_CARRAY_RO, NULL)); #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (PyErr_Occurred()) { throw pythonerror(); } if (PyArray_DIM(V, 0)!=N || PyArray_DIM(V, 1)!=dim) { PyErr_SetString(PyExc_ValueError, "The inverse covariance matrix has the wrong size."); throw pythonerror(); } V_data = reinterpret_cast(PyArray_DATA(V)); distfn = &python_dissimilarity::mahalanobis; postprocessfn = &cluster_result::sqrt; break; case METRIC_YULE: distfn = &python_dissimilarity::yule; break; case METRIC_MATCHING: distfn = &python_dissimilarity::matching; postprocessfn = &cluster_result::divide; postprocessarg = static_cast(dim); break; case METRIC_DICE: distfn = &python_dissimilarity::dice; break; case METRIC_ROGERSTANIMOTO: distfn = &python_dissimilarity::rogerstanimoto; break; case METRIC_RUSSELLRAO: distfn = &python_dissimilarity::russellrao; postprocessfn = &cluster_result::divide; postprocessarg = static_cast(dim); break; case METRIC_SOKALSNEATH: distfn = &python_dissimilarity::sokalsneath; break; case METRIC_KULSINSKI: distfn = &python_dissimilarity::kulsinski; postprocessfn = &cluster_result::plusone; precomputed.init(N); for (t_index i=0; i(sum); } break; case METRIC_USER: X_Python = reinterpret_cast(Xarg); this->userfn = extraarg; distfn = &python_dissimilarity::user; break; default: // case METRIC_JACCARD_BOOL: distfn = &python_dissimilarity::jaccard_bool; } break; case METHOD_METR_WARD: postprocessfn = &cluster_result::sqrtdouble; break; default: postprocessfn = &cluster_result::sqrt; } } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif 
~python_dissimilarity() { #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif Py_XDECREF(V); #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } inline t_float operator () (const t_index i, const t_index j) const { return (this->*distfn)(i,j); } inline t_float X (const t_index i, const t_index j) const { return Xa[i*dim+j]; } inline bool Xb (const t_index i, const t_index j) const { return reinterpret_cast(Xa)[i*dim+j]; } inline t_float * Xptr(const t_index i, const t_index j) const { return Xa+i*dim+j; } void merge(const t_index i, const t_index j, const t_index newnode) const { t_float const * const Pi = i(members[i]) + Pj[k]*static_cast(members[j])) / static_cast(members[i]+members[j]); } members[newnode] = members[i]+members[j]; } void merge_weighted(const t_index i, const t_index j, const t_index newnode) const { t_float const * const Pi = i(members[i]) + Pj[k]*static_cast(members[j])) / static_cast(members[i]+members[j]); } members[j] += members[i]; } void merge_inplace_weighted(const t_index i, const t_index j) const { t_float const * const Pi = Xa+i*dim; t_float * const Pj = Xa+j*dim; for(t_index k=0; k(members[i]); t_float mj = static_cast(members[j]); return sqeuclidean(i,j)*mi*mj/(mi+mj); } inline t_float ward_initial(const t_index i, const t_index j) const { // alias for sqeuclidean // Factor 2!!! return sqeuclidean(i,j); } // This method must not produce NaN if the input is non-NaN. inline static t_float ward_initial_conversion(const t_float min) { return min*.5; } inline t_float ward_extended(const t_index i, const t_index j) const { t_float mi = static_cast(members[i]); t_float mj = static_cast(members[j]); return sqeuclidean_extended(i,j)*mi*mj/(mi+mj); } /* We need two variants of the Euclidean metric: one that does not check for a NaN result, which is used for the initial distances, and one which does, for the updated distances during the clustering procedure. 
*/ template t_float sqeuclidean(const t_index i, const t_index j) const { t_float sum = 0; /* for (t_index k=0; k::infinity()) { set_chebychev(); } else if (postprocessarg==1.0){ set_cityblock(); } else if (postprocessarg==2.0){ set_euclidean(); } else { distfn = &python_dissimilarity::minkowski; postprocessfn = &cluster_result::power; } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } void set_euclidean() { distfn = &python_dissimilarity::sqeuclidean; postprocessfn = &cluster_result::sqrt; } void set_cityblock() { distfn = &python_dissimilarity::cityblock; } void set_chebychev() { distfn = &python_dissimilarity::chebychev; } t_float seuclidean(const t_index i, const t_index j) const { t_float sum = 0; for (t_index k=0; kmax) { max = diff; } } return max; } t_float cosine(const t_index i, const t_index j) const { t_float sum = 0; for (t_index k=0; k(sum1) / static_cast(sum2); } t_float canberra(const t_index i, const t_index j) const { t_float sum = 0; for (t_index k=0; k(dim)-NTT-NXO); // NFFTT } void nbool_correspond_xo(const t_index i, const t_index j) const { NXO = 0; for (t_index k=0; k(2*NTFFT) / static_cast(NTFFT + NFFTT); } // Prevent a zero denominator for equal vectors. t_float dice(const t_index i, const t_index j) const { nbool_correspond(i, j); return (NXO==0) ? 0 : static_cast(NXO) / static_cast(NXO+2*NTT); } t_float rogerstanimoto(const t_index i, const t_index j) const { nbool_correspond_xo(i, j); return static_cast(2*NXO) / static_cast(NXO+dim); } t_float russellrao(const t_index i, const t_index j) const { nbool_correspond_tt(i, j); return static_cast(dim-NTT); } // Prevent a zero denominator for equal vectors. t_float sokalsneath(const t_index i, const t_index j) const { nbool_correspond(i, j); return (NXO==0) ? 
0 : static_cast(2*NXO) / static_cast(NTT+2*NXO); } t_float kulsinski(const t_index i, const t_index j) const { nbool_correspond_tt(i, j); return static_cast(NTT) * (precomputed[i] + precomputed[j]); } // 'matching' distance = Hamming distance t_float matching(const t_index i, const t_index j) const { nbool_correspond_xo(i, j); return static_cast(NXO); } // Prevent a zero denominator for equal vectors. t_float jaccard_bool(const t_index i, const t_index j) const { nbool_correspond(i, j); return (NXO==0) ? 0 : static_cast(NXO) / static_cast(NXO+NTT); } }; static PyObject *linkage_vector_wrap(PyObject * const, PyObject * const args) { PyArrayObject * X, * Z; unsigned char method, metric; PyObject * extraarg; try{ // Parse the input arguments #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif if (!PyArg_ParseTuple(args, "O!O!bbO", &PyArray_Type, &X, // NumPy array &PyArray_Type, &Z, // NumPy array &method, // unsigned char &metric, // unsigned char &extraarg )) { // Python object return NULL; } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (PyArray_NDIM(X) != 2) { PyErr_SetString(PyExc_ValueError, "The input array must be two-dimensional."); } npy_intp const N_ = PyArray_DIM(X, 0); if (N_ < 1 ) { // N must be at least 1. PyErr_SetString(PyExc_ValueError, "At least one element is needed for clustering."); return NULL; } npy_intp const dim = PyArray_DIM(X, 1); if (dim < 1 ) { PyErr_SetString(PyExc_ValueError, "Invalid dimension of the data set."); return NULL; } /* (1) The biggest index used below is 4*(N-2)+3, as an index to Z. This must fit into the data type used for indices. (2) The largest representable integer, without loss of precision, by a floating point number of type t_float is 2^T_FLOAT_MANT_DIG. Here, we make sure that all cluster labels from 0 to 2N-2 in the output can be accurately represented by a floating point number. 
Conversion of N to 64 bits below is not really necessary but it prevents a warning ("shift count >= width of type") on systems where "int" is 32 bits wide. */ if (N_ > MAX_INDEX/4 || dim > MAX_INDEX || static_cast(N_-1)>>(T_FLOAT_MANT_DIG-1) > 0) { PyErr_SetString(PyExc_ValueError, "Data is too big, index overflow."); return NULL; } t_index N = static_cast(N_); cluster_result Z2(N-1); auto_array_ptr members; if (method==METHOD_METR_WARD || method==METHOD_METR_CENTROID) { members.init(2*N-1, 1); } if ((method!=METHOD_METR_SINGLE && metric!=METRIC_EUCLIDEAN) || metric>=METRIC_INVALID) { PyErr_SetString(PyExc_IndexError, "Invalid metric index."); return NULL; } if (PyArray_ISBOOL(X)) { if (metric==METRIC_HAMMING) { metric = METRIC_MATCHING; // Alias } if (metric==METRIC_JACCARD) { metric = METRIC_JACCARD_BOOL; } } if (extraarg!=Py_None && metric!=METRIC_MINKOWSKI && metric!=METRIC_SEUCLIDEAN && metric!=METRIC_MAHALANOBIS && metric!=METRIC_USER) { PyErr_SetString(PyExc_TypeError, "No extra parameter is allowed for this metric."); return NULL; } /* temp_point_array must be true if the alternative algorithm is used below (currently for the centroid and median methods). */ bool temp_point_array = (method==METHOD_METR_CENTROID || method==METHOD_METR_MEDIAN); python_dissimilarity dist(X, members, method, metric, extraarg, temp_point_array); if (method!=METHOD_METR_SINGLE && method!=METHOD_METR_WARD && method!=METHOD_METR_CENTROID && method!=METHOD_METR_MEDIAN) { PyErr_SetString(PyExc_IndexError, "Invalid method index."); return NULL; } // Allow threads if the metric is not "user"! 
GIL_release G(metric!=METRIC_USER); switch (method) { case METHOD_METR_SINGLE: MST_linkage_core_vector(N, dist, Z2); break; case METHOD_METR_WARD: generic_linkage_vector(N, dist, Z2); break; case METHOD_METR_CENTROID: generic_linkage_vector_alternative(N, dist, Z2); break; default: // case METHOD_METR_MEDIAN: generic_linkage_vector_alternative(N, dist, Z2); } if (method==METHOD_METR_WARD || method==METHOD_METR_CENTROID) { members.free(); } dist.postprocess(Z2); t_float * const Z_ = reinterpret_cast(PyArray_DATA(Z)); if (method!=METHOD_METR_SINGLE) { generate_SciPy_dendrogram(Z_, Z2, N); } else { generate_SciPy_dendrogram(Z_, Z2, N); } } // try catch (const std::bad_alloc&) { return PyErr_NoMemory(); } catch(const std::exception& e){ PyErr_SetString(PyExc_EnvironmentError, e.what()); return NULL; } catch(const nan_error&){ PyErr_SetString(PyExc_FloatingPointError, "NaN dissimilarity value."); return NULL; } catch(const pythonerror){ return NULL; } catch(...){ PyErr_SetString(PyExc_EnvironmentError, "C++ exception (unknown reason). Please send a bug report."); return NULL; } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif Py_RETURN_NONE; #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } #if HAVE_VISIBILITY #pragma GCC visibility pop #endif fastcluster/src/fastcluster_R.cpp0000644000176000001440000006420212147521216016761 0ustar ripleyusers/* fastcluster: Fast hierarchical clustering routines for R and Python Copyright © 2011 Daniel Müllner */ #if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6)) #define HAVE_DIAGNOSTIC 1 #endif #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wredundant-decls" #pragma GCC diagnostic ignored "-Wpadded" #endif #include #include #include // for R_pow #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #define fc_isnan(X) ((X)!=(X)) // There is ISNAN but it is so much slower on my x86_64 system with GCC! 
#include // for std::ptrdiff_t #include // for std::numeric_limits<...>::infinity() #include // for std::stable_sort #include // for std::runtime_error #include // for std::string #include // for std::bad_alloc #include // for std::exception #include "fastcluster.cpp" /* Since the public interface is given by the Python respectively R interface, * we do not want other symbols than the interface initalization routines to be * visible in the shared object file. The "visibility" switch is a GCC concept. * Hiding symbols keeps the relocation table small and decreases startup time. * See http://gcc.gnu.org/wiki/Visibility */ #if HAVE_VISIBILITY #pragma GCC visibility push(hidden) #endif /* Helper function: order the nodes so that they can be displayed nicely in a dendrogram. This is used for the 'order' field in the R output. */ #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wpadded" #endif struct pos_node { t_index pos; int node; }; #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif void order_nodes(const int N, const int * const merge, const t_index * const node_size, int * const order) { /* Parameters: N : number of data points merge : (N-1)×2 array which specifies the node indices which are merged in each step of the clustering procedure. Negative entries -1...-N point to singleton nodes, while positive entries 1...(N-1) point to nodes which are themselves parents of other nodes. node_size : array of node sizes - makes it easier order : output array of size N Runtime: Θ(N) */ auto_array_ptr queue(N/2); int parent; int child; t_index pos = 0; queue[0].pos = 0; queue[0].node = N-2; t_index idx = 1; do { --idx; pos = queue[idx].pos; parent = queue[idx].node; // First child child = merge[parent]; if (child<0) { // singleton node, write this into the 'order' array. order[pos] = -child; ++pos; } else { /* compound node: put it on top of the queue and decompose it in a later iteration. 
*/ queue[idx].pos = pos; queue[idx].node = child-1; // convert index-1 based to index-0 based ++idx; pos += node_size[child-1]; } // Second child child = merge[parent+N-1]; if (child<0) { order[pos] = -child; } else { queue[idx].pos = pos; queue[idx].node = child-1; ++idx; } } while (idx>0); } #define size_(r_) ( ((r_ void generate_R_dendrogram(int * const merge, double * const height, int * const order, cluster_result & Z2, const int N) { // The array "nodes" is a union-find data structure for the cluster // identites (only needed for unsorted cluster_result input). union_find nodes(sorted ? 0 : N); if (!sorted) { std::stable_sort(Z2[0], Z2[N-1]); } t_index node1, node2; auto_array_ptr node_size(N-1); for (t_index i=0; inode1; node2 = Z2[i]->node2; } else { node1 = nodes.Find(Z2[i]->node1); node2 = nodes.Find(Z2[i]->node2); // Merge the nodes in the union-find data structure by making them // children of a new node. nodes.Union(node1, node2); } // Sort the nodes in the output array. if (node1>node2) { t_index tmp = node1; node1 = node2; node2 = tmp; } /* Conversion between labeling conventions. 
Input: singleton nodes 0,...,N-1 compound nodes N,...,2N-2 Output: singleton nodes -1,...,-N compound nodes 1,...,N */ merge[i] = (node1(node1)-1 : static_cast(node1)-N+1; merge[i+N-1] = (node2(node2)-1 : static_cast(node2)-N+1; height[i] = Z2[i]->dist; node_size[i] = size_(node1) + size_(node2); } order_nodes(N, merge, node_size, order); } /* R interface code */ enum { METRIC_R_EUCLIDEAN = 0, METRIC_R_MAXIMUM = 1, METRIC_R_MANHATTAN = 2, METRIC_R_CANBERRA = 3, METRIC_R_BINARY = 4, METRIC_R_MINKOWSKI = 5 }; #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wpadded" #endif class R_dissimilarity { private: t_float * Xa; std::ptrdiff_t dim; // std::ptrdiff_t saves many statis_cast<> in products t_float * members; void (cluster_result::*postprocessfn) (const t_float) const; t_float postprocessarg; t_float (R_dissimilarity::*distfn) (const t_index, const t_index) const; auto_array_ptr row_repr; int N; // no default constructor R_dissimilarity(); // noncopyable R_dissimilarity(R_dissimilarity const &); R_dissimilarity & operator=(R_dissimilarity const &); public: // Ignore warning about uninitialized member variables. I know what I am // doing here, and some member variables are only used for certain metrics. 
#if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Weffc++" #endif R_dissimilarity (t_float * const X_, const int N_, const int dim_, t_float * const members_, const unsigned char method, const unsigned char metric, const t_float p, bool make_row_repr) : Xa(X_), dim(dim_), members(members_), postprocessfn(NULL), postprocessarg(p), N(N_) { switch (method) { case METHOD_VECTOR_SINGLE: switch (metric) { case METRIC_R_EUCLIDEAN: distfn = &R_dissimilarity::sqeuclidean; postprocessfn = &cluster_result::sqrt; break; case METRIC_R_MAXIMUM: distfn = &R_dissimilarity::maximum; break; case METRIC_R_MANHATTAN: distfn = &R_dissimilarity::manhattan; break; case METRIC_R_CANBERRA: distfn = &R_dissimilarity::canberra; break; case METRIC_R_BINARY: distfn = &R_dissimilarity::dist_binary; break; case METRIC_R_MINKOWSKI: distfn = &R_dissimilarity::minkowski; postprocessfn = &cluster_result::power; break; default: throw std::runtime_error(std::string("Invalid method.")); } break; case METHOD_VECTOR_WARD: postprocessfn = &cluster_result::sqrtdouble; break; default: postprocessfn = &cluster_result::sqrt; } if (make_row_repr) { row_repr.init(2*N-1); for (t_index i=0; i*distfn)(i,j); } inline t_float X (const t_index i, const t_index j) const { // "C-style" array alignment return Xa[i*dim+j]; } inline t_float * Xptr(const t_index i, const t_index j) const { // "C-style" array alignment return Xa+i*dim+j; } void merge(const t_index i, const t_index j, const t_index newnode) const { merge_inplace(row_repr[i], row_repr[j]); row_repr[newnode] = row_repr[j]; } void merge_inplace(const t_index i, const t_index j) const { for(t_index k=0; k(i1,i2)*members[i1]*members[i2]/ \ (members[i1]+members[i2]); } inline double ward_initial(t_index const i1, t_index const i2) const { /* In the R interface, ward_initial is the same as ward. Only the Python interface has two different functions here. 
*/ return ward(i1,i2); } // This method must not produce NaN if the input is non-NaN. inline static t_float ward_initial_conversion(const t_float min) { // identity return min; } double ward_extended(t_index i1, t_index i2) const { return ward(row_repr[i1], row_repr[i2]); } /* The following definitions and methods have been taken directly from the R source file /src/library/stats/src/distance.c in the R release 2.13.0. The code has only been adapted very slightly. (Unfortunately, the methods cannot be called directly in the R libraries since the functions are declared "static" in the above file.) Note to maintainers: If the code in distance.c changes in future R releases compared to 2.13.0, please update the definitions here, if necessary. */ // translation of variable names #define nc dim #define nr N #define x Xa #define p postprocessarg // The code from distance.c starts here #define both_FINITE(a,b) (R_FINITE(a) && R_FINITE(b)) #ifdef R_160_and_older #define both_non_NA both_FINITE #else #define both_non_NA(a,b) (!ISNAN(a) && !ISNAN(b)) #endif /* We need two variants of the Euclidean metric: one that does not check for a NaN result, which is used for the initial distances, and one which does, for the updated distances during the clustering procedure. 
*/ // still public template double sqeuclidean(t_index const i1, t_index const i2) const { double dev, dist; int count, j; count = 0; dist = 0; double * p1 = x+i1*nc; double * p2 = x+i2*nc; for(j = 0 ; j < nc ; ++j) { if(both_non_NA(*p1, *p2)) { dev = (*p1 - *p2); if(!ISNAN(dev)) { dist += dev * dev; ++count; } } ++p1; ++p2; } if(count == 0) return NA_REAL; if(count != nc) dist /= (static_cast(count)/static_cast(nc)); //return sqrt(dist); // we take the square root later if (check_NaN) { #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(dist)) throw(nan_error()); #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } return dist; } inline double sqeuclidean_extended(t_index const i1, t_index const i2) const { return sqeuclidean(row_repr[i1], row_repr[i2]); } private: double maximum(t_index i1, t_index i2) const { double dev, dist; int count, j; count = 0; dist = -DBL_MAX; double * p1 = x+i1*nc; double * p2 = x+i2*nc; for(j = 0 ; j < nc ; ++j) { if(both_non_NA(*p1, *p2)) { dev = fabs(*p1 - *p2); if(!ISNAN(dev)) { if(dev > dist) dist = dev; ++count; } } ++p1; ++p2; } if(count == 0) return NA_REAL; return dist; } double manhattan(t_index i1, t_index i2) const { double dev, dist; int count, j; count = 0; dist = 0; double * p1 = x+i1*nc; double * p2 = x+i2*nc; for(j = 0 ; j < nc ; ++j) { if(both_non_NA(*p1, *p2)) { dev = fabs(*p1 - *p2); if(!ISNAN(dev)) { dist += dev; ++count; } } ++p1; ++p2; } if(count == 0) return NA_REAL; if(count != nc) dist /= (static_cast(count)/static_cast(nc)); return dist; } double canberra(t_index i1, t_index i2) const { double dev, dist, sum, diff; int count, j; count = 0; dist = 0; double * p1 = x+i1*nc; double * p2 = x+i2*nc; for(j = 0 ; j < nc ; ++j) { if(both_non_NA(*p1, *p2)) { sum = fabs(*p1 + *p2); diff = fabs(*p1 - *p2); if (sum > DBL_MIN || diff > DBL_MIN) { dev = diff/sum; #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" 
#endif if(!ISNAN(dev) || (!R_FINITE(diff) && diff == sum && /* use Inf = lim x -> oo */ (dev = 1.))) { dist += dev; ++count; } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } } ++p1; ++p2; } if(count == 0) return NA_REAL; if(count != nc) dist /= (static_cast(count)/static_cast(nc)); return dist; } double dist_binary(t_index i1, t_index i2) const { int total, count, dist; int j; total = 0; count = 0; dist = 0; double * p1 = x+i1*nc; double * p2 = x+i2*nc; for(j = 0 ; j < nc ; ++j) { if(both_non_NA(*p1, *p2)) { if(!both_FINITE(*p1, *p2)) { // warning(_("treating non-finite values as NA")); } else { #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if(*p1 || *p2) { ++count; if( ! (*p1 && *p2) ) { ++dist; } } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif ++total; } } ++p1; ++p2; } if(total == 0) return NA_REAL; if(count == 0) return 0; return static_cast(dist) / static_cast(count); } double minkowski(t_index i1, t_index i2) const { double dev, dist; int count, j; count= 0; dist = 0; double * p1 = x+i1*nc; double * p2 = x+i2*nc; for(j = 0 ; j < nc ; ++j) { if(both_non_NA(*p1, *p2)) { dev = (*p1 - *p2); if(!ISNAN(dev)) { dist += R_pow(fabs(dev), p); ++count; } } ++p1; ++p2; } if(count == 0) return NA_REAL; if(count != nc) dist /= (static_cast(count)/static_cast(nc)); //return R_pow(dist, 1.0/p); // raise to the (1/p)-th power later return dist; } }; #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif extern "C" { SEXP fastcluster(SEXP const N_, SEXP const method_, SEXP D_, SEXP members_) { SEXP r = NULL; // return value try{ /* Input checks */ // Parameter N: number of data points PROTECT(N_); if (!IS_INTEGER(N_) || LENGTH(N_)!=1) Rf_error("'N' must be a single integer."); const int N = *INTEGER_POINTER(N_); if (N<2) Rf_error("N must be at least 2."); const std::ptrdiff_t NN = static_cast(N)*(N-1)/2; UNPROTECT(1); // N_ // Parameter method: dissimilarity index update method PROTECT(method_); if 
(!IS_INTEGER(method_) || LENGTH(method_)!=1) Rf_error("'method' must be a single integer."); const int method = *INTEGER_POINTER(method_) - 1; // index-0 based; if (methodMETHOD_METR_MEDIAN) { Rf_error("Invalid method index."); } UNPROTECT(1); // method_ // Parameter members: number of members in each node auto_array_ptr members; if (method==METHOD_METR_AVERAGE || method==METHOD_METR_WARD || method==METHOD_METR_CENTROID) { members.init(N); if (Rf_isNull(members_)) { for (t_index i=0; i D__; if (method!=METHOD_METR_SINGLE) { D__.init(NN); for (std::ptrdiff_t i=0; i(N, D__, NULL, Z2); break; case METHOD_METR_AVERAGE: NN_chain_core(N, D__, members, Z2); break; case METHOD_METR_WEIGHTED: NN_chain_core(N, D__, NULL, Z2); break; case METHOD_METR_WARD: NN_chain_core(N, D__, members, Z2); break; case METHOD_METR_CENTROID: generic_linkage(N, D__, members, Z2); break; case METHOD_METR_MEDIAN: generic_linkage(N, D__, NULL, Z2); break; default: throw std::runtime_error(std::string("Invalid method.")); } D__.free(); // Free the memory now members.free(); // (not strictly necessary). 
SEXP m; // return field "merge" PROTECT(m = NEW_INTEGER(2*(N-1))); int * const merge = INTEGER_POINTER(m); SEXP dim_m; // Specify that m is an (N-1)×2 matrix PROTECT(dim_m = NEW_INTEGER(2)); INTEGER(dim_m)[0] = N-1; INTEGER(dim_m)[1] = 2; SET_DIM(m, dim_m); SEXP h; // return field "height" PROTECT(h = NEW_NUMERIC(N-1)); double * const height = NUMERIC_POINTER(h); SEXP o; // return fiels "order' PROTECT(o = NEW_INTEGER(N)); int * const order = INTEGER_POINTER(o); if (method==METHOD_METR_CENTROID || method==METHOD_METR_MEDIAN) generate_R_dendrogram(merge, height, order, Z2, N); else generate_R_dendrogram(merge, height, order, Z2, N); SEXP n; // names PROTECT(n = NEW_CHARACTER(3)); SET_STRING_ELT(n, 0, COPY_TO_USER_STRING("merge")); SET_STRING_ELT(n, 1, COPY_TO_USER_STRING("height")); SET_STRING_ELT(n, 2, COPY_TO_USER_STRING("order")); PROTECT(r = NEW_LIST(3)); // field names in the output list SET_ELEMENT(r, 0, m); SET_ELEMENT(r, 1, h); SET_ELEMENT(r, 2, o); SET_NAMES(r, n); UNPROTECT(6); // m, dim_m, h, o, r, n } // try catch (const std::bad_alloc&) { Rf_error( "Memory overflow."); } catch(const std::exception& e){ Rf_error( e.what() ); } catch(const nan_error&){ Rf_error("NaN dissimilarity value."); } #ifdef FE_INVALID catch(const fenv_error&){ Rf_error( "NaN dissimilarity value in intermediate results."); } #endif catch(...){ Rf_error( "C++ exception (unknown reason)." 
); } return r; } SEXP fastcluster_vector(SEXP const method_, SEXP const metric_, SEXP X_, SEXP members_, SEXP p_) { SEXP r = NULL; // return value try{ /* Input checks */ // Parameter method: dissimilarity index update method PROTECT(method_); if (!IS_INTEGER(method_) || LENGTH(method_)!=1) Rf_error("'method' must be a single integer."); int method = *INTEGER_POINTER(method_) - 1; // index-0 based; if (methodMETHOD_VECTOR_MEDIAN) { Rf_error("Invalid method index."); } UNPROTECT(1); // method_ // Parameter metric PROTECT(metric_); if (!IS_INTEGER(metric_) || LENGTH(metric_)!=1) Rf_error("'metric' must be a single integer."); int metric = *INTEGER_POINTER(metric_) - 1; // index-0 based; if (metric<0 || metric>5 || (method!=METHOD_VECTOR_SINGLE && metric!=0) ) { Rf_error("Invalid metric index."); } UNPROTECT(1); // metric_ // data array PROTECT(X_ = AS_NUMERIC(X_)); SEXP dims_ = PROTECT( Rf_getAttrib( X_, R_DimSymbol ) ) ; if( dims_ == R_NilValue || LENGTH(dims_) != 2 ) { Rf_error( "Argument is not a matrix."); } const int * const dims = INTEGER(dims_); const int N = dims[0]; const int dim = dims[1]; if (N<2) Rf_error("There must be at least two data points."); // Make a working copy of the dissimilarity array // for all methods except "single". 
double * X__ = NUMERIC_POINTER(X_); // Copy the input array and change it from Fortran-contiguous style // to C-contiguous style // (Waste of memory for 'single'; the other methods need a copy auto_array_ptr X(LENGTH(X_)); for (std::ptrdiff_t i=0; i members; if (method==METHOD_VECTOR_WARD || method==METHOD_VECTOR_CENTROID) { members.init(N); if (Rf_isNull(members_)) { for (t_index i=0; i(method), static_cast(metric), p, make_row_repr); cluster_result Z2(N-1); /* Clustering step */ switch (method) { case METHOD_VECTOR_SINGLE: MST_linkage_core_vector(N, dist, Z2); break; case METHOD_VECTOR_WARD: generic_linkage_vector(N, dist, Z2); break; case METHOD_VECTOR_CENTROID: generic_linkage_vector_alternative(N, dist, Z2); break; case METHOD_VECTOR_MEDIAN: generic_linkage_vector_alternative(N, dist, Z2); break; default: throw std::runtime_error(std::string("Invalid method.")); } X.free(); // Free the memory now members.free(); // (not strictly necessary). dist.postprocess(Z2); SEXP m; // return field "merge" PROTECT(m = NEW_INTEGER(2*(N-1))); int * const merge = INTEGER_POINTER(m); SEXP dim_m; // Specify that m is an (N-1)×2 matrix PROTECT(dim_m = NEW_INTEGER(2)); INTEGER(dim_m)[0] = N-1; INTEGER(dim_m)[1] = 2; SET_DIM(m, dim_m); SEXP h; // return field "height" PROTECT(h = NEW_NUMERIC(N-1)); double * const height = NUMERIC_POINTER(h); SEXP o; // return fiels "order' PROTECT(o = NEW_INTEGER(N)); int * const order = INTEGER_POINTER(o); if (method==METHOD_VECTOR_SINGLE) generate_R_dendrogram(merge, height, order, Z2, N); else generate_R_dendrogram(merge, height, order, Z2, N); SEXP n; // names PROTECT(n = NEW_CHARACTER(3)); SET_STRING_ELT(n, 0, COPY_TO_USER_STRING("merge")); SET_STRING_ELT(n, 1, COPY_TO_USER_STRING("height")); SET_STRING_ELT(n, 2, COPY_TO_USER_STRING("order")); PROTECT(r = NEW_LIST(3)); // field names in the output list SET_ELEMENT(r, 0, m); SET_ELEMENT(r, 1, h); SET_ELEMENT(r, 2, o); SET_NAMES(r, n); UNPROTECT(6); // m, dim_m, h, o, r, n } // try catch (const 
std::bad_alloc&) { Rf_error( "Memory overflow."); } catch(const std::exception& e){ Rf_error( e.what() ); } catch(const nan_error&){ Rf_error("NaN dissimilarity value."); } catch(...){ Rf_error( "C++ exception (unknown reason)." ); } return r; } #if HAVE_VISIBILITY #pragma GCC visibility push(default) #endif void R_init_fastcluster(DllInfo * const info) { R_CallMethodDef callMethods[] = { {"fastcluster", (DL_FUNC) &fastcluster, 4}, {"fastcluster_vector", (DL_FUNC) &fastcluster_vector, 5}, {NULL, NULL, 0} }; R_registerRoutines(info, NULL, callMethods, NULL, NULL); } #if HAVE_VISIBILITY #pragma GCC visibility pop #endif } // extern "C" #if HAVE_VISIBILITY #pragma GCC visibility pop #endif fastcluster/src/fastcluster.cpp0000644000176000001440000014466712147521216016516 0ustar ripleyusers/* fastcluster: Fast hierarchical clustering routines for R and Python Copyright © 2011 Daniel Müllner This library implements various fast algorithms for hierarchical, agglomerative clustering methods: (1) Algorithms for the "stored matrix approach": the input is the array of pairwise dissimilarities. MST_linkage_core: single linkage clustering with the "minimum spanning tree algorithm (Rohlfs) NN_chain_core: nearest-neighbor-chain algorithm, suitable for single, complete, average, weighted and Ward linkage (Murtagh) generic_linkage: generic algorithm, suitable for all distance update formulas (Müllner) (2) Algorithms for the "stored data approach": the input are points in a vector space. MST_linkage_core_vector: single linkage clustering for vector data generic_linkage_vector: generic algorithm for vector data, suitable for the Ward, centroid and median methods. generic_linkage_vector_alternative: alternative scheme for updating the nearest neighbors. This method seems faster than "generic_linkage_vector" for the centroid and median methods but slower for the Ward method. All these implementation treat infinity values correctly. They throw an exception if a NaN distance value occurs. 
*/ #include // for std::ptrdiff_t #include // for std::numeric_limits<...>::infinity() #include // for std::fill_n #include // for std::runtime_error #include // for std::string // Microsoft Visual Studio does not have fenv.h #ifdef _MSC_VER #if (_MSC_VER == 1500 || _MSC_VER == 1600) #define NO_INCLUDE_FENV #endif #endif #ifndef NO_INCLUDE_FENV #include #endif #include // also for DBL_MAX, DBL_MIN #ifndef DBL_MANT_DIG #error The constant DBL_MANT_DIG could not be defined. #endif #define T_FLOAT_MANT_DIG DBL_MANT_DIG #ifndef LONG_MAX #include #endif #ifndef LONG_MAX #error The constant LONG_MAX could not be defined. #endif #ifndef INT_MAX #error The constant INT_MAX could not be defined. #endif #ifndef INT32_MAX #define __STDC_LIMIT_MACROS #include #endif #ifndef HAVE_DIAGNOSTIC #if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6)) #define HAVE_DIAGNOSTIC 1 #endif #endif #ifndef HAVE_VISIBILITY #if __GNUC__ >= 4 #define HAVE_VISIBILITY 1 #endif #endif /* Since the public interface is given by the Python respectively R interface, * we do not want other symbols than the interface initalization routines to be * visible in the shared object file. The "visibility" switch is a GCC concept. * Hiding symbols keeps the relocation table small and decreases startup time. * See http://gcc.gnu.org/wiki/Visibility */ #if HAVE_VISIBILITY #pragma GCC visibility push(hidden) #endif typedef int_fast32_t t_index; #ifndef INT32_MAX #define MAX_INDEX 0x7fffffffL #else #define MAX_INDEX INT32_MAX #endif #if (LONG_MAX < MAX_INDEX) #error The integer format "t_index" must not have a greater range than "long int". #endif #if (INT_MAX > MAX_INDEX) #error The integer format "int" must not have a greater range than "t_index". 
#endif typedef double t_float; enum method_codes { // non-Euclidean methods METHOD_METR_SINGLE = 0, METHOD_METR_COMPLETE = 1, METHOD_METR_AVERAGE = 2, METHOD_METR_WEIGHTED = 3, METHOD_METR_WARD = 4, METHOD_METR_CENTROID = 5, METHOD_METR_MEDIAN = 6 }; enum { // Euclidean methods METHOD_VECTOR_SINGLE = 0, METHOD_VECTOR_WARD = 1, METHOD_VECTOR_CENTROID = 2, METHOD_VECTOR_MEDIAN = 3 }; enum { // Return values RET_SUCCESS = 0, RET_MEMORY_ERROR = 1, RET_STL_ERROR = 2, RET_UNKNOWN_ERROR = 3 }; // self-destructing array pointer template class auto_array_ptr{ private: type * ptr; auto_array_ptr(auto_array_ptr const &); // non construction-copyable auto_array_ptr& operator=(auto_array_ptr const &); // non copyable public: auto_array_ptr() : ptr(NULL) { } template auto_array_ptr(index const size) : ptr(new type[size]) { } template auto_array_ptr(index const size, value const val) : ptr(new type[size]) { std::fill_n(ptr, size, val); } ~auto_array_ptr() { delete [] ptr; } void free() { delete [] ptr; ptr = NULL; } template void init(index const size) { ptr = new type [size]; } template void init(index const size, value const val) { init(size); std::fill_n(ptr, size, val); } inline operator type *() const { return ptr; } }; struct node { t_index node1, node2; t_float dist; /* inline bool operator< (const node a) const { return this->dist < a.dist; } */ inline friend bool operator< (const node a, const node b) { return (a.dist < b.dist); } }; class cluster_result { private: auto_array_ptr Z; t_index pos; public: cluster_result(const t_index size) : Z(size) , pos(0) {} void append(const t_index node1, const t_index node2, const t_float dist) { Z[pos].node1 = node1; Z[pos].node2 = node2; Z[pos].dist = dist; ++pos; } node * operator[] (const t_index idx) const { return Z + idx; } /* Define several methods to postprocess the distances. All these functions are monotone, so they do not change the sorted order of distances. 
*/ void sqrt() const { for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist = ::sqrt(ZZ->dist); } } void sqrt(const t_float) const { // ignore the argument sqrt(); } void sqrtdouble(const t_float) const { // ignore the argument for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist = ::sqrt(2*ZZ->dist); } } #ifdef R_pow #define my_pow R_pow #else #define my_pow pow #endif void power(const t_float p) const { t_float const q = 1/p; for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist = my_pow(ZZ->dist,q); } } void plusone(const t_float) const { // ignore the argument for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist += 1; } } void divide(const t_float denom) const { for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist /= denom; } } }; class doubly_linked_list { /* Class for a doubly linked list. Initially, the list is the integer range [0, size]. We provide a forward iterator and a method to delete an index from the list. Typical use: for (i=L.start; L succ; private: auto_array_ptr pred; // Not necessarily private, we just do not need it in this instance. public: doubly_linked_list(const t_index size) // Initialize to the given size. : start(0) , succ(size+1) , pred(size+1) { for (t_index i=0; i(2*N-3-(r_))*(r_)>>1)+(c_)-1] ) // Z is an ((N-1)x4)-array #define Z_(_r, _c) (Z[(_r)*4 + (_c)]) /* Lookup function for a union-find data structure. The function finds the root of idx by going iteratively through all parent elements until a root is found. An element i is a root if nodes[i] is zero. To make subsequent searches faster, the entry for idx and all its parents is updated with the root element. */ class union_find { private: auto_array_ptr parent; t_index nextparent; public: union_find(const t_index size) : parent(size>0 ? 
2*size-1 : 0, 0) , nextparent(size) { } t_index Find (t_index idx) const { if (parent[idx] != 0 ) { // a → b t_index p = idx; idx = parent[idx]; if (parent[idx] != 0 ) { // a → b → c do { idx = parent[idx]; } while (parent[idx] != 0); do { t_index tmp = parent[p]; parent[p] = idx; p = tmp; } while (parent[p] != idx); } } return idx; } void Union (const t_index node1, const t_index node2) { parent[node1] = parent[node2] = nextparent++; } }; class nan_error{}; #ifdef FE_INVALID class fenv_error{}; #endif static void MST_linkage_core(const t_index N, const t_float * const D, cluster_result & Z2) { /* N: integer, number of data points D: condensed distance matrix N*(N-1)/2 Z2: output data structure The basis of this algorithm is an algorithm by Rohlf: F. James Rohlf, Hierarchical clustering using the minimum spanning tree, The Computer Journal, vol. 16, 1973, p. 93–95. */ t_index i; t_index idx2; doubly_linked_list active_nodes(N); auto_array_ptr d(N); t_index prev_node; t_float min; // first iteration idx2 = 1; min = std::numeric_limits::infinity(); for (i=1; i tmp) d[i] = tmp; else if (fc_isnan(tmp)) throw (nan_error()); #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (d[i] < min) { min = d[i]; idx2 = i; } } Z2.append(prev_node, idx2, min); } } /* Functions for the update of the dissimilarity array */ inline static void f_single( t_float * const b, const t_float a ) { if (*b > a) *b = a; } inline static void f_complete( t_float * const b, const t_float a ) { if (*b < a) *b = a; } inline static void f_average( t_float * const b, const t_float a, const t_float s, const t_float t) { *b = s*a + t*(*b); #ifndef FE_INVALID #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #endif } inline static void f_weighted( t_float * const b, const t_float a) { *b = (a+*b)*.5; #ifndef FE_INVALID #if HAVE_DIAGNOSTIC #pragma GCC 
diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #endif } inline static void f_ward( t_float * const b, const t_float a, const t_float c, const t_float s, const t_float t, const t_float v) { *b = ( (v+s)*a - v*c + (v+t)*(*b) ) / (s+t+v); //*b = a+(*b)-(t*a+s*(*b)+v*c)/(s+t+v); #ifndef FE_INVALID #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #endif } inline static void f_centroid( t_float * const b, const t_float a, const t_float stc, const t_float s, const t_float t) { *b = s*a - stc + t*(*b); #ifndef FE_INVALID if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #endif } inline static void f_median( t_float * const b, const t_float a, const t_float c_4) { *b = (a+(*b))*.5 - c_4; #ifndef FE_INVALID #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #endif } template static void NN_chain_core(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) { /* N: integer D: condensed distance matrix N*(N-1)/2 Z2: output data structure This is the NN-chain algorithm, described on page 86 in the following book: Fionn Murtagh, Multidimensional Clustering Algorithms, Vienna, Würzburg: Physica-Verlag, 1985. 
*/ t_index i; auto_array_ptr NN_chain(N); t_index NN_chain_tip = 0; t_index idx1, idx2; t_float size1, size2; doubly_linked_list active_nodes(N); t_float min; for (t_float const * DD=D; DD!=D+(static_cast(N)*(N-1)>>1); ++DD) { #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*DD)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } #ifdef FE_INVALID if (feclearexcept(FE_INVALID)) throw fenv_error(); #endif for (t_index j=0; jidx2) { t_index tmp = idx1; idx1 = idx2; idx2 = tmp; } if (method==METHOD_METR_AVERAGE || method==METHOD_METR_WARD) { size1 = static_cast(members[idx1]); size2 = static_cast(members[idx2]); members[idx2] += members[idx1]; } // Remove the smaller index from the valid indices (active_nodes). active_nodes.remove(idx1); switch (method) { case METHOD_METR_SINGLE: /* Single linkage. Characteristic: new distances are never longer than the old distances. */ // Update the distance matrix in the range [start, idx1). for (i=active_nodes.start; i(members[i]); for (i=active_nodes.start; i(members[i]) ); // Update the distance matrix in the range (idx1, idx2). for (; i(members[i]) ); // Update the distance matrix in the range (idx2, N). for (i=active_nodes.succ[idx2]; i(members[i]) ); break; default: throw std::runtime_error(std::string("Invalid method.")); } } #ifdef FE_INVALID if (fetestexcept(FE_INVALID)) throw fenv_error(); #endif } class binary_min_heap { /* Class for a binary min-heap. The data resides in an array A. The elements of A are not changed but two lists I and R of indices are generated which point to elements of A and backwards. The heap tree structure is H[2*i+1] H[2*i+2] \ / \ / ≤ ≤ \ / \ / H[i] where the children must be less or equal than their parent. Thus, H[0] contains the minimum. The lists I and R are made such that H[i] = A[I[i]] and R[I[i]] = i. This implementation is not designed to handle NaN values. 
*/ private: t_float * const A; t_index size; auto_array_ptr I; auto_array_ptr R; // no default constructor binary_min_heap(); // noncopyable binary_min_heap(binary_min_heap const &); binary_min_heap & operator=(binary_min_heap const &); public: binary_min_heap(t_float * const A_, const t_index size_) : A(A_), size(size_), I(size), R(size) { // Allocate memory and initialize the lists I and R to the identity. This // does not make it a heap. Call heapify afterwards! for (t_index i=0; i>1); idx>0; ) { --idx; update_geq_(idx); } } inline t_index argmin() const { // Return the minimal element. return I[0]; } void heap_pop() { // Remove the minimal element from the heap. --size; I[0] = I[size]; R[I[0]] = 0; update_geq_(0); } void remove(t_index idx) { // Remove an element from the heap. --size; R[I[size]] = R[idx]; I[R[idx]] = I[size]; if ( H(size)<=A[idx] ) { update_leq_(R[idx]); } else { update_geq_(R[idx]); } } void replace ( const t_index idxold, const t_index idxnew, const t_float val) { R[idxnew] = R[idxold]; I[R[idxnew]] = idxnew; if (val<=A[idxold]) update_leq(idxnew, val); else update_geq(idxnew, val); } void update ( const t_index idx, const t_float val ) const { // Update the element A[i] with val and re-arrange the indices to preserve // the heap condition. if (val<=A[idx]) update_leq(idx, val); else update_geq(idx, val); } void update_leq ( const t_index idx, const t_float val ) const { // Use this when the new value is not more than the old value. A[idx] = val; update_leq_(R[idx]); } void update_geq ( const t_index idx, const t_float val ) const { // Use this when the new value is not less than the old value. 
A[idx] = val; update_geq_(R[idx]); } private: void update_leq_ (t_index i) const { t_index j; for ( ; (i>0) && ( H(i)>1) ); i=j) heap_swap(i,j); } void update_geq_ (t_index i) const { t_index j; for ( ; (j=2*i+1)=H(i) ) { ++j; if ( j>=size || H(j)>=H(i) ) break; } else if ( j+1 static void generic_linkage(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) { /* N: integer, number of data points D: condensed distance matrix N*(N-1)/2 Z2: output data structure */ const t_index N_1 = N-1; t_index i, j; // loop variables t_index idx1, idx2; // row and column indices auto_array_ptr n_nghbr(N_1); // array of nearest neighbors auto_array_ptr mindist(N_1); // distances to the nearest neighbors auto_array_ptr row_repr(N); // row_repr[i]: node number that the // i-th row represents doubly_linked_list active_nodes(N); binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for // the distance to the nearest neighbor of each point t_index node1, node2; // node numbers in the output t_float size1, size2; // and their cardinalities t_float min; // minimum and row index for nearest-neighbor search t_index idx; for (i=0; ii} D(i,j) for i in range(N-1) t_float const * DD = D; for (i=0; i::infinity(); for (idx=j=i+1; ji} D(i,j) Normally, we have equality. However, this minimum may become invalid due to the updates in the distance matrix. The rules are: 1) If mindist[i] is equal to D(i, n_nghbr[i]), this is the correct minimum and n_nghbr[i] is a nearest neighbor. 2) If mindist[i] is smaller than D(i, n_nghbr[i]), this might not be the correct minimum. The minimum needs to be recomputed. 3) mindist[i] is never bigger than the true minimum. Hence, we never miss the true minimum if we take the smallest mindist entry, re-compute the value if necessary (thus maybe increasing it) and looking for the now smallest mindist entry until a valid minimal entry is found. This step is done in the lines below. 
The update process for D below takes care that these rules are fulfilled. This makes sure that the minima in the rows D(i,i+1:)of D are re-calculated when necessary but re-calculation is avoided whenever possible. The re-calculation of the minima makes the worst-case runtime of this algorithm cubic in N. We avoid this whenever possible, and in most cases the runtime appears to be quadratic. */ idx1 = nn_distances.argmin(); if (method != METHOD_METR_SINGLE) { while ( mindist[idx1] < D_(idx1, n_nghbr[idx1]) ) { // Recompute the minimum mindist[idx1] and n_nghbr[idx1]. n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1 min = D_(idx1,j); for (j=active_nodes.succ[j]; j(members[idx1]); size2 = static_cast(members[idx2]); members[idx2] += members[idx1]; } Z2.append(node1, node2, mindist[idx1]); // Remove idx1 from the list of active indices (active_nodes). active_nodes.remove(idx1); // Index idx2 now represents the new (merged) node with label N+i. row_repr[idx2] = N+i; // Update the distance matrix switch (method) { case METHOD_METR_SINGLE: /* Single linkage. Characteristic: new distances are never longer than the old distances. */ // Update the distance matrix in the range [start, idx1). for (j=active_nodes.start; j(members[j]) ); if (n_nghbr[j] == idx1) n_nghbr[j] = idx2; } // Update the distance matrix in the range (idx1, idx2). for (; j(members[j]) ); if (D_(j, idx2) < mindist[j]) { nn_distances.update_leq(j, D_(j, idx2)); n_nghbr[j] = idx2; } } // Update the distance matrix in the range (idx2, N). if (idx2(members[j]) ); min = D_(idx2,j); for (j=active_nodes.succ[j]; j(members[j]) ); if (D_(idx2,j) < min) { min = D_(idx2,j); n_nghbr[idx2] = j; } } nn_distances.update(idx2, min); } break; case METHOD_METR_CENTROID: { /* Centroid linkage. Shorter and longer distances can occur, not bigger than max(d1,d2) but maybe smaller than min(d1,d2). */ // Update the distance matrix in the range [start, idx1). 
t_float s = size1/(size1+size2); t_float t = size2/(size1+size2); t_float stc = s*t*mindist[idx1]; for (j=active_nodes.start; j static void MST_linkage_core_vector(const t_index N, t_dissimilarity & dist, cluster_result & Z2) { /* N: integer, number of data points dist: function pointer to the metric Z2: output data structure The basis of this algorithm is an algorithm by Rohlf: F. James Rohlf, Hierarchical clustering using the minimum spanning tree, The Computer Journal, vol. 16, 1973, p. 93–95. */ t_index i; t_index idx2; doubly_linked_list active_nodes(N); auto_array_ptr d(N); t_index prev_node; t_float min; // first iteration idx2 = 1; min = std::numeric_limits::infinity(); for (i=1; i tmp) d[i] = tmp; else if (fc_isnan(tmp)) throw (nan_error()); #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (d[i] < min) { min = d[i]; idx2 = i; } } Z2.append(prev_node, idx2, min); } } template static void generic_linkage_vector(const t_index N, t_dissimilarity & dist, cluster_result & Z2) { /* N: integer, number of data points dist: function pointer to the metric Z2: output data structure This algorithm is valid for the distance update methods "Ward", "centroid" and "median" only! 
*/ const t_index N_1 = N-1; t_index i, j; // loop variables t_index idx1, idx2; // row and column indices auto_array_ptr n_nghbr(N_1); // array of nearest neighbors auto_array_ptr mindist(N_1); // distances to the nearest neighbors auto_array_ptr row_repr(N); // row_repr[i]: node number that the // i-th row represents doubly_linked_list active_nodes(N); binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for // the distance to the nearest neighbor of each point t_index node1, node2; // node numbers in the output t_float min; // minimum and row index for nearest-neighbor search for (i=0; ii} D(i,j) for i in range(N-1) for (i=0; i::infinity(); t_index idx; for (idx=j=i+1; j(i,j); } if (tmp(idx1,j); for (j=active_nodes.succ[j]; j(idx1,j); if (tmp(j, idx2); if (tmp < mindist[j]) { nn_distances.update_leq(j, tmp); n_nghbr[j] = idx2; } else if (n_nghbr[j] == idx2) n_nghbr[j] = idx1; // invalidate } // Find the nearest neighbor for idx2. if (idx2(idx2,j); for (j=active_nodes.succ[j]; j(idx2, j); if (tmp < min) { min = tmp; n_nghbr[idx2] = j; } } nn_distances.update(idx2, min); } } } } template static void generic_linkage_vector_alternative(const t_index N, t_dissimilarity & dist, cluster_result & Z2) { /* N: integer, number of data points dist: function pointer to the metric Z2: output data structure This algorithm is valid for the distance update methods "Ward", "centroid" and "median" only! */ const t_index N_1 = N-1; t_index i, j=0; // loop variables t_index idx1, idx2; // row and column indices auto_array_ptr n_nghbr(2*N-2); // array of nearest neighbors auto_array_ptr mindist(2*N-2); // distances to the nearest neighbors doubly_linked_list active_nodes(N+N_1); binary_min_heap nn_distances(&*mindist, N_1, 2*N-2, 1); // minimum heap // structure for the distance to the nearest neighbor of each point t_float min; // minimum for nearest-neighbor searches // Initialize the minimal distances: // Find the nearest neighbor of each point. 
// n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1) for (i=1; i::infinity(); t_index idx; for (idx=j=0; j(i,j); } if (tmp If everything is OK, the test program will run forever, without an error message. ''')) import fastcluster as fc import numpy as np from scipy.spatial.distance import pdist, squareform import math import sys import atexit def print_seed(): print("Seed: {0}".format(seed)) atexit.register(print_seed) seed = np.random.randint(0,1e9) print_seed() np.random.seed(seed) abstol = 1e-14 # absolute tolerance rtol = 1e-13 # relative tolerance # NaN values are used in computations. Do not warn about them. np.seterr(invalid='ignore') def correct_for_zero_vectors(D, pcd, metric): # Correct some metrics: we want the distance from the zero vector # to itself to be 0, not NaN. if metric in ('jaccard', 'dice', 'sokalsneath'): z = np.flatnonzero(np.all(pcd==0, axis=1)) if len(z): DD = squareform(D) DD[np.ix_(z, z)] = 0 D = squareform(DD) return D def test_all(n,dim): method = 'single' # metrics for boolean vectors pcd = np.array(np.random.random_integers(0,1,(n,dim)), dtype=np.bool) pcd2 = pcd.copy() for metric in ('hamming', 'jaccard', 'yule', 'matching', 'dice', 'rogerstanimoto', #'sokalmichener', # exclude, bug in Scipy # http://projects.scipy.org/scipy/ticket/1486 'russellrao', 'sokalsneath', #'kulsinski' # exclude, bug in Scipy # http://projects.scipy.org/scipy/ticket/1484 ): sys.stdout.write("Metric: " + metric + "...") D = pdist(pcd, metric) D = correct_for_zero_vectors(D, pcd, metric) try: Z2 = fc.linkage_vector(pcd, method, metric) except FloatingPointError: # If linkage_vector reported a NaN dissimilarity value, # check whether the distance matrix really contains NaN. 
if np.any(np.isnan(D)): print("Skip this test: NaN dissimilarity value.") continue else: raise AssertionError('"linkage_vector" erroneously reported NaN.') if np.any(pcd2!=pcd): raise AssertionError('Input array was corrupted.', pcd) test(Z2, method, D) # metrics for real vectors bound = math.sqrt(n) pcd = np.random.random_integers(-bound,bound,(n,dim)) for metric in ['euclidean', 'sqeuclidean', 'cityblock', 'chebychev', 'minkowski', 'cosine', 'correlation', 'hamming', 'jaccard', 'canberra', # canberra: see bug in older Scipy versions # http://projects.scipy.org/scipy/ticket/1430 'braycurtis', 'seuclidean', 'mahalanobis', 'user']: sys.stdout.write("Metric: " + metric + "...") if metric=='minkowski': p = np.random.uniform(1.,10.) sys.stdout.write("p: " + str(p) + "...") D = pdist(pcd, metric, p) Z2 = fc.linkage_vector(pcd, method, metric, p) elif metric=='user': # Euclidean metric as a user function fn = (lambda u, v: np.sqrt(((u-v)*(u-v).T).sum())) D = pdist(pcd, fn) Z2 = fc.linkage_vector(pcd, method, fn) else: D = pdist(pcd, metric) D = correct_for_zero_vectors(D, pcd, metric) try: Z2 = fc.linkage_vector(pcd, method, metric) except FloatingPointError: if np.any(np.isnan(D)): print("Skip this test: NaN dissimilarity value.") continue else: raise AssertionError( '"linkage_vector" erroneously reported NaN.') test(Z2, method, D) D = pdist(pcd) for method in ['ward', 'centroid', 'median']: Z2 = fc.linkage_vector(pcd, method) test(Z2, method, D) def test(Z2, method, D): sys.stdout.write("Method: " + method + "...") I = np.array(Z2[:,:2], dtype=int) Ds = squareform(D) n = len(Ds) row_repr = np.arange(2*n-1) row_repr[n:] = -1 size = np.ones(n, dtype=np.int) np.fill_diagonal(Ds, np.nan) mins = np.empty(n-1) for i in range(n-1): for j in range(n-1): mins[j] = np.nanmin(Ds[j,j+1:]) gmin = np.nanmin(mins) if abs(Z2[i,2]-gmin) > max(abs(Z2[i,2]),abs(gmin))*rtol and \ abs(Z2[i,2]-gmin)>abstol: raise AssertionError( 'Not the global minimum in step {2}: {0}, {1}'. 
format(Z2[i,2], gmin,i), squareform(D)) i1, i2 = row_repr[I[i,:]] if (i1<0): raise AssertionError('Negative index i1.', squareform(D)) if (i2<0): raise AssertionError('Negative index i2.', squareform(D)) if I[i,0]>=I[i,1]: raise AssertionError('Convention violated.', squareform(D)) if i1>i2: i1, i2 = i2, i1 if abs(Ds[i1,i2]-gmin) > max(abs(Ds[i1,i2]),abs(gmin))*rtol and \ abs(Ds[i1,i2]-gmin)>abstol: raise AssertionError( 'The global minimum is not at the right place in step {5}: ' '({0}, {1}): {2} != {3}. Difference: {4}' .format(i1, i2, Ds[i1, i2], gmin, Ds[i1, i2]-gmin, i), squareform(D)) s1 = size[i1] s2 = size[i2] S = float(s1+s2) if method=='single': if i1>0: # mostly unnecessary; workaround for a bug/feature in NumPy # 1.7.0.dev, see http://projects.scipy.org/numpy/ticket/2078 Ds[:i1,i2] = np.min( Ds[:i1,(i1,i2)],axis=1) Ds[i1:i2,i2] = np.minimum(Ds[i1,i1:i2],Ds[i1:i2,i2]) Ds[i2,i2:] = np.min( Ds[(i1,i2),i2:],axis=0) elif method=='complete': if i1>0: Ds[:i1,i2] = np.max( Ds[:i1,(i1,i2)],axis=1) Ds[i1:i2,i2] = np.maximum(Ds[i1,i1:i2],Ds[i1:i2,i2]) Ds[i2,i2:] = np.max( Ds[(i1,i2),i2:],axis=0) elif method=='average': Ds[:i1,i2] = ( Ds[:i1,i1]*s1 + Ds[:i1,i2]*s2 ) / S Ds[i1:i2,i2] = ( Ds[i1,i1:i2]*s1 + Ds[i1:i2,i2]*s2 ) / S Ds[i2,i2:] = ( Ds[i1,i2:]*s1 + Ds[i2,i2:]*s2 ) / S elif method=='weighted': if i1>0: Ds[:i1,i2] = np.mean( Ds[:i1,(i1,i2)],axis=1) Ds[i1:i2,i2] = ( Ds[i1,i1:i2] + Ds[i1:i2,i2] )*.5 Ds[i2,i2:] = np.mean( Ds[(i1,i2),i2:],axis=0) elif method=='ward': Ds[:i1,i2] = np.sqrt((np.square(Ds[:i1,i1])*(s1+size[:i1]) -gmin*gmin*size[:i1] +np.square(Ds[:i1,i2])*(s2+size[:i1]))/(S+size[:i1])) Ds[i1:i2,i2] = np.sqrt((np.square(Ds[i1,i1:i2])*(s1+size[i1:i2]) -gmin*gmin*size[i1:i2] +np.square(Ds[i1:i2,i2])*(s2+size[i1:i2])) /(S+size[i1:i2])) Ds[i2,i2:] = np.sqrt((np.square(Ds[i1,i2:])*(s1+size[i2:]) -gmin*gmin*size[i2:] +np.square(Ds[i2,i2:])*(s2+size[i2:]))/(S+size[i2:])) elif method=='centroid': Ds[:i1,i2] = np.sqrt((np.square(Ds[:i1,i1])*s1 
+np.square(Ds[:i1,i2])*s2)*S-gmin*gmin*s1*s2) / S Ds[i1:i2,i2] = np.sqrt((np.square(Ds[i1,i1:i2])*s1 +np.square(Ds[i1:i2,i2])*s2)*S-gmin*gmin*s1*s2) / S Ds[i2,i2:] = np.sqrt((np.square(Ds[i1,i2:])*s1 +np.square(Ds[i2,i2:])*s2)*S-gmin*gmin*s1*s2) / S elif method=='median': Ds[:i1,i2] = np.sqrt((np.square(Ds[:i1,i1]) +np.square(Ds[:i1,i2]))*2-gmin*gmin)*.5 Ds[i1:i2,i2] = np.sqrt((np.square(Ds[i1,i1:i2]) +np.square(Ds[i1:i2,i2]))*2-gmin*gmin)*.5 Ds[i2,i2:] = np.sqrt((np.square(Ds[i1,i2:]) +np.square(Ds[i2,i2:]))*2-gmin*gmin)*.5 else: raise ValueError('Unknown method.') Ds[i1, i1:n] = np.inf Ds[:i1, i1] = np.inf row_repr[n+i] = i2 size[i2] = S print('OK.') while True: dim = np.random.random_integers(2,12) n = np.random.random_integers(max(2*dim,5),200) print('Dimension: {0}'.format(dim)) print('Number of points: {0}'.format(n)) try: test_all(n,dim) except AssertionError as E: print(E.args[0]) print(E.args[1]) sys.exit() fastcluster/python/test/test.py0000644000176000001440000001273512120523333016474 0ustar ripleyusers#!/usr/bin/env python # -*- coding: utf-8 -*- import sys if sys.hexversion < 0x03000000: # uniform unicode handling for both Python 2.x and 3.x def u(x): return x.decode('utf-8') else: def u(x): return x print(u('''Test program for the 'fastcluster' package. Copyright (c) 2011 Daniel Müllner, If everything is OK, the test program will run forever, without an error message. ''')) import fastcluster as fc import numpy as np from scipy.spatial.distance import pdist, squareform import math import sys import atexit def print_seed(): print("Seed: {0}".format(seed)) atexit.register(print_seed) seed = np.random.randint(0,1e9) np.random.seed(seed) #abstol = 1e-14 # absolute tolerance rtol = 1e-14 # relative tolerance # NaN values are used in computations. Do not warn about them. 
np.seterr(invalid='ignore') def test_all(): D2 = D.copy() for method in ['single', 'complete', 'average', 'weighted', 'ward', 'centroid', 'median']: Z2 = fc.linkage(D, method) if np.any(D2!=D): raise AssertionError('Input array was corrupted.') test(Z2, method) def test(Z2, method): sys.stdout.write("Method: " + method + "...") I = np.array(Z2[:,:2], dtype=int) Ds = squareform(D) n = len(Ds) row_repr = np.arange(2*n-1) row_repr[n:] = -1 size = np.ones(n, dtype=np.int) np.fill_diagonal(Ds, np.nan) mins = np.empty(n-1) for i in range(n-1): for j in range(n-1): mins[j] = np.nanmin(Ds[j,j+1:]) gmin = np.nanmin(mins) if (Z2[i,2]-gmin) > max(abs(Z2[i,2]),abs(gmin))*rtol: raise AssertionError('Not the global minimum in step {2}: {0}, {1}'.\ format(Z2[i,2], gmin, i)) i1, i2 = row_repr[I[i,:]] if (i1<0): raise AssertionError('Negative index i1.') if (i2<0): raise AssertionError('Negative index i2.') if I[i,0]>=I[i,1]: raise AssertionError('Convention violated.') if i1>i2: i1, i2 = i2, i1 if (Ds[i1,i2]-gmin) > max(abs(Ds[i1,i2]),abs(gmin))*rtol: raise AssertionError('The global minimum is not at the right place: ' '({0}, {1}): {2} != {3}. 
Difference: {4}'.\ format(i1, i2, Ds[i1, i2], gmin, Ds[i1, i2]-gmin)) s1 = size[i1] s2 = size[i2] S = float(s1+s2) if method=='single': if i1>0: # mostly unnecessary; workaround for a bug/feature in NumPy 1.7.0.dev # see http://projects.scipy.org/numpy/ticket/2078 Ds[:i1,i2] = np.min( Ds[:i1,(i1,i2)],axis=1) Ds[i1:i2,i2] = np.minimum(Ds[i1,i1:i2],Ds[i1:i2,i2]) Ds[i2,i2:] = np.min( Ds[(i1,i2),i2:],axis=0) elif method=='complete': if i1>0: Ds[:i1,i2] = np.max( Ds[:i1,(i1,i2)],axis=1) Ds[i1:i2,i2] = np.maximum(Ds[i1,i1:i2],Ds[i1:i2,i2]) Ds[i2,i2:] = np.max( Ds[(i1,i2),i2:],axis=0) elif method=='average': Ds[:i1,i2] = ( Ds[:i1,i1]*s1 + Ds[:i1,i2]*s2 ) / S Ds[i1:i2,i2] = ( Ds[i1,i1:i2]*s1 + Ds[i1:i2,i2]*s2 ) / S Ds[i2,i2:] = ( Ds[i1,i2:]*s1 + Ds[i2,i2:]*s2 ) / S elif method=='weighted': if i1>0: Ds[:i1,i2] = np.mean( Ds[:i1,(i1,i2)],axis=1) Ds[i1:i2,i2] = ( Ds[i1,i1:i2] + Ds[i1:i2,i2] ) *.5 Ds[i2,i2:] = np.mean( Ds[(i1,i2),i2:],axis=0) elif method=='ward': Ds[:i1,i2] = np.sqrt((np.square(Ds[:i1,i1])*(s1+size[:i1]) -gmin*gmin*size[:i1]+np.square(Ds[:i1,i2]) *(s2+size[:i1]))/(S+size[:i1])) Ds[i1:i2,i2] = np.sqrt((np.square(Ds[i1,i1:i2])*(s1+size[i1:i2]) -gmin*gmin*size[i1:i2]+np.square(Ds[i1:i2,i2]) *(s2+size[i1:i2]))/(S+size[i1:i2])) Ds[i2,i2:] = np.sqrt((np.square(Ds[i1,i2:])*(s1+size[i2:]) -gmin*gmin*size[i2:]+np.square(Ds[i2,i2:]) *(s2+size[i2:]))/(S+size[i2:])) elif method=='centroid': Ds[:i1,i2] = np.sqrt((np.square(Ds[:i1,i1])*s1 +np.square(Ds[:i1,i2])*s2)*S-gmin*gmin*s1*s2) / S Ds[i1:i2,i2] = np.sqrt((np.square(Ds[i1,i1:i2])*s1 +np.square(Ds[i1:i2,i2])*s2)*S-gmin*gmin*s1*s2) / S Ds[i2,i2:] = np.sqrt((np.square(Ds[i1,i2:])*s1 +np.square(Ds[i2,i2:])*s2)*S-gmin*gmin*s1*s2) / S elif method=='median': Ds[:i1,i2] = np.sqrt((np.square(Ds[:i1,i1])+\ np.square(Ds[:i1,i2]))*2-gmin*gmin)*.5 Ds[i1:i2,i2] = np.sqrt((np.square(Ds[i1,i1:i2])+\ np.square(Ds[i1:i2,i2]))*2-gmin*gmin)*.5 Ds[i2,i2:] = np.sqrt((np.square(Ds[i1,i2:])+\ np.square(Ds[i2,i2:]))*2-gmin*gmin)*.5 else: raise 
ValueError('Unknown method.') Ds[i1, i1:n] = np.nan Ds[:i1, i1] = np.nan row_repr[n+i] = i2 size[i2] = S print('OK.') while True: dim = np.random.random_integers(2,20) n = np.random.random_integers(2,100) print('Dimension: {0}'.format(dim)) print('Number of points: {0}'.format(n)) D = pdist(np.random.randn(n,dim)) try: print('Real distance values:') test_all() D = np.round(D*n/4) print('Integer distance values:') test_all() except AssertionError as E: print(E) print(squareform(D)) sys.exit() fastcluster/python/test/nantest.py0000644000176000001440000000257212015606477017205 0ustar ripleyusers'''Test whether the fastcluster package correctly recognizes NaN values and raises a FloatingPointError.''' import numpy as np import fastcluster n = np.random.random_integers(2,100) # Part 1: distance matrix input N = n*(n-1)//2 D = np.random.rand(N) # Insert a single NaN value pos = np.random.randint(N) D[pos] = np.nan for method in ['single', 'complete', 'average', 'weighted', 'ward', 'centroid', 'median']: try: fastcluster.linkage(D, method=method) raise AssertionError('fastcluster did not detect a NaN value!') except FloatingPointError: pass # Next: the original array does not contain a NaN, but a NaN occurs # as an updated distance. 
for method in ['average', 'weighted', 'ward', 'centroid', 'median']: try: fastcluster.linkage([np.inf,-np.inf,-np.inf], method=method) raise AssertionError('fastcluster did not detect a NaN value!') except FloatingPointError: pass # Part 2: vector input dim = np.random.random_integers(2,12) X = np.random.rand(n,dim) pos = (np.random.randint(n), np.random.randint(dim)) # Insert a single NaN coordinate X[pos] = np.nan for method in ['single', 'ward', 'centroid', 'median']: try: fastcluster.linkage_vector(X, method=method) raise AssertionError('fastcluster did not detect a NaN value!') except FloatingPointError: pass print('OK.') fastcluster/python/setup.py0000644000176000001440000001333712147517770015716 0ustar ripleyusers#!/usr/bin/env python # -*- coding: utf-8 -*- import sys if sys.hexversion < 0x03000000: # uniform unicode handling for both Python 2.x and 3.x def u(x): return x.decode('utf-8') def textfileopen(filename): return open(filename, mode='r') else: def u(x): return x def textfileopen(filename): return open(filename, mode='r', encoding='utf_8') u(''' fastcluster: Fast hierarchical clustering routines for R and Python Copyright © 2011 Daniel Müllner ''') #import distutils.debug #distutils.debug.DEBUG = 'yes' from numpy.distutils.core import setup, Extension with textfileopen('fastcluster.py') as f: for line in f: if line.find('__version_info__ =')==0: version = '.'.join(line.split("'")[1:-1:2]) break print('Version: ' + version) setup(name='fastcluster', \ version=version, \ py_modules=['fastcluster'], \ description='Fast hierarchical clustering routines for R and Python.', \ long_description=""" This library provides Python functions for hierarchical clustering. It generates hierarchical clusters from distance matrices or from vector data. 
Part of this module is intended to replace the functions linkage, single, complete, average, weighted, centroid, median, ward in the module scipy.cluster.hierarchy with the same functionality but much faster algorithms. Moreover, the function 'linkage_vector' provides memory-efficient clustering for vector data. The interface is very similar to MATLAB's Statistics Toolbox API to make code easier to port from MATLAB to Python/Numpy. The core implementation of this library is in C++ for efficiency. Installation files for Windows are provided by Christoph Gohlke on his `web page `_. **The fastcluster package is considered stable and will undergo few changes from now on. If some years from now there have not been any updates, this does not necessarily mean that the package is unmaintained but maybe it just was not necessary to correct anything. Of course, please still report potential bugs and incompatibilities to muellner@math.stanford.edu.** Reference: Daniel Müllner, *fastcluster: Fast Hierarchical, Agglomerative Clustering Routines for R and Python*, Journal of Statistical Software, **53** (2013), no. 9, 1–18, http://www.jstatsoft.org/v53/i09/. """, requires=['numpy'], provides=['fastcluster'], ext_modules=[Extension('_fastcluster', ['../src/fastcluster_python.cpp'], # Feel free to uncomment the line below if you use the GCC. # This switches to more aggressive optimization and turns # more warning switches on. No warning should appear in # the compilation process. # # Also, the author's Python distribution generates debug # symbols by default. This can be turned off, resulting a in # much smaller compiled library. 
# # Optimization #extra_compile_args=['-O2', '-g0', '-march=native', '-mtune=native', '-fno-math-errno'], # # List of all warning switches, somewhere from stackoverflow.com #extra_compile_args=['-Wall', '-Weffc++', '-Wextra', '-Wall', '-Wcast-align', '-Wchar-subscripts', '-Wcomment', '-Wconversion', '-Wsign-conversion', '-Wdisabled-optimization', '-Wfloat-equal', '-Wformat', '-Wformat=2', '-Wformat-nonliteral', '-Wformat-security', '-Wformat-y2k', '-Wimport', '-Winit-self', '-Winline', '-Winvalid-pch', '-Wunsafe-loop-optimizations', '-Wmissing-braces', '-Wmissing-field-initializers', '-Wmissing-format-attribute', '-Wmissing-include-dirs', '-Wmissing-noreturn', '-Wpacked', '-Wparentheses', '-Wpointer-arith', '-Wredundant-decls', '-Wreturn-type', '-Wsequence-point', '-Wshadow', '-Wsign-compare', '-Wstack-protector', '-Wstrict-aliasing', '-Wstrict-aliasing=2', '-Wswitch', '-Wswitch-enum', '-Wtrigraphs', '-Wuninitialized', '-Wunknown-pragmas', '-Wunreachable-code', '-Wunused', '-Wunused-function', '-Wunused-label', '-Wunused-parameter', '-Wunused-value', '-Wunused-variable', '-Wvariadic-macros', '-Wvolatile-register-var', '-Wwrite-strings', '-Wlong-long', '-Wpadded', '-Wcast-qual', '-Wswitch-default', '-Wnon-virtual-dtor', '-Wold-style-cast', '-Woverloaded-virtual', '-Waggregate-return', '-Werror'], # # Linker optimization #extra_link_args=['-Wl,--strip-all'], )], keywords=['dendrogram', 'linkage', 'cluster', 'agglomerative', 'hierarchical', 'hierarchy', 'ward'], author=u("Daniel Müllner"), author_email="muellner@math.stanford.edu", license="BSD ", classifiers = ["Topic :: Scientific/Engineering :: Information Analysis", "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Scientific/Engineering :: Bio-Informatics", "Topic :: Scientific/Engineering :: Mathematics", "Programming Language :: Python", "Programming Language :: Python :: 2", "Programming Language :: Python :: 3", "Programming Language :: C++", "Operating System :: OS Independent", 
"License :: OSI Approved :: BSD License", "License :: OSI Approved :: GNU General Public License v2 (GPLv2)", "Intended Audience :: Science/Research", "Development Status :: 5 - Production/Stable"], url = 'http://math.stanford.edu/~muellner', ) fastcluster/python/fastcluster.py0000644000176000001440000004417612147427704017117 0ustar ripleyusers# -*- coding: utf-8 -*- __doc__ = """Fast hierarchical clustering routines for R and Python Copyright © 2011 Daniel Müllner This module provides fast hierarchical clustering routines. The "linkage" method is designed to provide a replacement for the “linkage” function and its siblings in the scipy.cluster.hierarchy module. You may use the methods in this module with the same syntax as the corresponding SciPy functions but with the benefit of much faster performance. The method "linkage_vector" performs clustering of vector data with memory- saving algorithms. Refer to the User's manual "fastcluster.pdf" for comprehensive details. It is located in the directory inst/doc/ in the source distribution and may also be obtained at . """ __all__ = ['single', 'complete', 'average', 'weighted', 'ward', 'centroid', 'median', 'linkage', 'linkage_vector'] __version_info__ = ('1', '1', '11') __version__ = '.'.join(__version_info__) from numpy import double, empty, array, ndarray, var, cov, dot, bool, \ expand_dims, ceil, sqrt from numpy.linalg import inv try: from scipy.spatial.distance import pdist except ImportError: def pdist(*args, **kwargs): raise ImportError('The fastcluster.linkage function cannot process ' 'vector data since the function ' 'scipy.partial.distance.pdist could not be ' 'imported.') from _fastcluster import linkage_wrap, linkage_vector_wrap def single(D): '''Single linkage clustering (alias). See the help on the “linkage” function for further information.''' return linkage(D, method='single') def complete(D): '''Complete linkage clustering (alias). 
See the help on the “linkage” function for further information.''' return linkage(D, method='complete') def average(D): '''Hierarchical clustering with the “average” distance update formula (alias). See the help on the “linkage” function for further information.''' return linkage(D, method='average') def weighted(D): '''Hierarchical clustering with the “weighted” distance update formula (alias). See the help on the “linkage” function for further information.''' return linkage(D, method='weighted') def ward(D): '''Hierarchical clustering with the “Ward” distance update formula (alias). See the help on the “linkage” function for further information.''' return linkage(D, method='ward') def centroid(D): '''Hierarchical clustering with the “centroid” distance update formula (alias). See the help on the “linkage” function for further information.''' return linkage(D, method='centroid') def median(D): '''Hierarchical clustering with the “median” distance update formula (alias). See the help on the “linkage” function for further information.''' return linkage(D, method='median') mthidx = {'single' : 0, 'complete' : 1, 'average' : 2, 'weighted' : 3, 'ward' : 4, 'centroid' : 5, 'median' : 6 } def linkage(X, method='single', metric='euclidean', preserve_input=True): '''Hierarchical, agglomerative clustering on a dissimilarity matrix or on Euclidean data. Apart from the argument 'preserve_input', the method has the same input parameters and output format as the functions of the same name in the module scipy.cluster.hierarchy. The argument X is preferably a NumPy array with floating point entries (X.dtype==numpy.double). Any other data format will be converted before it is processed. If X is a one-dimensional array, it is considered a condensed matrix of pairwise dissimilarities in the format which is returned by scipy.spatial.distance.pdist. It contains the flattened, upper- triangular part of a pairwise dissimilarity matrix. 
That is, if there are N data points and the matrix d contains the dissimilarity between the i-th and j-th observation at position d(i,j), the vector X has length N(N-1)/2 and is ordered as follows: [ d(0,1), d(0,2), ..., d(0,n-1), d(1,2), ..., d(1,n-1), ..., d(n-2,n-1) ] The 'metric' argument is ignored in case of dissimilarity input. The optional argument 'preserve_input' specifies whether the method makes a working copy of the dissimilarity vector or writes temporary data into the existing array. If the dissimilarities are generated for the clustering step only and are not needed afterward, approximately half the memory can be saved by specifying 'preserve_input=False'. Note that the input array X contains unspecified values after this procedure. It is therefore safer to write linkage(X, method="...", preserve_input=False) del X to make sure that the matrix X is not accessed accidentally after it has been used as scratch memory. (The single linkage algorithm does not write to the distance matrix or its copy anyway, so the 'preserve_input' flag has no effect in this case.) If X contains vector data, it must be a two-dimensional array with N observations in D dimensions as an (N×D) array. The preserve_input argument is ignored in this case. The specified metric is used to generate pairwise distances from the input. The following two function calls yield the same output: linkage(pdist(X, metric), method="...", preserve_input=False) linkage(X, metric=metric, method="...") The general scheme of the agglomerative clustering procedure is as follows: 1. Start with N singleton clusters (nodes) labeled 0,...,N−1, which represent the input points. 2. Find a pair of nodes with minimal distance among all pairwise distances. 3. Join the two nodes into a new node and remove the two old nodes. The new nodes are labeled consecutively N, N+1, ... 4. The distances from the new node to all other nodes is determined by the method parameter (see below). 5. 
Repeat N−1 times from step 2, until there is one big node, which contains all original input points. The output of linkage is stepwise dendrogram, which is represented as an (N−1)×4 NumPy array with floating point entries (dtype=numpy.double). The first two columns contain the node indices which are joined in each step. The input nodes are labeled 0,...,N−1, and the newly generated nodes have the labels N,...,2N−2. The third column contains the distance between the two nodes at each step, ie. the current minimal distance at the time of the merge. The fourth column counts the number of points which comprise each new node. The parameter method specifies which clustering scheme to use. The clustering scheme determines the distance from a new node to the other nodes. Denote the dissimilarities by d, the nodes to be joined by I, J, the new node by K and any other node by L. The symbol |I| denotes the size of the cluster I. method='single': d(K,L) = min(d(I,L), d(J,L)) The distance between two clusters A, B is the closest distance between any two points in each cluster: d(A,B) = min{ d(a,b) | a∈A, b∈B } method='complete': d(K,L) = max(d(I,L), d(J,L)) The distance between two clusters A, B is the maximal distance between any two points in each cluster: d(A,B) = max{ d(a,b) | a∈A, b∈B } method='average': d(K,L) = ( |I|·d(I,L) + |J|·d(J,L) ) / (|I|+|J|) The distance between two clusters A, B is the average distance between the points in the two clusters: d(A,B) = (|A|·|B|)^(-1) · \sum { d(a,b) | a∈A, b∈B } method='weighted': d(K,L) = (d(I,L)+d(J,L))/2 There is no global description for the distance between clusters since the distance depends on the order of the merging steps. The following three methods are intended for Euclidean data only, ie. when X contains the pairwise (non-squared!) distances between vectors in Euclidean space. The algorithm will work on any input, however, and it is up to the user to make sure that applying the methods makes sense. 
method='centroid': d(K,L) = ( (|I|·d(I,L) + |J|·d(J,L)) / (|I|+|J|) − |I|·|J|·d(I,J)/(|I|+|J|)^2 )^(1/2) There is a geometric interpretation: d(A,B) is the distance between the centroids (ie. barycenters) of the clusters in Euclidean space: d(A,B) = ‖c_A−c_B∥, where c_A denotes the centroid of the points in cluster A. method='median': d(K,L) = ( d(I,L)/2 + d(J,L)/2 − d(I,J)/4 )^(1/2) Define the midpoint w_K of a cluster K iteratively as w_K=k if K={k} is a singleton and as the midpoint (w_I+w_J)/2 if K is formed by joining I and J. Then we have d(A,B) = ∥w_A−w_B∥ in Euclidean space for all nodes A,B. Notice however that this distance depends on the order of the merging steps. method='ward': d(K,L) = ( ((|I|+|L)d(I,L) + (|J|+|L|)d(J,L) − |L|d(I,J)) / (|I|+|J|+|L|) )^(1/2) The global cluster dissimilarity can be expressed as d(A,B) = ( 2|A|·|B|/(|A|+|B|) )^(1/2) · ‖c_A−c_B∥, where c_A again denotes the centroid of the points in cluster A. The clustering algorithm handles infinite values correctly, as long as the chosen distance update formula makes sense. If a NaN value occurs, either in the original dissimilarities or as an updated dissimilarity, an error is raised. 
The linkage method does not treat NumPy's masked arrays as special and simply ignores the mask.''' X = array(X, copy=False, subok=True) if X.ndim==1: if method=='single': preserve_input = False X = array(X, dtype=double, copy=preserve_input, order='C', subok=True) NN = len(X) N = int(ceil(sqrt(NN*2))) if (N*(N-1)//2) != NN: raise ValueError('The length of the condensed distance matrix ' 'must be (k \choose 2) for k data points!') else: assert X.ndim==2 N = len(X) X = pdist(X, metric) X = array(X, dtype=double, copy=False, order='C', subok=True) Z = empty((N-1,4)) if N > 1: linkage_wrap(N, X, Z, mthidx[method]) return Z mtridx = {'euclidean' : 0, 'minkowski' : 1, 'cityblock' : 2, 'seuclidean' : 3, 'sqeuclidean' : 4, 'cosine' : 5, 'hamming' : 6, 'jaccard' : 7, 'chebychev' : 8, 'canberra' : 9, 'braycurtis' : 10, 'mahalanobis' : 11, 'yule' : 12, 'matching' : 13, 'sokalmichener' : 13, # an alias for 'matching' 'dice' : 14, 'rogerstanimoto' : 15, 'russellrao' : 16, 'sokalsneath' : 17, 'kulsinski' : 18, 'USER' : 19, } booleanmetrics = ('yule', 'matching', 'dice', 'kulsinski', 'rogerstanimoto', 'sokalmichener', 'russellrao', 'sokalsneath', 'kulsinski') def linkage_vector(X, method='single', metric='euclidean', extraarg=None): '''Hierarchical (agglomerative) clustering on Euclidean data. Compared to the 'linkage' method, 'linkage_vector' uses a memory-saving algorithm. While the linkage method requires Θ(N^2) memory for clustering of N points, this method needs Θ(ND) for N points in R^D, which is usually much smaller. The argument X has the same format as before, when X describes vector data, ie. it is an (N×D) array. Also the output array has the same format. The parameter method must be one of 'single', 'centroid', 'median', 'ward', ie. only for these methods there exist memory-saving algorithms currently. If 'method', is one of 'centroid', 'median', 'ward', the 'metric' must be 'euclidean'. For single linkage clustering, any dissimilarity function may be chosen. 
Basically, every metric which is implemented in the method scipy.spatial.distance.pdist is reimplemented here. However, the metrics differ in some instances since a number of mistakes and typos (both in the code and in the documentation) were corrected in the fastcluster package. Therefore, the available metrics with their definitions are listed below as a reference. The symbols u and v mostly denote vectors in R^D with coordinates u_j and v_j respectively. See below for additional metrics for Boolean vectors. Unless otherwise stated, the input array X is converted to a floating point array (X.dtype==numpy.double) if it does not have already the required data type. Some metrics accept Boolean input; in this case this is stated explicitly below. If a NaN value occurs, either in the original dissimilarities or as an updated dissimilarity, an error is raised. In principle, the clustering algorithm handles infinite values correctly, but the user is advised to carefully check the behavior of the metric and distance update formulas under these circumstances. The distance formulas combined with the clustering in the 'linkage_vector' method do not have specified behavior if the data X contains infinite or NaN values. Also, the masks in NumPy’s masked arrays are simply ignored. metric='euclidean': Euclidean metric, L_2 norm d(u,v) = ∥u−v∥ = ( \sum_j { (u_j−v_j)^2 } )^(1/2) metric='sqeuclidean': squared Euclidean metric d(u,v) = ∥u−v∥^2 = \sum_j { (u_j−v_j)^2 } metric='seuclidean': standardized Euclidean metric d(u,v) = ( \sum_j { (u_j−v_j)^2 / V_j } )^(1/2) The vector V=(V_0,...,V_{D−1}) is given as the 'extraarg' argument. If no 'extraarg' is given, V_j is by default the unbiased sample variance of all observations in the j-th coordinate: V_j = Var_i (X(i,j) ) = 1/(N−1) · \sum_i ( X(i,j)^2 − μ(X_j)^2 ) (Here, μ(X_j) denotes as usual the mean of X(i,j) over all rows i.) 
metric='mahalanobis': Mahalanobis distance d(u,v) = ( transpose(u−v) V (u−v) )^(1/2) Here, V=extraarg, a (D×D)-matrix. If V is not specified, the inverse of the covariance matrix numpy.linalg.inv(numpy.cov(X, rowvar=False)) is used. metric='cityblock': the Manhattan distance, L_1 norm d(u,v) = \sum_j |u_j−v_j| metric='chebychev': the supremum norm, L_∞ norm d(u,v) = max_j { |u_j−v_j| } metric='minkowski': the L_p norm d(u,v) = ( \sum_j |u_j−v_j|^p ) ^(1/p) This metric coincides with the cityblock, euclidean and chebychev metrics for p=1, p=2 and p=∞ (numpy.inf), respectively. The parameter p is given as the 'extraarg' argument. metric='cosine' d(u,v) = 1 − ⟨u,v⟩ / (∥u∥·∥v∥) = 1 − (\sum_j u_j·v_j) / ( (\sum u_j^2)(\sum v_j^2) )^(1/2) metric='correlation': This method first mean-centers the rows of X and then applies the 'cosine' distance. Equivalently, the correlation distance measures 1 − (Pearson’s correlation coefficient). d(u,v) = 1 − ⟨u−μ(u),v−μ(v)⟩ / (∥u−μ(u)∥·∥v−μ(v)∥) metric='canberra' d(u,v) = \sum_j ( |u_j−v_j| / (|u_j|+|v_j|) ) Summands with u_j=v_j=0 contribute 0 to the sum. metric='braycurtis' d(u,v) = (\sum_j |u_j-v_j|) / (\sum_j |u_j+v_j|) metric=(user function): The parameter metric may also be a function which accepts two NumPy floating point vectors and returns a number. Eg. the Euclidean distance could be emulated with fn = lambda u, v: numpy.sqrt(((u-v)*(u-v)).sum()) linkage_vector(X, method='single', metric=fn) This method, however, is much slower than the build-in function. metric='hamming': The Hamming distance accepts a Boolean array (X.dtype==bool) for efficient storage. Any other data type is converted to numpy.double. d(u,v) = |{j | u_j≠v_j }| metric='jaccard': The Jaccard distance accepts a Boolean array (X.dtype==bool) for efficient storage. Any other data type is converted to numpy.double. d(u,v) = |{j | u_j≠v_j }| / |{j | u_j≠0 or v_j≠0 }| d(0,0) = 0 Python represents True by 1 and False by 0. 
In the Boolean case, the Jaccard distance is therefore: d(u,v) = |{j | u_j≠v_j }| / |{j | u_j ∨ v_j }| The following metrics are designed for Boolean vectors. The input array is converted to the 'bool' data type if it is not Boolean already. Use the following abbreviations to count the number of True/False combinations: a = |{j | u_j ∧ v_j }| b = |{j | u_j ∧ (¬v_j) }| c = |{j | (¬u_j) ∧ v_j }| d = |{j | (¬u_j) ∧ (¬v_j) }| Recall that D denotes the number of dimensions, hence D=a+b+c+d. metric='yule' d(u,v) = 2bc / (ad+bc) metric='dice': d(u,v) = (b+c) / (2a+b+c) d(0,0) = 0 metric='rogerstanimoto': d(u,v) = 2(b+c) / (b+c+D) metric='russellrao': d(u,v) = (b+c+d) / D metric='sokalsneath': d(u,v) = 2(b+c)/ ( a+2(b+c)) d(0,0) = 0 metric='kulsinski' d(u,v) = (b/(a+b) + c/(a+c)) / 2 metric='matching': d(u,v) = (b+c)/D Notice that when given a Boolean array, the 'matching' and 'hamming' distance are the same. The 'matching' distance formula, however, converts every input to Boolean first. Hence, the vectors (0,1) and (0,2) have zero 'matching' distance since they are both converted to (False, True) but the Hamming distance is 0.5. metric='sokalmichener' is an alias for 'matching'.''' if method=='single': assert metric!='USER' if metric in ('hamming', 'jaccard'): X = array(X, copy=False, subok=True) dtype = bool if X.dtype==bool else double else: dtype = bool if metric in booleanmetrics else double X = array(X, dtype=dtype, copy=False, order='C', subok=True) else: assert metric=='euclidean' X = array(X, dtype=double, copy=(method=='ward'), order='C', subok=True) assert X.ndim==2 N = len(X) Z = empty((N-1,4)) if metric=='seuclidean': if extraarg is None: extraarg = var(X, axis=0, ddof=1) elif metric=='mahalanobis': if extraarg is None: extraarg = inv(cov(X, rowvar=False)) # instead of the inverse covariance matrix, pass the matrix product # with the data matrix! 
extraarg = array(dot(X,extraarg),dtype=double, copy=False, order='C', subok=True) elif metric=='correlation': X = X-expand_dims(X.mean(axis=1),1) metric='cosine' elif not isinstance(metric, str): assert extraarg is None metric, extraarg = 'USER', metric elif metric!='minkowski': assert extraarg is None if N > 1: linkage_vector_wrap(X, Z, mthidx[method], mtridx[metric], extraarg) return Z fastcluster/man/0000755000176000001440000000000012147324732013421 5ustar ripleyusersfastcluster/man/hclust.vector.Rd0000644000176000001440000000653311727523223016521 0ustar ripleyusers\name{hclust.vector} \alias{hclust.vector} \title{Fast hierarchical, agglomerative clustering of vector data} \description{ This function implements hierarchical, agglomerative clustering with memory-saving algorithms.} \usage{hclust.vector(X, method="single", members=NULL, metric='euclidean', p=NULL)} \arguments{ \item{X}{an \eqn{(N\times D)}{(N×D)} matrix of '\link{double}' values: \eqn{N}{N} observations in \eqn{N}{N} variables.} \item{method}{the agglomeration method to be used. This must be (an unambiguous abbreviation of) one of \code{"single"}, \code{"ward"}, \code{"centroid"} or \code{"median"}.} \item{members}{\code{NULL} or a vector with length the number of observations.} \item{metric}{the distance measure to be used. This must be one of \code{"euclidean"}, \code{"maximum"}, \code{"manhattan"}, \code{"canberra"}, \code{"binary"} or \code{"minkowski"}. Any unambiguous substring can be given.} \item{p}{parameter for the Minkowski metric.} } \details{The function \code{\link{hclust.vector}} provides clustering when the input is vector data. It uses memory-saving algorithms which allow processing of larger data sets than \code{\link{hclust}} does. The \code{"ward"}, \code{"centroid"} and \code{"median"} methods require \code{metric="euclidean"} and cluster the data set with respect to Euclidean distances. For \code{"single"} linkage clustering, any dissimilarity measure may be chosen. 
Currently, the same metrics are implemented as the \code{\link[stats:dist]{dist}} function provides. The call\preformatted{ hclust.vector(X, method='single', metric=[...])} gives the same result as\preformatted{ hclust(dist(X, metric=[...]), method='single')} but uses less memory and is equally fast. For the Euclidean methods, care must be taken since \code{\link{hclust}} expects \bold{squared} Euclidean distances. Hence, the call\preformatted{ hclust.vector(X, method='ward')} is, aside from the lesser memory requirements, equivalent to\preformatted{ d = dist(X) hc = hclust(d^2, method='ward') hc$height = sqrt(hc$height)} The same applies to the \code{"centroid"} and \code{"median"} methods. More details are in the User's manual \href{http://cran.r-project.org/web/packages/fastcluster/vignettes/fastcluster.pdf}{fastcluster.pdf}, which is available as a vignette. Get this from the R command line with \code{vignette('fastcluster')}. } \references{\url{http://math.stanford.edu/~muellner}} \author{Daniel Müllner} \seealso{\code{\link{fastcluster}}, \code{\link{hclust}}} \examples{# Taken and modified from stats::hclust ## Perform centroid clustering with squared Euclidean distances, ## cut the tree into ten clusters and reconstruct the upper part of the ## tree from the cluster centers. 
hc <- hclust.vector(USArrests, "cen") # squared Euclidean distances hc$height <- hc$height^2 memb <- cutree(hc, k = 10) cent <- NULL for(k in 1:10){ cent <- rbind(cent, colMeans(USArrests[memb == k, , drop = FALSE])) } hc1 <- hclust.vector(cent, method = "cen", members = table(memb)) # squared Euclidean distances hc1$height <- hc1$height^2 opar <- par(mfrow = c(1, 2)) plot(hc, labels = FALSE, hang = -1, main = "Original Tree") plot(hc1, labels = FALSE, hang = -1, main = "Re-start from 10 clusters") par(opar) } \keyword{multivariate} \keyword{cluster} fastcluster/man/hclust.Rd0000644000176000001440000000436311727523223015217 0ustar ripleyusers\name{hclust} \alias{hclust} \title{Fast hierarchical, agglomerative clustering of dissimilarity data} \description{ This function implements hierarchical clustering with the same interface as \code{\link[stats:hclust]{hclust}} from the \pkg{\link{stats}} package but with much faster algorithms. } \usage{hclust(d, method="complete", members=NULL)} \arguments{ \item{d}{a dissimilarity structure as produced by \code{dist}.} \item{method}{the agglomeration method to be used. This must be (an unambiguous abbreviation of) one of \code{"single"}, \code{"complete"}, \code{"average"}, \code{"mcquitty"}, \code{"ward"}, \code{"centroid"} or \code{"median"}.} \item{members}{\code{NULL} or a vector with length the number of observations.} } \value{An object of class \code{'hclust'}. It encodes a stepwise dendrogram.} \details{See the documentation of the original function \code{\link[stats:hclust]{hclust}} in the \pkg{\link{stats}} package. A comprehensive User's manual \href{http://cran.r-project.org/web/packages/fastcluster/vignettes/fastcluster.pdf}{fastcluster.pdf} is available as a vignette. Get this from the R command line with \code{vignette('fastcluster')}. 
} \references{\url{http://math.stanford.edu/~muellner}} \author{Daniel Müllner} \seealso{\code{\link{fastcluster}}, \code{\link{hclust.vector}}, \code{\link[stats:hclust]{stats::hclust}}} \examples{# Taken and modified from stats::hclust # # hclust(...) # new method # stats::hclust(...) # old method require(fastcluster) require(graphics) hc <- hclust(dist(USArrests), "ave") plot(hc) plot(hc, hang = -1) ## Do the same with centroid clustering and squared Euclidean distance, ## cut the tree into ten clusters and reconstruct the upper part of the ## tree from the cluster centers. hc <- hclust(dist(USArrests)^2, "cen") memb <- cutree(hc, k = 10) cent <- NULL for(k in 1:10){ cent <- rbind(cent, colMeans(USArrests[memb == k, , drop = FALSE])) } hc1 <- hclust(dist(cent)^2, method = "cen", members = table(memb)) opar <- par(mfrow = c(1, 2)) plot(hc, labels = FALSE, hang = -1, main = "Original Tree") plot(hc1, labels = FALSE, hang = -1, main = "Re-start from 10 clusters") par(opar) } \keyword{multivariate} \keyword{cluster} fastcluster/man/fastcluster.Rd0000644000176000001440000000573312147324732016257 0ustar ripleyusers\name{fastcluster} \alias{fastcluster} \alias{fastcluster-package} \docType{package} \title{Fast hierarchical, agglomerative clustering routines for R and Python} \description{The \pkg{fastcluster} package provides efficient algorithms for hierarchical, agglomerative clustering. In addition to the R interface, there is also a Python interface to the underlying C++ library, to be found in the source distribution. } \details{The function \code{\link{hclust}} provides clustering when the input is a dissimilarity matrix. A dissimilarity matrix can be computed from vector data by \code{\link{dist}}. The \code{\link{hclust}} function can be used as a drop-in replacement for existing routines: \code{\link[stats:hclust]{stats::hclust}} and \code{\link[flashClust:hclust]{flashClust::hclust}} alias \code{\link[flashClust:flashClust]{flashClust::flashClust}}. 
Once the fastcluster library is loaded at the beginning of the code, every program that uses hierarchical clustering can benefit immediately and effortlessly from the performance gain When the package is loaded, it overwrites the function \code{\link{hclust}} with the new code. The function \code{\link{hclust.vector}} provides memory-saving routines when the input is vector data. Further information: \itemize{ \item R documentation pages: \code{\link{hclust}}, \code{\link{hclust.vector}} \item A comprehensive User's manual: \href{http://cran.r-project.org/web/packages/fastcluster/vignettes/fastcluster.pdf}{fastcluster.pdf}. Get this from the R command line with \code{vignette('fastcluster')}. \item JSS paper: \url{http://www.jstatsoft.org/v53/i09/}. \item See the author's home page for a performance comparison: \url{http://math.stanford.edu/~muellner/fastcluster.html}. } } \references{\url{http://math.stanford.edu/~muellner}} \author{Daniel Müllner} \seealso{\code{\link{hclust}}, \code{\link{hclust.vector}}} \examples{# Taken and modified from stats::hclust # # hclust(...) # new method # hclust.vector(...) # new method # stats::hclust(...) # old method require(fastcluster) require(graphics) hc <- hclust(dist(USArrests), "ave") plot(hc) plot(hc, hang = -1) ## Do the same with centroid clustering and squared Euclidean distance, ## cut the tree into ten clusters and reconstruct the upper part of the ## tree from the cluster centers. 
hc <- hclust.vector(USArrests, "cen") # squared Euclidean distances hc$height <- hc$height^2 memb <- cutree(hc, k = 10) cent <- NULL for(k in 1:10){ cent <- rbind(cent, colMeans(USArrests[memb == k, , drop = FALSE])) } hc1 <- hclust.vector(cent, method = "cen", members = table(memb)) # squared Euclidean distances hc1$height <- hc1$height^2 opar <- par(mfrow = c(1, 2)) plot(hc, labels = FALSE, hang = -1, main = "Original Tree") plot(hc1, labels = FALSE, hang = -1, main = "Re-start from 10 clusters") par(opar) } \keyword{multivariate} \keyword{cluster} fastcluster/inst/0000755000176000001440000000000012147514400013614 5ustar ripleyusersfastcluster/inst/doc/0000755000176000001440000000000012147517667014403 5ustar ripleyusersfastcluster/inst/doc/fastcluster.pdf0000644000176000001440000033601612147514426017434 0ustar ripleyusers%PDF-1.5 % 22 0 obj <> stream xڕXY۸~ϯ[Z C)'xkuI$HkRkJ@wCQ&lvݏjefsؘLz+ BZYj u3 Qr L6>Uo}٢q3Y&ڪ2dRdҘN%[7BIS34_ÑFYilFy)MkSgmth/!ULV[ Ci*eVM7,3I,,`9j5g])Ѡ,GfTxǡ.<9<'  cӷ "vwGI'x5&R+Cbڻ ͅ}j\BiOeT m?f1B*ytU#.g~$Rb^i!ݪ q2RDwmq!lu Yd"k[ņ}CA_phW.T`"5[OGJ*43 v#3Cϼ壔gFSu v&[\g9R.wwj'(VwE5D p1^RD菝m4Si; nÒ0pc ' ~ ˱]c$%&JrNjy1A /^.$A/ ؄ӥi򿊼vh\2$jjEƨ1O/@)dҫoSE'XH/ $hN0P=ҪG:Bj('+J7FOCB 8Uan#E,R5gtVBc#o)(snJCh8зJr.+K i#)q|Xo_p5Ze7r[يbJKg9XFM޳X%UY#`>"N8L~s=yXv@6  w˧;|S^A&tZ<5v-4nA'ј=9 샴,ZAU 8q%0e%8s.qg][WEgcʕ E4E 4/Q w*QMi,0=Pȭu׎ vhIۇma?݊梥 a직\{UB ()P .Wz` z}T2u\6H!]'gM`4PB'Aq DM/} Ă³^5yoVi@;<^6IzB},Cݧ&K'K94'p)V5AW +*o{5τ?MjEn5(SUv>{Rq]ioߨf4!aAۿBڦ5&SG@^|GW^mbPHGoJQ܅s~e[?Av=VؙAkfCDlrnyO [|x3ްۨDK޸$B)CSU|MKPFR.#;W wRsŎjЪ1w :4w,ٝ&:`T 4ɨz k~l" ].ξix Xiw }wNvyRC/G3NALmmF.(*T(9dk 韶n*/X8,>C O'bBSF,P?ަ}(y4pjр@Zs.8C`u&%N] RZ:ODĢ<KU:~ERϬ텧{xQ&̀Mr`V ;Fu,48>_V}dXq໺o )> stream xZKo6Wj,-D)Ǧ$rJ+v;CR+qZ԰aR"gęE83Q?4R ~GߞGg?PQAHm2hSES?GߟG ,?gģxGf$,Z2wf-*.{Յ9qGAS"%s$r` vEu~YdJddIDy>T$eQy`Arki4$:G4I.s~wթwM4eP@UYWx2i~Q.9P-4 ,eCWW ԈL5IF ʲ(uef9tBEg1ܨņHnY[ӞfQ?L0fofœuw~&  u|͸cVT,A! 
ߕp$nZSp{KhlzcEdǵ.LK3.a*#Iꯃ4ove1F1>}Mps,?D3e{Z+wejVLZ}vAú= s͡\W҇V"Noa/ouKO>H*"g2X@ekp*1G6vӔV%+Ŧk:(7uїMP$i8?},J\\1f'\3NIMmäT)rȮu; %x4,齟!"5*k{"h$w:]*!mG3ARgm݋9 .SRo H!$yוѝ]qkl;(ͽ#oǠbƏ' 8UPREk.Ŷ{ܴh-dT9N*w]XEۏo9xx`T? V}ſ2vLck";f2l/7xؒ '޶ǽÊ©i{nX4_nrtak{$DuU"\, OeF]u +G"ͭMGy逍L?ĐKu'NTB`r83)60024Cך k6`"$(bKDN%IDyKHSE~q yH.TAxc^)@վG̝<@hԚ?y.u[qBW lj%7׫5Tp_Y@F;]_ǧj1ԧx6J*k@#a> stream xڵY[s~[#L^iɤS/u` 𢐔w DKv,^yW1L_֫v˟n<^HiHF0+zN`}Z՟Z-@EC{X۝`2[m8Ϩt#GcZإ~iW WaaD}\"2ۡ)EӃ'mu*z\oDzqW6;ZRz4_^7^ÞY]ִj6;kF!~H/<oD"`t*rFm*޽,(ۦkpze3ؙiy 7+NOzwZmC Zɣ]j|bW&\$ Dsy1)Y@>t{7,d$v߸3_'nvNx`Ɍ).B#_#e Lz%_Jnʭ>9U[0ӣbTOf]e]VlZ] =AJ@( 'FYOGlˇX()~]!шbkhq$E2g1S9΍qCؐ6Zk :- '7dK 9[i#_My`-U2_J{[)%I)BJqqMq|3[oXF: n5,TB`g7S Q} @mOڛgelGa"Q YWegs"%|zuLмk |4b-Ҩ~rɂ5x5yA+tAQwSm(:sqإ9,ޚzdžxBW-UHl='ZN.;w;GD~b 3&;2|3 a&Œ0,D/ U&hTr۸#l^z_ \\f{P-%B1Hdo$SQ:!.MxƄ`*ڊS9 KL,MgIpV -wKA+RBq<.4ؙ@nTnSktԞ'}|siͱL:O1w:wD*O'G.:q1Jv/WA3ςMe/E<ʦJ]KcӃ,Dag=X3*)sB}MNhL% O 8Nb!'}zYJ:rTxׅ}:ކ|q7ɉ,(}pmWH9PCUXXh!QR%Rxn@4Mjs,#C y9[)D 8"ϐZ@0 h(iR[U6.|Ntd tN7qiHmMO b'5^إ"27%ui_}K,9XMڦzu8B( 0IJ T6go-[MK>@HI\ʦ8å8yB8`Y6zJz{󰦱J;3T|}/X <+<d3}pS>/ CI!wؙ[;49-(?$T&^F&&0ƥf7+͎tomOT{s 8u"19dŦߡ6*vp ;hi&.pnMOc^܃_QrЙ #92wf[aI%h0Wz5hKflO-U enShST[' XBT"y1^Ǖ8dՊ#Zqb xґAiځ5pܵ",T|Ƣ`/|q+xvԝ@ۇ7fU # a}˲8^E_]B~1-Wϼy͡9+I/=(T, IFS^]QVVJbXI-"F];WGSj?>90֩ʦc]Ǽ'ëˬl*4^UrgCB ?ƪ+ 1 *5-!OJcI(T i@.9|{ĺ\dgq,wcpFTal\&v󩗥sW#~;\|Ͼ.|Ӵs;yCz.ðlM"m6xnIROn ^= ̺Y]3xmԙADM42,2ere$ j gccjʒfʓtxf q5eT.ldNs:l)Ƞnn =2/+yf4B"B6IRHwvIu> endstream endobj 85 0 obj <> stream xZ[~[)4p.ȃ6E @Z4KHy3r(.I5@r83 5{?L1ϖٛYIJgw-f .avW6Ӻw0w[|YkgLefٷ{lY}v)e .Nc@q>Ѕ]JAHxAB l(n_Ήۯ{ՙ]Uun}&ܩmA&y Wsuyfre޻nzmM7]|cWvxNzkVdbSJArfKٟ5.##qY)2w-иxQn^0nN<2  Mé5 .55 .eY_DkCjg8lЂC&}MDJ‹r)%)GvGT 큱=߆=ҥ*HdIpӉq),׭U8Bscm(Wެvo=#+x*%t:Lܻ8EcYWoӣ]q 1n0տfiL5g}ܥsK@w3$PJAwǖtQ7:pƲ)޵xoⳛ@s$ Z9ps}f~Y]4$Pzlmw*TƭT׺)fW,#R1G! 
NK-*1DGE.jrK„c)1(e1/k/͙* +cWqN֘M& .ܢ 9i;GJaD^؍$4`ژdǶiZt 3b wK `a{,vw*D'?E ֊({m0cMYxyXgoZH &߽q()B٦a0/ 5=Zaq;зnxO`c9z4Fv =aGl9,k{9vi[ {ns8H?atx\f(32ֳy U lZ wHq<;4={H5v,fea?(~hU2 TѢt2h!AxhzK ț k*ܑq$u8yxo7NAFrV~w} -^:] t #0:0($Мh g.Ѻ v0l 7)4 < /'c,ZBv?lPW@ ج8V&w$RsY2n#Aa MۛU"'_пK(n7 Pd,?FLe ?h 0 [>$h@ h9r VϙnawN.Ɋ'y¹:,-] ?*[: ~Zut,*kS5Y \PM`/({riGZK,n$tvq/3B"VSdvu`) xz\ЙU9AhQS ϓٚ}՛ո3Oqs'GD;w[vbWD Xvhѐ,2-Ua;(7Wڍ%QqZ6D| hSt0v\<@)}}"=j#ͳá0 OλN )?XR=d9-v|}2qu.!2;QS$:'*x"ە5KkiMk' 5_pu|O8ČN݅>5wj6S\P׻4 KpB艬ñʺew!]_3+tI BGBLײ>oR9(*l/s1wʆtЈz5zu5x,*q,(/CHDst$}m0 3K@mOp3ED|RD3jd~Ud7vE8'Q1lo>)SM-fGLc 3=s%/gmt)p1ڛMp&nRNsW,zI)C*shG>Dنz/B80ULL~_%?XIRN;bq}FgG!RƈK%$E]HkYbp9;oRb|_CJwgLb\k@pmkvkLt`L/m? }10"~zr< ?h@M`1b h.I6 -Fc+O晿Z]o,ź( A4$*&''sn'W%1;̖ٖ:F祍KMMW-$SDꃱ)RbsO'%Y,oډh ,ŋm>SbWNV?v/S+[.пǴ#oH;#/-'Rg+bϳ3Ǐ ȑ%/K&eH_jD"VBMe$'le~c&Vqd rUO47ڠǏ֭PJM>oO؏Sѓ` #$qlpa`.yN'Cl!$1 {@ .(1[DHe>yg ~D"8v!u}鶌 endstream endobj 93 0 obj <> stream x[K6m%^q%edgkSGD%} A 4?[q4Fݘ $H |=yu5$Mn&ŊOf,?ѺhThr߫7Лd}wb΁$Qcqگ9+>D( ~^ʰ:bx_XJz+#>rߓT~e:#i_FoH+M3*UD !b-$fP+~X {5W^ēk4\+Vv#Sg澿@Z`iX`i̤/0J4_"7S"ΰ$:ZS`Hf$#1AgFWˢ.}1MOݮ|eE3m[V馪͗vY.e曹%v h?ŽvM[ԍՔݐcJ%1(QYxXSfGU(j4o L@}[nn`bz%3"9D0Z)MH5_)R[~*78-Tm>EoEڬF/"6%/vL rcN! 
(`Mc#I)U6/}gX77cq]i,yV f'l*mUۮI 6|r]|u[e\eYLY\ w=I%ȝeD@4}a>vWY.ekyVi>4F1ON4]]!%o{Լ)A3;'bI2|>~ش5>r"cqW |%1j/x$^?p4l}LKc);sLAĎCIBt崗tz>{>|g&0>{o.뼾wc J@{8#aAISg8-Ƃkc޺=sʹe;^!F` L^jwq/UDB.p=1!èfw]BK{},D4X`S}J_<ֻbSlY#z﫰n马m@C1ޓ1$|=ဢxX&D xXp:BYJUE0vF0CUy+t7U*dUdGtUʓp__@nz]XU7`]`lHq!X&qBamq~:sVaޗ9Ig~Mũ3S䁪599 N :ى8= RL%f1jP*{Жc[6U<6bld3ex/!G襁t -QS .Rg#(LQ$Dz9ώʄ^jaL8G1U3nSP&?1Ad,/)/ ~,'6x!9 ~Ng]S.HJ7m[l۳:˹zYi*.X>Σ)DrbGDθ]U yB]ʾVKn+$yrЃ[I~T;jM.Wy]c![0??u4`lMD_#Oi_F@5# H۪3:BӃP#*" XbIU@Rkax_-DFEIE,$nw '`)\Ĥ9R*}TCLS&FZ5^VN {_ˣ@cb́=טM$ZEse9WW[pk/lX11hMd/]7؂_`lr ?@JD^|dJk}kwފ xcWLnd>]5]Y?z&!:G}0qt4ske%d+ dwpޞ+ВF902;]6/Ljdqr)E,NHvHE?ɾ[Xn317U.+H`haew{xe;s[WI u ,6eZ#HgR@{8۩#]W?a7ø1Y]CS!2ݺIO\U}kj4I=؎tVE 6eA X`0|-2,D2kzU3 ~*!Ƙ T0sB[8׭Ξ{kF|ɱ>v'#IH+dnN*?B?x2هUP}6 vjr70 xtPy R*vm0LbXuͫSH ~u.r gv5 ],~  p F ċd> stream xڽY[o+~PỲZ4mӇ- N JE;ZrZù| we1Lx[٢`E*wE"Y+ƒno+;]t˟xqڕLlOӒũ vh;#F,!%P-c2qeO:S<ЕC?`K*3#B(vXLWjv2߶bEEP6cnT৅_Oφ6J))a sbŹv8JJ"؎lRDzcu : ~w~TLtP%)ϋlx%TGU܏M5 0M15anv3YiyAޠxQDO'WL)J4CKvǰ~#ؿ)i$lX*Tq3QڹҘI~rRw!rA''rczLw4vHWLH:sBАYY8Gߎ@Is*+j[In~8+`ߌlqT\FTWO"`pʃTf 8*᾽-ql~E`*xߑ0f+_!O%)1l+W!M/#VD2PiߴSv/֖ 'nP"Yh(W=*'kS3=EK L Lw rH REﰼ3}[QoizF$DoiJܫw$x5OP{e'5}*p(qHau*.k]/]9O#`/襫8I#ׄ9ѥ?ͩ n|և|Q`jsK^ܦ^"sա3ϒ]0͈S߿?H: endstream endobj 117 0 obj <> stream xYݏ۸_a@)@s>m.Ai[[r%y7wfHʔM7H.`p7{g)ΖY _f|? Og%+U6_rtW%vؙ?}{Xfn>-,5& '7" qoC,1. V P=[0u߆os4Qx4f; ~Zmׂen2"\ti,+.$ӅUo ][TڧFڥbZYUsB!Go6O J"O`?v`a!E, a4Yݹ Lz>F/h_7E$Nu2ƍ /"e]2x.wSJ?2e>Q'mc.u@s,2zN^",Kn΅NAzۦ_˶ᡪ߱?tfk~"$]}Wʤ]n7{ Q$yPEФ2nfҚc^@ ,^ &+hhf՚^Ky^DjL_%\r`~q1[VϵJcT`)=ϑ#]^qj@r[/KHTͮ0j2qH"J@7O* 〰>WuzWuD,},KH%vtS:'VWP;g6{}'ր6. T7֌/Ϲ`wRK!Rxhvm՟1W{7ْN`A,5˹==aŌI{hmrm(݈ &0qCmA_ݜػwAtEw1wAJZVe}xbn"nnz4%D~i#@3,2a`Z# Б_z[tVczB%؟3{?<P^@i`؀.hHWw5؈?;ƻVDS.z?b:cB eq SPq?S2wzt$c?abBpt7vd"Y5_`)e3M/dƁ ah<7U*&ĪM;X3`ig śeMNCj N$bjlHa4 >/q qV:bR`?kR&G?Y=XmSl$Z{U^8QHj (GzX vd/wo%n|LL͘򒙅H%K'X"NԂK$uv$SPy.E2)J*" e3Hq0z۬#VPȊ*30pNw[7#qwZNS(Xc5 +garȐUԹMYȢl|?nP r $Rz;taZ:FY`Ͼ BG`YB?G*L>N犫.WS.dPN}<dWx(?,xqJM?`Q  XCA<:0`Dv.eʔH ckp? 
LyTđSYKu8IXn)dqT$]&sʟx4 iHu\/^G˂Lb"$Ǒ vK1]* G5Ht#TY቎'Z@&*Ʌy &n${o,߼ɸǔA%5qG %\a%Z"+ ~0'{P`6WSR*yi]UoSVa@bhq?mᐎ^0|F/T'&4@/&+1}a8V`E [sPÓ"5Yyn2P\CZn|s[ֶELacom熗Uc#|T{ky+;>)k5KݛcHQ;gI`eYp(ʢK͖|U뿞f,{lάkew?G$)ˠL:v ǒcqʲ_Nt0zl񗪚՚Hcd-W_cd" =01u80:6$74]{D~?O' endstream endobj 121 0 obj <> stream xڵYێ }WWT&`2.ɦHV5W!,dW4F%UECzFnb'7xo7S)D&"7Nh;-MԖ?oUUt7MO儣F r_ aYT4@?̟2z$a]?F~_cupWUʦy5S5аfJݡ=/FN)H~(VLjeHlwF}tGEc7usy䛲9C==$HyY k 2y Tv{U/|?}6ZE%+^{J>DGhrܒ%EBs9 Dgݝ9P>Iy$t0n&`VWa~_H2&0V+DRa`-7}SE}E >hI?SɝT.KD ,i CPU7}W6fJ>|" 95ItGM'؈ly->D |6Qg@ K,^-/f0(?@K~QJ66 H,bK>TEfiВ h WΟZ͛-cLBor|ʠ7"w~Vk*A$38ƒ4Oe=<#9xH;ǾizgS%ۣ33Us`\\SH4hByt 9NŢVZҊ%'I8΍M)$ɴ#VG]G#bZDg~x-{<ߟڥ~d]jUM|'eq)6/ﷻfں[rn)+Ʃ:E.x%]W>b$} 4?^.f:`f To%' |R)oEL @J:F/br$Ӎ -sL, .\ DJ/D m Dz[&/IfsB԰=" dgy<6}PVu|7i W ;}:E~ N5@X#Pm)^[%|yF._ &ҼAx鄰s,>kJ(s^4p!D;{ υVȅt_8]TxTSt" 8aawH|,8%*f55Uoέ D5qzSS 7+-8/=:@תb`v%)o*Ok@un%F735lg`IΡ$w^?̤4,iH5>h4a\\oTl[?h4ZRWoOgUM6,-]U[5#{/\*I 1L9EwMtg߱n9.~*[ 9肥?(\(uZI\{#70.6P8]"պ˰t>_JUk|o Bp,+[d"?Kbi7TD%i4~e] &"Lslܭo qF endstream endobj 124 0 obj <> stream xZ_s۸`JU(x<8L}7hD*/@J>;vdBoD4J/IEo?UE$nn#%%ݬlŒsCW+㏋%Sq.LEQ~ФlSmVC[4ſoa2X™XR)]G+ S]ͳXYmCqh%3\jåaI9.iW}K jo㳻~ Oa %27|J!'$:yh`!$$I3?+u`$BCԑDg_"3=rK 0GeM5lFt W ˌ伥ce{ 9ms*U'lP?QӓANO0era$V~dq|8WB<6hϻh,иf6cꓷl#qL#!g"Gb2tU'1c2qw@I XUB!3wGhxk: '#~Gh@ &J,$aB$2mUD}1P҄d/In)$R FgQ hܕ%3J`FYڒ zAuRG6Ȏ'_$b$e^dL"?m xK5 UOm돘SuqX5-vӥr=KͨC9$^a`Q6!n֐7؟GvEsWVw]MR*9aBSi v~%5@KtH kG~_muWuUWDžU'qY)mpE۹.7eCgUWK++&{Zm2NUYv :Δ^0i?UX,\aFUz0.4w>3,뢪b|FX{V֞}әnC6q,q&Sys&du2Bd7 V3<_ Gi9Ekž_OMddR ?Zj : SCYiX "q\7/с=3UI6u| &S rsS:oH;ΤWooZQ573:EQyj7b!RtXUpQ/֪s:wzCa447fB'6tN@" Z#4t-d9ܵ}sPiP zPN.ׁ9^{'Pa<εɛ661.NMwk2%[ J}4{(mњe&"dqyl~LڤѹB2.Plv S:wNm~N1Ծq|xCظ|DAR9Eh`LݥT&p[7 `X!'ڣD@G $(YV\p~/lܵS7ޝ`beN_ ~0ӳ=Y^OV$:Ν=ˉ SORdN-ǃl3"x{e憍q?tNo,ti7Fǒn=@Ta3:JNԑWu-pX{ǩ2؄; endstream endobj 129 0 obj <> stream xZݏ_H~q #m0hxkQ:/~ޙ DCݝ2$h&:y}/:I6h,$0fY]v\Oݼ4sM87P$}*",%"@DDeNOLBN#T;Rx:ʏ8_P9m͉|`Jͼ )B%[Z N9: $KDe*6{;n$<4zDˈ!=\ Hj|8ro#3A4 .=RT/gB03Z|D.y鯨6jjo޾SZz:UhLfͧ "~@7O7ӧT5vYN7> %K) ˜ӠY,KMs-+ BgQn6|9l=] 
j[4U7gj,v=QJRvהm[k 18d(Ιxͳ73ᗟۚ!54%qa} RCE ܾ6[[.rMR9|th.rM&,!p~ʀ"*G9La0냎WPs!$$r%,D1=%MM&v<0`V oʦ`gY^# ])ElU]ZWmgZ;^]sڿ;+O** d2*LX(yH Nh;'P|jޖ +Љx5Dv÷:)pÔ*n^ZIEPM8Gʶ}(ֱCj!=nyI4^yX1JUA<[)eL@}}- L&4L,XNx>1BZFlTjO V <!>2ļJdܗzӕ4 8d8J1hdCxh(6;9}=h蜨e>!|*t4+Q}3ںA#XС}<.xR4X1}VP_2PsB{+su%$\v"bYL\SգWsn/\ GMqV$'Q*m*Xq\xlgO{r'~;)sxJɶ+]Mـ ':TP4Kle 9za>Xi 8s $ͼUv6.}yaN>n9pEeuK`%l*^;xJgc>67,esEx~uP$#VP+OS~-Zbr>X}' /x̧ravȥ6ob|t`?AkAwɇ滍o#.yjT,HiJd-&3n#ƒҰqXx: LSmG-#?a"|.`j> )#p I4?[!0_`XrEHѡxೕתL<[WMGD0)y[}p > stream xZ[۸~[e 戤xQ!Mwv>hd͌vdɑL琔LenчbXx9<\SDhKbު(#m$8i? [,T}%>gq]5{)m?|~ Jy`2~VUWλjX056nbNν7%89o!h&"']-)i.?/ptqu9tU:~t*㪷zSOCݷYkτ?Oݝ*MvleJIeI }QmHɇ* phț$|MՔl'qWne}ٕy?qkw ]fF p^v߫)41z]R(T  ΉȆL-y{H`3Fq6fmým0%h̀{f;踲r-#ȑ Kţ!+[v]YXڹ !ب"TB?p3ѓ/hI+ =NJ'"0]$GKM z.,$!רev̕ed!tmcMx cкشh4;nYVMac=%$} iS~l-Wq&–mr-6!K|z@"~WNPumU>}0{D]DŽ~G8rUաЭ@u<.r)^[߬˦@k`+R$ц?7NPn+xfRLz—3PH.;XFXwCI* EցE<% Rq;1)n(ݾZ,;LpR R}i$I>vv҇v@NaO{3'XDYkG {h"ۗTky83ѻoӨf:wԒQ?8)y}kIR3u{I VYjvY z dxgz Ғ#X?ms AgSbz4lh~uIx.l[MX`0`q~IUb8e&/1֛߳:W^UYοƠ(7fiL>cB3am6&|qS-ybE@k]1o7 4Sx;ގW;j:9vy>@ NYk"Me!}HډP>B)<җ1VL=A5Cb SBcՏyY͢~6ANW 3OrhPr̥O_b?, &lƼʖ&񏦠* 79\Xg Q -D,mw)nʱRvO7f١c Uwzϫ6փ}199>lM|xes,3Z*7Ujg׵A UB`LW WO l-%r)f~-RN/yӒ+si`@2,9OȪc͝]y!bT[U؉p=3?C#t-#B8X:DwvSf"aF >g(d aJ.qtw!KS&}yEݡ8a3|*%Lv$/.аNcU!.bW[PjI Cg{Ly-{{8wm5qS,8{LېJ0. _)3,v]Z,Z9jGSgϘGCCdfh3y5ǠQP,Hύ}^M{SfK7&FT}i2J\I80wTh5c/(sjS 2])kjR!f\O0|^#աE0 SVl j5IإB oݤ:Iy̡6u =! T9z$_%C&GAAULw 9SPS/aaO: Wtx·nJVhbitpn@E2HKx:&)nY+ќnhsQD)^L%Nʞ3wj>t-E DRۼˏx9g )ˀDL0+.x4\M]^ƯBz}1aD xT"s >>. 
aZaOH4T`A }8afW (Q7~N䳷Y^^Zy`b8}93B`.wK_ uJ!ȌlƌEH ZKtcgֆ.n#s"s*'jk\X-+x+b}c'޵9v;l;>Cl17^9ħUICͤ_ѻ^ ߦc٥5sAL;q כmJ˻zے\Jz) uaؼtoǞ+mwwex(+4xϗuaEݛ"Ũy 'S-QMc cL"6' Rk endstream endobj 141 0 obj <> stream xZKoW6j!)>$& Yb} ˲[[^Tbf?dDb=*N%b?,ҨD~WG9U]E2%T@tuWlI rخ$ˇg|h'sʲWV$QI'ҰVI?f] Cۺ8-[0J@ET $PDgQˆ4PVM}nnX$=Ԋz머bܦO)W S*~ aY&L+IHyIԌa1 Z$Z.I<9Ncp 5,Q\SJ[) YzfG / "< ' ?|Xg㶫6+qcG@CnC/tp45k'BTOyB~Cl_6)%H}90<,]wN+읉 }S7sPb_kE⠽"o{&5m(E9+sJhjhP* ߓdI p#&d|Cʙ'scYZDWtܧ& Xk"wuU"DoK|qMYV}}5>IωHbQ <|-OrpCac:RBP-uᯥuʸl X&?P".l!t6]\c,݌ )tq0]oaY\C:_VYT!ٝ>rHw?n@f| 8f P+nm_7r5{Œ M!r-ђK~4?,`݀ҝ>0b̅+' ciKD=QOAv;8HKXGdi(d.4HЃ %WqLN@A+^#&r篅$V/@W "hv$w!T1D-xcwӀZT2R)2x`jjPeM6 ]v IL;e:8Tw&TPMR3xs%P dT!ֺ5I\Tt/MRbR1ގNv-i b.%%LC/,اb}~7|Qȍi|WFQ\Љ2".qYהNԥp**٢_ LTXd"\\9'XicMRk\Km-]$d??n7Tta:Źc/Esc4Ηb6pTPviR D'ojLP/Ã17mX`r/ſBGl =!=W9]ylRi1kn -=K{[\$ t'|ޏ/a|[l-4qRZ{?n;n:^ʱ>|m!zw߸tQ7mb+E%sEbnאs)a PVLuznoOcoWk_ŸWu)^o`"zv> stream xYoD_o^]TPA?Q䳝ΗھTهw{kUwvgm ,H8A^߼d4HDp~d,yGx, q27"b4a]ܑm_n/_łt5|_?_N"U_UVEi/ߟ몿ޔgzT;eD*8KG0%&\nVgHO[cP[g⛗Hገ$zu-"!e4,+W|n3CkOmcfN735u2dm꣺1oۦMC8v& J)IgH'"4kPSdBifG2a4%|sJ*hꗲ)P^MxaYU=ж9jμ^sDMuvlV(; " /75EuA*A֖J~ح,ٚeW n@$֋36> $ibwnblCLc;ᓘ (A&+4zpoLI|q$&t aU\z%$=" b7u"LN@/$>P2@|м'p@\k'LM eB*"Έx#D !f##P r uTF ~ ^.:Ji^AY W3W{̳`GtWmwC )uո[Mp˗˶s<1i"5Mvpl+m]cͥ|ֈj 9>BŴ{=X"\9\>i4u-/UV.?'vZ)+%\y]:koؔ-M=b3ekw{SaO)ߧVqb{CտLϰ8ɳ !]arwEH wb/]=c6/GS ,Ad0N:L+pGM[Fk^ERvy>ģC' 5Z@f3?)> q ~a|ڝi|=bؑhOv2y32qGo7xqvUxVW7L6+ FE1U9GpQe]M,DD1M*b>M7 ?ϳjV`xI9uDy!-y endstream endobj 149 0 obj <> stream xYKWf&i#  9X>rzfqP~}jֈ~TUWW}h*<ϢNjH -c$W^jX~;3LX rpֻ}ۥ,6_ U"asѠD%XyOy=xYB,L@2}T.nϵ74""?my"H'rtFܐPɝsk}%ȣH,<ګ"2D"QQ?FlY4OTy 4d:hO0'0(@kHo !% GϾIV]_D' _IT~Yj6NAC|⿺]_\ezU)}݇?njfٌ2ح۱i?UQX܌K4]aN L= ƭ Ch .oI01-,8qb8B2(ƗΔ(]4԰@%Y"^G`"z&lMnWc~[/%Qrrq=R4ܭourÃir(xAy3 j.f(Us "Qơ $Y{u7R$KHKn#sClyevJ+&llT=D-(ښu r5@F`b0P , T9lIX0\./ILSJ{=IAEt`)S2&p⻭@TM$t">m6 [?N6V-9BW8=!kR9eu2Ru7YȘDcʡЁ\䷚O=†\$V*D^ }߶k҄d[&|㼮p%"fp2s`Ce=)9Y"|ێ: 1?iy:exQ%odlnl]-@גN Դ;zh q` q5Zg޻-]a.4TmHAׯN )f7X5t1  G}_4%ٗtvO:5Fx*0:}0'3xBpuܵ y|mOmκ4"04't=rjji⟛7TC|grl?õK9 nQ˾p 
u>v$%=v"kfV؏έjU lh>W6j}]}DH!@q$ uҠO7dAu^w8۔[^pձ]Yy|WO)'[y\.y6o|DB)tIhn`xat%]U#80,}d{xǢ( xmhȢ8 `/"枠u] :~K: 5]g2tT89vG=Q1θh6Er&P:.!T-\EqM oi<<ԕrOScRA+b:)S9܈r:[mZp~+oMAڡg/e V γ"]^y'rt0"CU 'ghdc='?{i }"8T ^Bs5&X);uO2󇊏iU'$/>T}R?9H endstream endobj 164 0 obj <> stream xڝYI۸W(,X%5I<㴻*L-Ar$[Rj-\|[E"Q?\/,2zq]Tdz1L,7 ~yv*uuQT_yc^;_qBSujG\=}5wE˳;n6m>Z+){x@ǡ /Z^WE<}ټĿ-CMYL:2x'76oMZ^|oMѴyvO#q[7fhۺXQv1omK{ ?i n]Yu6n}Stgxu{xO<Ę3*=Ma"? YTZH#=ca$5U? P"q2R$8rQZ{ bXP2bApByWEj&r$yP?t\"q~늶}98_%&QfC:?fUY,x>b+7<y̢ >=兩$bW7+iFy >O QUۻNƝiήyQ WEK=7{!yL0w.ʆ> mJUJړކspbw:,']F-1E @I&],`4-?ٕ@ GI0pH[?N3lfur{܀nG"λ7sـp\u3x o I!"fG0 a"߬/3B)(rT"X}8B8=2 /X[$].m ry1愓5'UKh]=;aMQ^{p> LL\ T;u#?(v H}d:c!ZHJƔ*)H~d9xK*Mѓ/K2J?,֘{$+9qfd3p2BBE.o*Th@ޮUU `bpG(l|/Occ(Ѱоv彩EqFIHƳd>x!E]ٹ܉ŕsA zcޛBf~u )"א8mitd 0S9G,DU- )#ȡJ e 2%}&g:ќ zv1!Et04 {3PV4|;Gt<4nti&xx e10$a<5qyDH0Xrv=Yu%V_ne(ۀhMLjc*+u Ŀ@_!j/8h<-[)ƯfXV49I:w^ߊ^7[SarR*NM^ty=6$cM_՝7\-L[}.nUl>\&C߼zdb'T掑J9\zXΠu(^ f#þP~ҦfB! > stream xePN0{tyEB\!JD\l}HȖlgfABm ew9RFDyƲv:${B ^*[- vѵG _PĚN!6DǔetH^=jx!IBٕd쵭)և.%E:4P^b0l\q=N-K'\'UX0/Jq㙋щ|/| endstream endobj 231 0 obj <> stream xڝX TS>1prD(X<'u*uZm*ΨEyEAœd@aV@X֪V[5WZmmmgZ{Q0B[;wזDO8a^tۃ쮐E<СurƝmwwr0+S:vO/8(hdΏI IPO8qʄ zszzq@`xtr|x: jzRdazLtz֐ utoJ_O_>Wzk ߳ь3q&32ƛazpq)pLƅᙁ b32#s$f 3c3 Of!*,b31Kerjq1+Ujf ׋Q0NJ"{){UY`vezUrs9me?ptj3O~Ͽ?T>3f~.!.@?ze45$Ƀ8juiG7nZvPTP79v) RF d4KV"_K[쐄_V6Css0KQkPY,2TJxqStйSnqax+}Ӕʵ#b 5cghrEۿdG_vlAˑeۍă`I4s̶J**q8s->~_`v+;bq+V|%_Xq6JU1r^ˁ<Kj0J9x^ R?2X/]Ov(ߖ# Ј}v7$&C.==e֠1:.%6;*Ipp_Yj-1ۧx-i%B &`T:ռVlJUP !^ջi,3Mr. CF!~Kf3K\MY<3CX8^ Id%ė:eّP08{7 poI{>*\p*Q c1( nf/(4&~q: `}8]NFؑ~gWR85{],q$wgLs>l$oe_Z+ :^r -;-!" `5AUxp)^-TkҚ! t9Yi !YW #>i87η^w>} Fy?#bV3V̾]Pi"~S R 0qAkscSF)46>vꭋ GcƂ״4}|Sg`dp͓rg9+5ýNASp8HAojνEC/ "ѳ n`keH"@X2Da0Lu9|(Tgs.,p% -jLt (d;r sD-Z}LRTr&L*sV7,֓玶W쇦}nf=I:f(,4 K,aPSh~I z.Lc`l ozZడ5[Tm7rHhw+Ƒ+Tv>ߛsŴUE8 Hd !#HHy7d1xn&cq2_V`eO ?*իH?h$.7"Tع&p?TǬӅ.OܖhBI!q{xǫ!ǫpҷpDA^%dHa9gqprϧ߷bFŧ? 
d6R07\K"aHY;D g%P n*/qr LA,r5=n]i?Spu瓥Z_"n0:v6HMnVֱlȽN4@펩"/ٝUc~(ϰ?\9MU`ydqAշceBC[$ȳ*CoJ=Fɾ;NDCd F\ :VUak*x'ׂR ~rƊ?^S9 ݕrT5*L֊6+:)3W 60dr spN(qCa ӺN8Zȩg0 x @^M9{N#;(Zl6$3,b]S7Hutv4Gj,_l(/C`{H'd>>I Ⴣk˟ruiJܨE{:.YZs5 ߺ @V:}N8g얀!3*9nG.Qͭ[-:H!6ɦr>[]PУ{Tcx,PXNƊ|!$2x0 'IB%f#YO_פ7d Mt[F!}{NJDixo#o} A ˶c{i]}cQ=8o'jkP U~m.-V ZqD;0'$ Ngɫ[V\E K - La519S` DTu!}+pp8N$À<+gJ++^`6L5wReT!8)ЙuF MfD[<=ϟ7qøO|ԩ j{DX^`fY?BWK%JK d-*ϛ=' Ӣ9=~ƪx6K48w8sc2:mMWӁ*=сh 9߷ 4{ܴk- q]J9VR8m#JKF(rIMmDF!pRόv}?RUWTT 2۵*5pmly-mlN/Җөa殓6_);Zz/4c16E$$gr3vCv3.>VʆQ?)WwOvE463+/RkU4Q6Dgj'ɬٓ dz< /D5agoE{dn.ڈ\Aް*>8kI>a.U\{)<~d q (8R.U%{N8m퀄_9>[%UU 1e_#b<ȕ]|z75QdT/8Lw"T6,Ep;v4'Z2eT*bH!ȆCG/ḧ́M-&-'Pḵrti>܊cPYt)S@hx%V{`p+{ `ԙsjruLiz}N&+&1Ԫkj>';v@"HPi(77CTMLׯ&UYUxq^zɊݨj{#Z7٩EHlCT]}rV'tsnd0 &RzGFⴰ c9 endstream endobj 233 0 obj <> stream xmS{LSW>"sJ-kb2_@\܈XZm@EBKho[V" cʜN #.jE²eY5׌N㜜wI_-L+\ek69)!TY,|58IeҶvfBZad'xwI7 Ƥ8./Ә6ݴioPG$% :LK$3%FXNjKĭ6Y,'W2RGtdnYCnޞ*\'Zi J0, ۄmƶb9X.cIn\["b" "ޗ -@AKݿSO(JBPA#Teql;ig;9p˰ٓcVf *ŠW(\M_qU6#q MiN.Bψ)brݼB o8{ܧ(?nhsr_&/0߭Zh`fYO"hm|  ᥫ8Q_)#H84!_$~HV%xzc=Jc=9C8v{llFd]"a0DyO)Ry`z5;Z745 |LņC?!6BjJ)נ2rjEͰ%}U `C? A ]=6i+ClJ cD|!{3> }FBA:n16Ǥ4<4G_Cj& R"? Z'CCG( K#K6 ~ w&P`5C?3gPzD3(S?{{G ]οS^;p{}eτwg>` #U;Ax7Uae]֢b[)|pu3vRmu4]8oGZ]-z !QO:#,E1Q8'p; NaB-X|YTw_)2E*w!ETAA5ـ1p_'? 
.o"ŊOf@rxĸ)u#^u Z; UFQ&ȁ=.Syy¸<[8G`;{|q}y3<.RkZEEBԢP$Zh(r3TOTrE1M3Lq/Vĥ[x endstream endobj 235 0 obj <> stream xmU{Tg*Z7V{[>GE@UVE(L@` $ o G #X W=TNvs9so~'\]1H43j-G*%"RrWp0nQbv[ŷvzɻ0Ѭ5@H$ٚ~"cOVjlE'O#'VŞPpoyB-S)yBd"YYad<$rCƨ޿s B&s|X8%aX`YlL :{Hl:.bPa, c\0抅aWE2QK)qxO?sC#'j*3"BAubISFaqPp su4Ho4@+ՕVE85hrFբT1 )<#e QPC,l`&y",?x <ۂ;o*/ \3=)|y%LvCh|w}Us㧠z-ˎFj!^g9'{U-Bb{/i^LjJ%yul&jT#GUfxv Fn`8Y~6:_HH{6[ %B_}OlP\\bXA0hqL?//ayl@z~;)9Sj6M!P3Ѕg&_ ^X3txǀ@6񸶺ZhQVP:n C)РG1;RFYk,2PK[S?JĮO|H/:m65> vŇnSu,XFV3cd<]/!f&d3`n Ž଍NpTI%yC0̥eըht4vO$FFwt_TClj[(㘧9QxR 8:wl/,&6Zkj])n\3_%-q~!ȵꥮiAKO9D}OC_qQ p׎؄s Zsʎ"~ f<iߞH&FX=dR-yRkB?הJE8>dĿlB3X<\q<7ZYKM< E< =p@(ԡ(̆fj&$?Q 'o8Xzy.`Q+7~W+l,0:3g0[RVjՄ:o}l$4 +86\PWTmSPdug8< pԑaA-fq0"dw~#p,;' 24pFK}Pb+ i;VWw&jׯgUܦ܃{Rg ]e1x2A(]TFŨ f'bLC,(sMC \dqsPݛ4M-pUstMVU3Ie:܇52& kN>8.r~քJ!4p~/ o^;M^/{)]̮_@ mݵrtWf endstream endobj 237 0 obj <> stream xڥW \Sg!{ED 3 ֭Q:.T"*ZþaOr@0nVjnU[;.vc^ `qM.KHD"ي~~>f[=/,"Udm[JxB$< pbA8>!')J7)<~xܦX('H]5oW'd&EEDxx͟`\rX9cmphL|zrLGp.x->|13>#$,2Xij>~6n|fߛkI bݸߊ8j5q1ZC6PL+%rbEc)rQxj[j5GͧPS/Pɗ˔ ZGFm|)?j#Om6S[j+@(G(!ጘ.k;89~t^2-8fvκ\$.ߍSzbvlBqDnoc]Imh[T*'IWO+>e3*|K%i=O˱Ru!C k8}TW=Ga渚5J8"GSt5>@JƧ+U]rt r;˱!J=\RPILb}źê6i*POo.PC!3Ʊȁf'Q>lU<-BQO.֨5jrAf8XbIUV c[~%3&@5qcϹ,\5jmvQO<h$|?ʞȩ9i`)Ad3,NΏ;C2ӭ'6!J[(\%PjN0+)ÚT TKRCqeխ?!Qz) ]AWEkx0M萤S -Ç!:M:9"$ِ:ҵ|:|ԺĺPĮ]뀃hR(%ͩy9sDMjdhDipGc×hlN M Ql&a kIm-3(&Vu9^w֝:_wOSYMyE,꒬7&I4ͿqNs!IlHommß_}p4OAHށa|Q+݈/ ͧ\f>+K ^$Cf3uLz{62L MqLP1~"pGF('-݉=wNq?f5̔ Ȱ{~~N <-n#+K \5eQӽ9XS:kA7J*_IeT<OarE4}oLbD Iv]G8r"LSv@}޾3_XBIاHW=e!V +*Mw=q,B0CZˇrqHhU ̜U>KQVGf2ʀpμp*3 oٸud  V ƒx|g^TUT 2k1%MYql_5bnm܅Y)r$b~rZ3+zϒ4^X>QF+Rx@$ #hߟb J˶HFF=Xݣ,>j4~ekt;56p8'G=݊rQ X 2k֖U b*/K 5j߂:*m=#pJK:xBhdm1ݎFue;OGN}5b ,9;XHcCz$V9:-}=<.uw>亓UMa?BHT'$͞ fa d3R*dFز+|ny$wŰZBM;`l ߜN聅mIh6<ז/˖#ډsgZy!2?G= AЛnMmQ6be 3ݟqE$m> Sz#^)!/_.,,)U4) QdˢW!69Sq`)ȒaOCӪ*!f$: MPݽꚊ#* I.w.Ѻt]MoW(""^Orx:ZOkFNbhwp;L9ű_e-)

> stream xڅU}Pg5a LQGT8NZU"P*V"`6@ " ߚ=ړ̝eCk_7j[:N}ۀ-z7n<$!$I2q͊%3W&':~^tZ6Sx5G|_%cDV&rgi=h2Aῐ"dB ޞXCDTP8;&6%ȠٖkT͎<Ֆ"UB*IZJ۪JIQfF5שdrT|jmzUeikTRWa)a:Om\ PYD\K=aIN  Rmb9H D2J&ktb`$!'"FREZceed:yoPTO*l !FЭy.m>O{{]/Z* ? BlxkR3)>wZ8t/trS8e&xEcl}ϵ heU<E R4KJ;G&?!6(zV 4ڦz6MźfOY] @h{ pDDL! YܥA.bD(;4ӳ= -='9| M$@^󣼯eb"[Ta-V/zۃHpjSb:P@$L\!S^օ3;;Sav CFҁv[P7oVn򡱔-z)c?_V-@uxP\8_J8; ~X'NRLu6[4@G'qӤm)(ɇ|PU}I`~HE {z W]J1%0ʫ-35AS/8𖹋vrV9u;[[8Iezf}>__߹s}XTf-˜p^s5EKD\.oh[WX݌ԣ g8{;Q7hE.^.gͣ3fO};tQZ(3ōr,3LA 7S)Hfa}o4vF#wN7\ I EX#ΗR(ɃSN4ñz(< GG#ˮg/&vT5?n\&-$  l)!LCp|9U/<0 SiT!l =?aAp$I#~&B,AyJc,ڪsu4%Um3(ώEFP>;)1a;٫ f҅ /,ik4ĊV>3MiAP>Jm[bZF 2L1W!qW4Iǥ;H'OtP6܌”П)pf1F;u+>ugCn]gT6/(;>o*[7@K<N*?][%.˝ɦ@VBEF0X^ ?$w}ub49f?gH<̶Z,xuq)F , 9,s}͂.x[OrgG_~&=?/ +b\[=,og5Tq8Yt h~Yh'}bC1)X*AI.ZD$l3"7J5fz?Um>xY0#PWT[NZ*[t Yk:qUgҕlㆃE@E,Oy*.wCQڪUzCK%-Gb!cCAH0aA~@HHogUumMUbǤF endstream endobj 241 0 obj <> stream xڥYw\׷eagʸffK5QWTĊ" EzGD`).{v&.,=vC$D,Xs\;ߋc3w=ܑPD"Yx<9c.[0q¸U^~!i`J"ް8Xޠ(IJ#W^jO.oiMYI$2gO0eN`PT6/0I&L7\gغEڏ]5`.$7َ uv ]q*GV8=Ox_XG/SCc-&ZhrzզFER? EYP ՃIR5R)95zRP6`j5NޤޢFRoS18j<5HMPSwit=ʞCͥQBjZB-Q)jZE9R)'j ZG6P; bPKJ+)QJnYDH-YXX򵊵*ePcǘ=yoV:{u+};F+O7ͺdo \7j^Zb"S_6Gr 7ʹ Ii%7 r|fBMRZpW"!&^֥HKUBMg.LFrjj,ӡ| {Qp-B&ܕ=;*(80+t74h4g鲸8H-In^+hQIF*hPԡ bLdl3{⑬ hpL f,fbJe}oЛUԈZ+^JtyLfaֆR D'b#rFȇ8MT,0a%Gk#c!m,$@i Yo5|j;p.l[Y zgO 僺R['DDO٤R "Jv<"KRԁT>n<K$PFe`>W %XE{5k 6ynf8gV>l9>ح4 Q; LH{}Pkt!/ivXbl:-&*au%{k/˷$B$D%3x/^9+~!@ku8ymPQppPرQS6Y`L2x:< 44 7x`Z ZH)z4 ,O<ֱ]թt. 
zǑEn3]XD,ᢗ_mra|K2|řП4|$IQ5rE+W_І&Ic+l [u9|~Q `6w~LxfNlG|Yz+i7+fzfՒcWo< DmvR8K![d2G}NT06n"'3ރ}w괝;xx'د~'d/%'LJ\vʞ=Ewu}dw?>A|EI?b)nv9\i\n7{팩s.rpèLE JeM4Pz>9I`ݒ#?ĭ Q[6ms ȒMeg_ߴN=́B>4?nakӿ 񒏞 AP< 2b9Z`u{qtsA$6em0`,Ux2T4)<CwSdwDM{eoKx }bHע69Zx֫5FʑK;xm Z-`PKw"\ ?>w]Sa&AjE4j,).cs`}pSQ} Zo f1]q~G{T>&m2?PNop[k7_9݉(O|NB~/|!e|tz ImC`Hh*D[NWnZ\WdFkK ɷp K'dn"x&dYt]/xy˵0<6~#Vv8w\ñ԰29+Km1.J9q?_3I+C._W`;Qx d.U-xeJ(] !q~HxEta.Omc, 8<aW4O@b4MFyr\ߘ(堷>mŻT]]SRPgf4FA΀)LʧPYdA;CI Oo $\cZuyC7/MG,GrбuzZ'<Ǘhtuz,ǛNwsV Cã8,E92(1P %G Gx4$]2ZzY]?g]yJ{YWJ$1 IQJLl ړ^m#d]<<%fن2I}ɷ,dq8Z4RJ50z )gS;= 9JWmb bEpEh{ʿȣZE'46θx(b0ٕC4Ɍ% 1:] m.]^RܾQ)/] XfT't;5^A6gL@;;~WdlOB!PpGPYҡ?Cz޺PvN)~: >ѕvPkn H}9=C{ȁ8G'Ъ ŕY_2K(a0f+ .jW7*ZWc80P .&sD(\>apg<^MڞA]uUkVr-Mࡆ\YaIc`\膐=iE$_#pNIVA#7 8s1% ΐĸ@w`2%Tk {YSV{02L,4wo?~C{s*_t<ׅ]k Zn9ҤSc }JV4pqbt'*|+#z)˵._vZᘗXuOkaEF=mER6xXu){t6b2[d`HPY=%Uf+~[!ss[<NJ%"7'L]%ZZLETI``Tdڏ|qw;}+X|EKc *d-wiN_}M$A=0(fV?Z}&c+eWwY HrF<΄b.OF2fh<xbWi ^e凾oT`xhlo30H}u󯛧FP\]V-=x- ёW9υa<͎zx` VL`NS9F)u&C0غr;v4jhNٚ阽0LuMw9di ~l`skH&36NOh=`')}=:* 2&JY ^7inEfLnx~GK>a(&4닋 vQ׃R>'u"Hȣֿ\ 1/?\rGy<\)!2 ۷5L8wp >Zյ/ Gx3:hv?S6.9|c{;4&h')hz&L@c@UEf"iH*+Bx BTHFP K Ѕme=!EQ3ըs;'F("}/h]o/A:HcHrWjz T*+"N 4FDJ|^Rn'mwNxQh xb^JZ)YilSqb,*IS%Ƌ1"k"-Ƙ(b3?Tx&"E1>ogH*ucRWm\"%BWaVA7&T]{/G*2Uh ;t ,FШ-Np5[fۼ`x]Jr[XRddkV4@q.F;$/Xͷ&(`*Eon:]3g\aq}tw` zH7> stream xڭY |SUi >Aq^߫`Qa2: Vd-[)[%m6ii} m6]hi)-K "cAeuq|~7]/s1i CoxbQiώJI{}`؇X27𣿙7;9/n)gk&&x})9%MJJH\tB|*0:?EbDyEIQim^%=2)0(=-0:.1*%>0=>pg!wuǂE7gF`f%{XE!!Mp""T^b1xyģ|1bL,&ˈ_ˉSj9DxXOl 66[Pb+N va."MD{`?GL">xJy9~I'L޳>rSNbZt㽕^873*flN92Ԝ<6wCCPorxx#7Z\ C{CJzUtxS`U+jN%МBPY_F3 k@ ~]T*nr5t_B͑&6{8!AP qq)T1/pd1efѮȄVKEԑ|$++/q1q'1C/(Ok4:JOMw|oxK<nu. &#>eKD1YiCzs#6).ACjMJ[ E(,}d壛hhLyMN,C v jaXe]$̚O{+=lߧ~m!I]:5)E]i08g6\d)h. 
>4ꠂa5PǜDuy Z_@uE~4, ֮k (Ko4J5.l&Q'1ZCnSxM jp*@aPץ7l Ny#e8cN8Z5 ᐃQOjUܬ+\,FӁ'1;B;d0WGy߃VxxkŠenJqtn)a L6&z5Sm BE :Vn4׼2(p.ЪrϋsiE f ј<"AZ_,vXsP&9 (C~ ΁3N)Tu@J&=(VJ Fc a{Ѐݥ|o8)'0.2=p',-Z_Y1rs`/g FsedZZA*H4RpK(Lf,(Eתdh\Ae0M͕bm^B݁y;/yP1^ y^ J M@֗BSiT2q,mRUC  hAnt\@w{pP^^ m2&l5H!`=W72WJ 7x<2{2JhI$Grt#s\*:~CPā/A4DbΛ$[-p$VDDFE3]c)S Y+$RTeRfHg9WSF eUUzCn`l (|y9jMIw#2}~UkcZÄY :1)SF]T *$d($RhU@Ny(HU]9kj12C84yW6Pl-*D+Gn6pr{@4C(G4 /ٺi Tkh?y ~#&&t>!ߛƶ ohm, FMaW @bl8Sl0r.37l{;B [wڅ<|ؿڛЦJS-.ؑ*QLպh25L,Ǡ&]$y*7#3O* C H%})I5Eo~mdㅠ, KMƻeVf ۠+_BfL;C{сa>ۉJТll[RJƤʂ6%G>o^LRRwt!CپԾ!ۿ}yC{&a ܵ\URUjNJB:)u@%WCiRPR4\SSeXivQvpHl>f3c<,Z4Mu36Q3G : u1&h08!,pӕTtP%%r%c , 6 : BRZH7PGa;)_P @~uH[uQ[|薃d`wa_XR1`bIQbY±>0಴Y\fNB9 '7}N#?F~~ߣ'ؿz t6djbMS6Zߡ(V`=frXdЛ T[k_5v h¨b!V?\<(UBƴױAsґx^!b?ށe?^e8<ʐkTbD,PA j57Tǥf\ NV[fhZnN(%ij,UTt:j MQ_΀YnkXNum.`\j  S t`uڎm9 asN!0hj<7?a Q;FZ[YǛ2 lyE,Ѯ)DP :Ei+fof\B B5 c`#VL9-e^-UTIna8Vsҹ+3WqBh;ܧ8)Rs^ 5GᆫWT 6r4yeg#¿ )7It ORIAkAT^,19aTʵZ)A#^+G׽&)qEaqVdn 3p l `k߱]@T7-ma~]CQxMJhQ8YDSdXV9Nt: Db%N!wddF>N_!?Wwf O==۸BO; +܋BEʖg~0rv8IwHi ^3?B`WGp {ݎ~<"hL'jiWF8{d>?A‡ _9ČK VA ߮t E {@Q}!CR\<72xo ت;S &sTO{L9~,U{.aMhg@k$O)*9PC;:b_ JI@wX3Zÿ/ +1m(E;T![bp{S顊zgS@ڋRAq)c? "bH >φV5KK}%4Z) %AkguuuW\e72Wx>jpM *ЪYTA8*Kn&6u]vC掵R~mVWkC(INzIy25 /[*)ڂHNUs$: M6%ӔNb 2j{вr*? 
ƲS}9 PJ6p?uݏo}'S7Hw| - kE^]6ѣ=tyX<BdΣд26nMvVL'le6vyГcb1c3{ 2!-s[e:'O)RX=YūҢ1nh15CtHc-n:ښU|Vxh B ;6-;K~myϷ`a 'f!vu)K̈́DIzqKf^jjv2@ޯdzF=Yg;64C;mc1kЩ&D5y<\;XHٙ¡ dHN& WQ% dzn3IeZzc/?4\@OaE_B`UMti5}dC ]rgH'4T7 DYM~"_*mV!e<(,A fr.ux71;~hyxPx>IzE~ޘqƪq[t5vUc}FЅF1ŒMa,Z27j%=f<􋋕g5qz)`TTS9R烮H[c \lq^ǪZT'rQ"=?6tm&n ilfH'K;Cֆ֚>Uw C+΢)Y9;]YIΦȦ*2TK=aS4kZy, }dWM>3/b[ڏ?fĈD焋_2B{RCQ+ry 4Pf91L炔-Lo=j-wow72МdOj:ؖޯ݊Vd p#h㰯 F.;3A-eDlyeC 48!bq0A#TIc7;!y\sa= ¸ef|R~H-1=%dC3PPj/mJ)Z{4{>խuOʚr}شҧW2Og٥Z,mq|'[SʴЭ<OCyyԚ@v[l0 Kz3Ǫv&||'rF]DU.=uSlӧigLH:}zw5XaԲYہg endstream endobj 245 0 obj <> stream x}V XWPhFFC9QqCvFQ[QVeQDi4""(V$&sLt\^j^4qwʹw08t}}ss`lmBP˖m}'nkuf&#VcȂl۷Z4vr (ptxpKR|mܠ [u6hٜMKB9蹬ҬC?( ^6uWݡk""tMM|#=-7pLs7(zᅹr5.VCwlp̹BɰKgF47iXLMO5_[D%o2ssrrY44~DfZ f o h̙uwZ5 .~q{јtGE8?+aY$!B ]9SPYSh,# 2GH(k5 DrM* t/ix^Le#/pEK^1gCx8?NI~ )p"'HD%?@F< >e洨{@TZ: Ku==/T)CO8$4"Id#YNp,^+ozSt0q!;;/ eکP U<)i!`n(L0\|x> Jz/OU&HrP$C/ _-.ry"_ >VYzź]˽BI@Mqd;5P x4R-gd^9T!c>Ȁdϝ]>Oye+הPIoH${.oO 1󐵽 ?t0 sځj]l3ٸ]zJ D +ސHsši`j0?pY5L8S.*8ZK\NJeO3 =e^LUɚF1vx寿fњsyH|́? 
k'xA誄7tXZgo_[˕xpʩRoBr8οw:= FnX5z5'Lu\!< +?Uq{eC*]f5CB$t`.ŴQC8RA zD s!d4%)>}2;t۠w%n*IkgTpcyL0Z>-+8ՁAyP~ņu72a4zpNtVB=jHCzaۀ[wp_˷Ze RGa:P֯N9dk~U?AبƔVk_<|Ccltj|ͧިQCGJxt}L ]y-hdJ7Tw W߰P&#D&n ?U׬ۢ+n)ˁ%~Hl*="BTxCmyģpp*+θOe񥄁)NPp?p3m(&)V X$2lՓ&MHJso׷\) JgۭxU6zv%K2CXχ>nV펿 p:rb'%E|FCLbevղlJxv] 󒊌0^WYg 㹕C?@ٵ,*Xg;pt2we,}_t{nܖr3E0ؕsPeȼ]&7%rhPf W~X,Uuo7/гohpr}{+?oK֘Ek/J*dLIq"sX.@s߾fǚE m7M2hN&&E]+u0Fw-,3Y*,Ƶ%*amP]P& Eհfɔo*,> stream xڅX \T?rv<6X\J%4%q@BpAPvagXe]%3%Lr4WvoIWm9ޗ}03 3%iI;E#@U9ZԞ3$Kgc% .zmwKf0TIݰ@'貙2b]h hSQUZ̡̺P(H1d/l?֦&`Z  K]e`Ņ`1iJ pU }@uPIV܎{Pż}b,@5H]RA-{yb*E Cw` q ' uq6 4<eo)VRzbc fK/mKon,,(8T$imV[5hEaWksNM"U3n1L"]G pgl)jٜ ~Bf•1!A5qTghۜfEo77.|ƟȪejFNtCh{K&-n~G~h'in=IuK'LzeO:)P eHů:NS^7čg$<]65$xd6*: AG%ŦK2 XGIaZ>2dF-!,,**,!MY+n7g$dE $NzD PHdPB "Q{H T& 1'4l ~15c: qDAT1^KHS%١/Â_E|_ᮊs::AfJfzANT pC R9IIm ;3J!k:A)B:plSo@|Pa<'mmc!޾a6Qke(aJuAt:&w&U*,"$}͒!J4 `wF_0ۜ ~桰Uyv-RB-rl?J1LCr(;"E=S=Xg@avHUT : F$VvKɒjsCO8Z.}]sٺuA:;cSibK}s-y|!jmZэD$n쀘CGd֠)F6ВpC"fj2k92k* [ji4'QĖTFlY?Kaa)-lC"=S#FbM/hr+4^׳&Xn cNu2슆9GikJ7'QC8- i>*@׽83Z4~a0CK;`w\@00- P!R15ǽOvCVp2wewOK}|WyZـ#k >Rc2cw:pnC9>[RZfnb#yb`&ڠp@^n-hsF{6Q,W-ҿ38ҁ>U{$͝GX#V߾+mmkg ~>@5ٴ{o ~+}<^Zoԝ8HJڶ95 Vf ]_ HL!&K,~*N}̯+.}xByFOZ^7-./%wo-vH_?YcMc9 b\MX?:zF.G^7.\is᛺uo_d(PTpdfTB.`^P=Ձ +R +I nB2 1}5Ȼe`ii*Ȁ \%yG"7٠:rM`&1ܽn>ug|{gϿs !׊=g85]]uU0 ‚vm%zSJ\*m'oX7^c+|\kG79W'1 yj՟?|J0d# xRj#:J˧)'Ƶdt+ CӃxF&^ᰚ@uD~qrX_=%1}OowZ; [9'G0{?uAa`mPltجcX?G9Ě͓F}6sg\z+LZ|I_öC;}agWmG}?DX/~sxn8L< (ӆ4~?ݧ]&3>AF5x!A%Gq xo]}qripuwAL'ruI.8a9{ۀM1I_qc4K$^Q LC]UhZ>bXBk~&\}P2Eh28ʴtu~!yΑKp'tz/_O~K"0NT<`i80#R_ U>smR:=l5%@oFs7H`fSqѸ ;#i:,z5_ |W5Q: hדBwgvRLE xIoC 0#Euh5;Xa cj"2e6N~YtZhniy!3ZYx0LL#x""gZNΘ[O:qV?HIKJK1LϘn+~^Rq G橎1+6T50`k/V_e yU qPQo݊=Ioѓb&q8=vC M Nɴa^ !c49Ċ,0KO? 
J*|4lB m^i;٩)?H@oF\zmYZ^k?NV'ɓP&PlU9Fɳ'Dž{' 'jtTj?u=EZMIb endstream endobj 249 0 obj <> stream xڕyTSW_X^J ´jĊӺ!ZҢcbYXr"B `d Zm3APӱ:Sjh173ks9" ||@0ukYVRɜLLr&,y7?wA&㕸=ɽ^e&Ȅ7#""5tL)~#"b~X^ĉ:p*4MKTIUk9B NH HH u,H7`sgFi UFB3whYƘAWc tV3W@zzt?R>RC|Q}FKZ4z\ngѪ"|RI@16o`PU\ q=uˍ%خ²j@wGp2KSRK~57v/#+uh_4^#\= r1䐔&*CQJD,]&ߞs8l-\}@ne"8_DlM|w+_mo~P xٓdEY9?(Nq c0 •p^gD/I^A;8Sϝe"A'Qo~ Ftr];cvjUe2Y9#H.,A3pheԱ }Hs;jZ8ȋڙʂN@Y1 Æ0n8brM5z D-9==H0lWL;5gH$I;78 6l+|_+[ phmX*ԁ"Pa'x4̓|xl }vxwX:6Lb'oXBԩ0D6C#{Dl_' aXy47  Bx u˛OM|R{(Y%hsaƝZy!,+V-~$U#J:Ģ5xX;S(ӖzfwUXHHh 0۠-i\;;vFO_oS9OOջO{^Sj# -a@U9U >s P%t=|Cg!M;zCw!,-ذ"6v୛_~yeg/qGp(qG  R)$WyY .sI n؁#Q(u'8 z*C>p>vr.٭rȟOd)<=e U'$D*tMf9Y{ $=:HTPttUjm OA"MVl}@cV[;c%A]FE1"f+;4/G-Z cAÜ;}%\!A)(̡}C@!# j*;S pWlYiAzrz k.➿N͒$CFcV 8p( `iQk"3 =+KL=u{|D:͗1 .: > stream xcd`aa`dd 74 JM/I,əa!C<?/*UD~@5`tb`cdͮn,,/I-KI+)64300q/,L(Q0200 I Nz ^ٙ y) ^zz ~@L<ԌĜ44`נ` Ѐ`M=t~E9@g1v&FFk,\{c ߏ)ڹr^I7GWUUg n)^}=i9[ ?]X@ߡ z&L0qpN>u{= 0a~Du z'zpT Edž, endstream endobj 253 0 obj <> stream xcd`aa`ddr M3 JM/I,əf!Cy?wȰI*BD>@`\+SReg``_PYQ`d`` $-*ˋ3R|ˁ y I9i i ! 
A Az΅rs`4dd`bf`fdd} &,Y߽~yIww[ʮ=c}:{} 9O/>}h9ǟIYurlY=> ^luͭvݵ}ؾX.~Wo&ڦ.^1gܖur._~[+n_߭$;9~<{)XBy8'pupŽwR+xxO?r$^9 Ie  endstream endobj 255 0 obj <> stream xcd`aa`dd u q M3 JM/I,əf!Cy?aU |" H' ]-&H9(3=DXWHZ*$U*8)x%&ggg*$(x)34R3sBR#B]܃C5Н &h xLR7t9mJ`W鞱F.ݹrު)Kv_=NE.!P@xP;԰ߟgxwS?0X_4ѳժ?wk=9jʕ[u_.oߢfZ-Y1wºrxa[+n'Wߍs%s8~-<{ XBy8'pupoǽwRO<<z7O;W1+ ~ endstream endobj 8 0 obj <> stream x\[sF}ϯH~J*+M2$35h 8%Ik!Xh ̇WHA'XWE-b`D//A cBXAe"|0*\V `p"iM@>=K,&,юdCJ^c I`Lt2$H$9yl񺄶$yZ@fI0#yH-r5kPq!ה'H¬0aX7&Zx8tnI/lӌH@"A5<[lH| O({X,Zԑ'= ~9f}8Dp:%<퉗 5_P?KYúq'Fi$iYJ7rc/jR%)wt(J kEq+ЫHנM$Ȼ"2.i$Eč!Giآ!};5$mx Q4GkԒeGK8"3T\qFXWEEc;IQGK9&8b%/!MQԤTRq0L,È#D'C q+& 鏾D Sq:b@2+y-/}y}}xz_.}u:oVgU=G}!?7;ln.-jb:|%En~+S>.A`BJdv 8z LPn1IN1=L0:;U&q{@jpe |3=j, ze h}'H8ْns OysY^%\ixͰ61fttt!YID`ns w,wܽ aD6 |1@My4 +gtsB*MY0IJN{5o&yN_] tR)dbV-'*pQ(6;vj{=Uh*ҴU0)XyT 8XUp^ޛ7sE-ybT-Yi<*ȥ# LK@EBcz LjBD s2.e 2HƵѵ8T:XAڡݣ+-Be:Tk==Ћ ]۲.lŧ n>NV}LvSE鮏aڽ=0lvKEexoK%w紙v[ŅP2 ے-.T%7m*lq _c0>S,. k#S%y ];Aa']=ڲ}:_-Ĭqp} N}}>_[1gVܧZ""Xi)~;_ҌQhvB]wF>i9lM0ćm6("9ţ t/OmѴA^fӏ!OeВK{<=dzܕ.jdT}1uXPK@h%<ҫF)#+ynq(*mr>{1%"6(&]?VT2^ܝY~,h!6mPZ3E,{e")4a:?}9MY'MIJ6/f 7ESN14F0DD>-YNmꐞE)d%&kg[1FKnvo'ܟ<۝=G;GoǷUzXnAoBz3#xzi%-?*pxԧʷl-~|JʚUytI!Ώ+ `Vw ГSkZ||Js9[\ꠓ:WHĤ:6gu|Y 䒶I{x;$ 'ť -pgDi'<%bŤSOAaL#0OXicxcΑ'6W]Ց't1ڐIWGlOmV}T29c |{ƍE.[u.c zWB%tle$Xx#67_..7q&CGU/ՔS ~OJĞSJ-seOy8b;=w)ߟʜSSԓx*s!|q>ק'ק|,2g'-D8*â%ǯ}&B@`FԬD잘i%VYo%" $\5_\4B?Hrj7ED=Hz"/hp߫g#Ukzy(Jk Zk0.)H .: C_/WMl|evD}:l_"D)uAGЛcCMnAj֎MVz8uV;{EZTl}\lRvFuHT0cC~ @8Kq{g;KL|]C\0.:oy()8,| }*G:au E]t*s*KNu*qTcTKM؊v&5-_\;Ƿ|_䡟<05[Iq֔]ke^T; ݪsOin;.!Zqs/Fre\2,R3UTd \h7wE /D/QIT 7>S9U:c+z 2L +/ L_Hԙȼ~w Eg(:CPLnp<Xjy=13h` E]Z}ڔfzQRaP[]Xܡ,Pf::{^hԱMHeIUS-WņgW9.7$UGoUm-겅V-ז-NN sf?H [Rݨx4ORi|O˾ψKiHk-f5W7['vv}onV'j^[n.KY+J漙unsdl5ۃ;k.-׷W/njk_Wl!t&%Sf~u&%.F>Ub7+֎)Z#Quԧ\Ͻzy,OӗI $=sJ~T|VF*W~(%trmN6&ݷs[)x\įDi!ywvN[a*Zݖ0pe)lu}4Tr{'s;T}Ġu7|v[/4g:@ww mQ8/9%+:E[\6″mQŦ fC3+0mroTA"p#Pף!_USYE޹7|zqf2s;39J>З|:-J~\)JTLM|-Ƚyč^/S!#cpvo.WK!$_^T֩nG;g Ӥ{)k~MyT*LKX@i>?̩w<`0A^8i\ƞzB(ӧ+B`TlLoBN0>үW/1 endstream endobj 257 0 obj <> 
stream xڝW XSg>!s\X@"VoNUpZDzB ;ʾ$|!! ZqZqtb;8'I?anN<,nfET9v%ȂcM %6!'1޲ B xizi^[B SVQcZg?KBێ ']]$٢RG3!q@I ZeNe+.܁>y6ƋD"; s%?&.T&y O$jGhIVRu:(tҘ((Jۺ~W|x>Ex> B&I&f-hE]V8m&t|4dW&Ȋ;i,Œ_ܩI9)-^ZD:t^j;G!;6\,;hPSv\ؕ'ok2L”8ކɆFmkc1ud6#~8};2υFO! )I/5! ?֒h2Dv fx ]Q(XJNw"IЂ6%kApwL=fT.PtAˠ /Іfs*-ֱeJm3Uų7w>FD읤*%^ۨslXt*I x[hKGBfԅ%eEE# |d3Qh hj8='5q'1huek ƺ{j]ͧgw,Mya)@J󿏹 ACLf/EHbe˜P ];>(3Ћzښe0;dbdObö).R}9$Ͽi'\¾ࣤC4UC<,+?M @hsLwzB~$AaUmkױ>wj+4o.3{u(|>q둵< A+Pf$*;8rFOIJt&+!K22aCCmUmsdb+L?vFhz', :!^` 5 *_{_ "=bOn& B֙Ms'$nQC㺢+=ԩ36C5aTp4"%ߜQ5_ u4(|m:&j,mOtm9z]ןRIs;wԭw"!zHJ3}2W !4P4Ͱ_PB&$PƜ-v^ /ܴsu &~b_GU");iE50ʊth"f_T;8 n4C#|%ecZU>-KwyBU9U|MӷyH<2v*U5a>pzԊ@ŦI1*!)>MԳ43 ǟu|OXXbUeDzB%X[7jCUڨjh bo8s_L75l endstream endobj 260 0 obj <> stream xڅ Tg'$ VXZu&ڪ]K)>Z TQh}"@"D^Bx *G@`$բhQQVXUkvgq=kMq !%˃.7/ @X!>*Xm5J$p;*d q%7^^̸ *Rc*6QqC&nn"$Q3A1784RDnTDŽ)NҊ7*R(BQ@eb钙(/Xpɟ&O=*utp;'1BO`;CZ%@+ !'^%^#on܇,M![b!XBKeD` !#ƉdovvK=_ʲewɱdC*՟Ge{_c#:գ:%gxO)g0P5A :J*[_F6"CU64cVj%|7ڲDF I9·qJdRJvVl/ovRṞJ%R,&NX5*/UZ*mGiC{Jg<Rb " X, E {DWۤ|.[TcLlA+[ [4Z@&NJꡜŹdy=X,.x|,km ,r!(g78p"rt8rzH6ᘟ*)I4nVWn2M{N'  GB:=dK({tX/|b_E_t˗-qfsȹηb$C3UmUmG|C8`ҴxKR%EEs!8޽;~* \'ӫWe~Szg۸7nt,EN21ˈ8~+lŖN/+#@ǤBּMY9:+"#paեd PMPkbqC[35+ Jpbg ~h' ב}"]p.X1Jo3P JO`i%RXa"%3MD׎sm~i)ڹus%b]2u(T:ܜ{EMk73&/1CP%@@V!,W Y:}p\ZHQr[ƚ/!6m0hgF@`N7+(9 cVAH3>F03zn~'A$&} P(}tikog.8=ws~N#J$仓 ʥBƦĵ2')yz21Iu@M;߀¬3Ipz0"݁Nu,zx횕HZhB'0ppGo0~N %ds9 53w rI_ϓ!y`7fnh2:LUJj]f Qر؊]oVplo):G~Z T卬tt``]eoֽbkKx^uC 22v(k;c' q=ܒ *)QP`MJXI?n{*Efl7B P_`^+YطeoX͸CSnde|ye ]DX{ԩ#^E@yo>Y8:MAo$gx4. 
snY |] 閧VI%PnE`H>x{not}q6»7ZiRxHxߍ{%J_wE ҢIy]b۰tZM ݅*UhWŽi(v!c k,r-K]Ўsa1G"=3K~ Dcd޶ɻ rrC*Ca8yOga“;2:Dڠd :˜%1@)Az8Y)0EP_d>AD?*/.8jK-W'1ANh>/dCptd j6G)Ƌ -˵3[mudmQ}0ђ\OG(Þ= ~Y#v{wIh*3|{L-u^r>G#{Z?B̭Ki5)_JؚMGx>?sp@iʫöu %Fe i>VlK9Pw5OXމs9lcy-Hb)d\̂: ak9j1]ٕ!H,1e6EEPAvQkޑ k~!GTܸI6-44@jY:qm+sZ;kB7>еax[Q!TRκ"޷H,Pԯ?89rMgnTRb/2L"iQ_dD endstream endobj 262 0 obj <> stream xcd`aa`dduv M34,ILa!C<^Iy&Ú',AD@1PQ@44$83/='UH9(3=DXWHZ*$U*8)x%&ggg*$(x)34R3sBR#B]܃C5н&Ũ$Y|hj_e?X +|Rs^Ԫn@ؕ^{D;{Kod3OYz{DI3gr.<{qlr\,!> stream xڵWixTU"hjoP(҄( a'MȞdRIj_r[PD ,8QgV~\NœiTary{)S(HĮIٜ&%%]J/?s񢤍ٹK#;~~ ?( 1aNN~Z}|^,|7}D[~Yٙd~?+>E{CʼnDwgTٙ.ZĊbEi~n,qWRYNP$._:=solo~bzaV) ɏ 3$$nNKܲ配Wn\e umQiA>uOgDVRT.Ώ)fRwSQs|*$4ZA=O@^P Tz QSWD91q1ū',]{-wףBԢWy'M?twE3>)b~7xkJWEhīՂWVVoo ?D#{-%}N)k1X $l+;Ay%3dU,E]~x_^ tOd&=fã4g3Iч߈擒²ֲ.vr~mFVhUrt8K*/-*n.c{=f%EuRFfih*;#cEw5Σa `y#ߧ0&4 rq:erxvħ֎2fWA5ZįyEPI&7=B' 3M>lenhs6idU@KlJ-5u+pHxl>˷7[IqhK𭆂t!{D> 7˫xWy847=PAxD?2듁ȡ88}%O<ŏĽp$wsyX@oj x2hC4-^faTDKCKr2P8/:\tx(b '`I+2 gƵhcEh'(S$b)X)Rȇ:bmf&>d\UCDbi&5[X*ƗNn]ϖ@!_ @'k<anB]S#MԬ [G|Fa3a *"*jiJN7#|pDsAaZ'LV%r!n-fjKЛۗF{;Z +&:JwH}̷^̓h)@l:hOFJ$HդgP )Bz1pj=hep.Mє=ugu }B 0?8^8u :i~%;ʪMXt> M#u+mfՄI!;]lnЧ Jtu Ѷ_5|beS,(~$+\3j{ǝ> =vglj96fR*a")Osj/rGF>_#f"w=Q(Ҽ+?$I.SLT&,ח4qF1A˂Aox_ej!^p22if3IB<8Y"}qWnҎ4ASg36v; ^`ߥȎ:eׄP#!+JFĽQ<@ co9}4)jJeu tYuΡ8-~=+4oCjApI>j`a4|uX;)]V4ء114+d%ŕuU--ݬg7(HצT4x[8hsֵ15#59wh[R6I{js4tT*j1]G[dٟw\' Ŧ "2Y@^WG3ԁ@I6wzju7q Jk9ڪL;6n#Ԁ"i4Mť#Kɶu뉻ا[.X\Jo`ίZimi=vȡ$fIw2,`. 
t+4-{-]K2k ?犇F^!zg[?oհ Pkj4$:BCm}g<7軱s:0thOTkX3" .b/h0(9''GonyaZrAQZVR΂ai%K <5M6p: 塖R-6(vcZ ИJ>>8 s7{n#89x|%p4i},>A^CޡDA7yNPb(0Gn9dzdHoZ :ngQMl=-vi]M/z+}6GaRK>>>͆c4WG66%58 zM (VUz2Ѹ6TӸ*{qJĆ֫`tN}KuQ  N(~+~OMck[۬dw2u2u endstream endobj 266 0 obj <> stream xuT{LSgzƜIMYNm36| -h(RPr}@TDD*4.YXq7]L5dp~ +EgLv8|ws.MedP4MKYZ~ƍJcvsJ1,\0o[UMFO|(̠0K$#`UrE?G3ߢ()[zʖL%ZNLiT.MOlmᱛwjVm4kjFNk( ]zR|^ʨ65FR_YT`"DLxB2+>)PPV`Eq&Պ9:BUUT+tՊU;lۮXmss 7J Q,DYBj)3)1I0; rt'橩"2Uz}YDV|\ƛƌ[̵L3:#o ei"rBVyh55) QàVk0hhtp0ʦQ)ty[F2DL(Нll+8v;2UBn9nCݩx7ǹȷAx>B)@R.?䇻!yCC1ZȌUix )_NYGfrzP!|%LN4FQ@8Q4R$^$8ų`ҰΆS[.UX|KP͂GhNMS1w g)\Չot#7tg(g.'+UV$v_ܾw_5|ekXsdXrS/v9z,,Ú,dkL%%;y2yDGFFHn?(OHNM6Ԑ\ ~=¦:㔉"2<;˜ɜ` ءa|:anp1=vA7u Y Y>U +,6hudTp{h|=55.pO^y9#\(,W[banB\g W$~Π$pPY ^w``do^_ ~& Vv6,.b ̃uˁ%m8?;.4lVP[}`@u5 ωJYz0]F78M"k?WHokLNm8[ A`B zhXlp:؂d1p$r /Lm{ۯ-L^CRjA%CHu 8N^TڭYQQrgp`&%+)4pWv-6M!'ƨo(BP9h1&VJ"GvŽXp_M~"UCIGX+M^̎/4Ccj#,*JK$/) 2p/>m@-.;+7rsbٱ?ო~27wzá?}_JiH endstream endobj 268 0 obj <> stream xڍV TT>39*rski,G"PD@^S@3W<ŁAp@DASܨ-YW6I[[kٳo%%H\ׅ HL1= *:EO'PDIE^&| r O(XuM,1vCc3H59A1Q3=y*wՊĴ𭑪~UiejJV樘pUUPT*8pI@ji5 jYJMQҙ/DB0*b)T#I҉[5I9Rc)g8&S9+ԫrGRJB9PITeV2(}]9ٷ35.UvuzLbGq~Qӯ=RAg/ٻ笨zl }DWx:7p=/ZܐM sOoT[w=p)D4O@=ӯ:&^|%`Q"xt@>VW H(rsj̆`.,9?>7WTSZ,GpE Br4zօ` xЁޜqo98 k[x v޹ c J V!\gS嗊z;PoDwؓ$(!-UQ4H'ON43<)!5jE-ty'QbpH_Cd&tQ W |vgp a,*|ip.5}K_[ETFִ w=-|Co3Ϸ )dHdI}!猳Ӟv^9|x!f`( [)eG?!\*vrg11˳΅3灹}t< ֽ䇃pVK3"`})d?h6Y %K3\R]g(E3h֛KU+:SpЩcl ~ݦ:҉#4!=kh=wmɽĦT|~jH>;,1LFS\(lʌaЛ44FzTWϳ#( \ uiӖJ+А'CSr /tpe`K+w N5fAu 7;y+ڣQɉsfRRC 41GPܯpW6A+*GVـLFt"*jЬV{* Edfh/ /rv"/jWq5x4kHmBe[TJG+5}Аɐ+7R]ph*bs`lu|2BYC.e%jSII .pi,T와i4Cʱ:8nKGQ8 fw65&ZvoNX[.Vv֯< h qEJ8&p8=ݡݰ<~GcӞF [Ìں+E!% b5o[ W ;] S/ kq؍7#мRʺzVЀ|,k(W&zQʊ/f'GOnO8pA- ⱘyyNE4r:df~9rUE)z0?U#$m#H{Y+q6zyQa^9IQ.Xc Q 85B\Y r_rcBlZf;I7Y'dl;*'˴p><Ɏiyz~)Lv~LKkqF>eg~pluqky[ޕIoؔ<ٯ Oc1rVl>;GK?!ln?eŖ;^_lOB?}tU8V٢n@M H!t@t\ȝf?ɾǂɠ3$&!˲Zb/b2/[O'OszV 慠k,/\V 癶ΛBIm̭PbuP Qs('Hմ,݉RZ >f؋eb&$&b!7ϥUEe'g؅\ fc~71|HkNj.ҊEZW#DGߓK ]3:49<{t3x<^hwmj;IHv*j] f<y?t{o`gmIK&ZN#J}NȄ̜~ ]>ȭ (uڷ [m1K 
m&WB.XB׬֧/%&׹RjKs뇟_R΄f`X%ÆzWm,(A#G(ȾQh2L&CGGkPj*k endstream endobj 270 0 obj <> stream xڍ{pSUo6BAQ([tET^0 ֶiy&mMi74I>6I[Z[ZV⢻2;sw=)I+&3L{|N9[V)Vpn^TRZQVl=TԀU{lArNZ&tFaɍ4;kʘC3Sqm8ߞqVy!vVL.Z䱧d}ُ 쒺57JKejEyYrwEyò={e{7ٖME7o)(zpEgepJb`-!YB*(K$v:q;q/qq?0XL!"69D.O3N"tӞOZ(O2fSMɞ2HrESg7E"@BdRp@QUQURwhlYòV+XH vMgHP%_WsA-Hĭ:#ޢ2ʬd};%3/TJFTU‹b$|[D&K)K'&e=.w:4'5].!N^H 6c*3.9)R9е؃ VW5/G?cef5l%h`?h85pN3bB^ bR;!0z@RfKY8m+s%6O?7="X|e@. ̈=`[`&#ƈ snZ鄨?D5\i%MZzv B7@ְkdr0jE~ J{cGlv|}|^3ډEޖѳofה3h-Iim-,HhtłnvwmH]Q%/s<}PP?S0$'>WR/ݎ['Up8+>+,u DZΜ:=r Ŧ2h Ux_bx+ +ٶSZzDMU 6;6enjtBuu/Ps`6FrhBH&^_rnGehgy..) w?m?}ѦyN$uZI|ɛ f7;ZЫd퍹ZуTPݿ<>g{A%oOYrmݕ.sF/PWNGuG@ "zjfYV7k*dMKL<-99gs`V֠1Ⲧ=t endstream endobj 272 0 obj <> stream xm_HSqzjsdPؽ?3IW[naAijfD/wvmWe|-SC(*ҋ}%T(0.!Z|w9,ub4Q@LzY^sp3 &TMqA_aǧPDg*IWX@m<-ON˱ګJ(G@J"a<֚1\b_F8Aټw⣯]y#|s;LuK˟ν>Sb×7jd$0]ѶNMY)YGȄ RF 8%GOjʓ:48[}-k{}Xt_v8%֧{Y>ܶrrH2yy,9:<<4qF6q} vS endstream endobj 274 0 obj <> stream xڅT{lTU?3sd%3 EDPYX#4ȣB٬e3Cig>鴝ͫi˴TBDJ MAU&.93f_}|ichÆ_\ys -NKVpvK>ɞڴ.CILѧ4t!jIOEߞ⹇yD{=zal1@~ϒe&ݹG pHyqYi}N=$G$ʤ_ >Qz VDKP`8AH޷ߞ> ABP/t:.SwZ jTD'95 BxtFlRCߐXhJ*(u)?tn4 ߩTz!RĆbꁍװ};f}119Q9{Z,f%&I7=M4[zο^[Y%fְϫU6*`nJC  n &^^#ҎN|8.%% Fbg7@;.OJ w+ML0n8# ߙ9 wԼ~$x3 MJ7' *!%_fvad-3J@qlƒr=2M Ȃj$sw^/ eY@oJMLOP:zTq$0u=;8Ei4tM@H,uZk L2$NOyvt3l):ܠEcs MT*&HgH0q+~t!qUȚN?xNJ|.rmdMY9!stFrE!:1?~ cGk8~yrhINYp@7=2'[UnU!+E-'#=.wvz;!)V6K0#[=|M 0FPGF!‡fk}pߌvuԘ?f+Jڿ/ao Rrj \m-cN4c.d;9ʼ`40'C(o#Ni6 endstream endobj 276 0 obj <> stream x]]HSa߳t_ )NWV:0БMEB3,3{F$1EtR恊P/>>L"I I޷`M7???? d2dS8RVai uid" Id8BߢC5sc'MR c0 ͍v[M0SR̠+=oc3uv"Gr؋0Y «@.-|>Ձ~_mz>Ϝm)# endstream endobj 278 0 obj <> stream xcd`aa`dds v v54TC Yr?Yd { TX9j *23J45 --u ,sS2|K2RsKԒJ +}rbt;M̒ ԢT kiNj1A@g30110]gZ}?˺D׳}]3zqʴI}݋9˱͝=W{r뤦޴QgWL<ss|7slݕU')=%rv\FͩnjG{Ţ7޽@-Mrպr#T*8~~]н䃭~T6-˻peڽy.w6?ٻ &Wr|ZtROY {g*_~'Mg̵{ b1 endstream endobj 280 0 obj <> stream xڅT{LSW 耬FiLs8Dp {ۃ`X a1E@ȷ<+BE54C< 5POw*s PIh7پBEfNMMjਮ^<]lH̾ux0}nk~]5Zi$ 2*? ?>tmsk5Twc)2uTg ? 
[cd׋SHwodtDUgh2c0}]͵-콇BosK$(F8oe?xT\2b D@İJ8–:JE6rL.zF8)Ϣ8ذVmɔzƋoap@E(ae+Y\-PWAeA G>}:qU]ie6[;M(i.8Z"pB [nl]- vvh.y}} ڊ:Jz w߀1fjc>ZB*4c,e& aM0CGiT ӑPXmcՊˠjq8KDנcGccn|$y܁@%eR2 ZN0Z:.AŎ:C~{y E&9tڀ6tWU鋠PaGjɛ(0h<ي"Ö\6 yNURqV9!K(]$Y.ܪӺa?_JH2)Z,nCYX\&Iμ6 # @J/Q FBJ04k(Qv?-oEя *Aj&h^[֬#6_;nmruKz*i4Y$o O o| Wјȸh^2^Ҝ7|bkzLZɊiLse֪((UJ;QJD^.!{a49x~'+.fw;*mou H?TE2K~ tl4qZF=Mk R<~OZ2 endstream endobj 282 0 obj <> stream xcd`aa`dd p M34 JM/I,If!Cy?Nɰv*RD>@`b`edd-1300q/,L(Q0200 I Nz ^ٙ y) ^zz ~@L<ԌĜ44`נ` Ѐ`M= rsa:E̜e}+W:ccecs֭3gn.<{޴ XBy8'pupgRoo %[&N0Wp endstream endobj 284 0 obj <> stream x]Pn0[^@TR E w`X:-c >ؠ+Rj_3YtL1hH€!VVG3|H)-GRwvmW8.:G`gcX݀Oi4CnM'-Ru+w5iT$MDQ ڶfH߮è~e`<~ɸ"1$j aӐ_%!-|Bo= endstream endobj 296 0 obj <> stream x]Pn0 ӡ]@|,ю$yR HGc L'85bِ„!V7JGVҳKZ Imoq~T/Gh|8[Lh}XLaӇvd;h 8] -Rڶ(1z0HZ [}2$v0Yd`y;9C=Ҟ¾/rC|w>(l endstream endobj 304 0 obj <> stream x]P=o0 +<^*[" E8Q$n`K~~3o'00lhQ嬬7IZo&^/]yvؖɁ 68k7K¾`hӵ22"E(X]q"4.^* fd(j]W3$w(Id`3=g&Ur4QZn_'U?nz endstream endobj 307 0 obj <> stream x]Pn0+*^ M$ qC%c/R1ؠz5;&sj֢^(nqԓeB$.ۦp3I OjDPv8:ts;ez EAotv݉_> stream x]j0>YNEE/ԙ L.kEw[j΍ogxzL#t8H BrE$>d !EJ1& _.E[ܜy( Yp8 Sо@'kFB!K2#F-0)]omYFra„:9KȧodSプۤIpa[m.I筱*?P~ endstream endobj 259 0 obj <> stream x͚[o8W~aX N6Mm[BID;+'I9M°%9p3,gI+tGŤ8f1Oe60!|@#[+& Ggp:DŽ١գi&5In5A*ki.#)8A$)gJBLjA TP2yq5B+ L[bY$ZM}@ )`1r32̒sfhϬYΠ ÑT0At Ή̅ (kRY5jQP:`Pʂ2RAC\ QQ(45,F21c n%xA$%2/1fU7C"ߌ]Ŋ|yŒռ٦,Mzhou8Z.?pb$(w>K ; ] ը:@B9JЎ5W^ J-|-tAJB#y4b^cR9ړ+V}9:8)Hg+h:_#8_/JZ{/x=ET.dQJ'en de^yɰunR喋3X1*.E=]O.EY.&%"_.YPhpa8,Dcmk RFp 8,$cnBsY}-%G0x  n8}hђmӊէ go%؅&5Z'_nQf: #t+Nf/٩Uk`+A\7m}=;*HR@ٞ.nq=0CWeTDK՘~plzVhQET/hscHa+Ó\OYq_&V:4"$PY I:i!ILUdLee/}ma72MSG JY G@,ssx~xXp\37K>]-MH>[$SB Zĩ_ɻǽ gx+n!QHn*QA8+&Kvz; I|O,tboxVQ-y"rݳ/GugNf{FJrm(;啭{`Zz;<<0LAXUeF+,sUbǧL?fR4yzt-iԚ^;2[]fQM0/خve'￞oOH/W•u'O2ڴ[4_^,/{?? 
FG/ClR%PgBq$r,[?$q@k*j>:Vn??e]j"O , 8/d+!k2-Ѫ ˪0bpEp{ù8{?dO}Xl-]JtWU_+VedBwʄ|%41]@&yLWѿO,ly+]{2w&INRڃMv߷tq4%,l8| [w-3>z0jrcMG$8K0aṿt4K3k 6҇OLi,!ߤ4%䞦hL8ܦWh<(#Y^%Ruzq:=.$,G[_5Y% _<$㔄Log(b—j+շtC6{`V͊wfG"<0a365dce0e88b14c750c0bbef0b8efba>]/Size 321/W[1 3 2]/Filter/FlateDecode/Length 725>> stream x5UWUQ5`b+"&*`wwc6v](v bw<9g0gs̳9q63',dϱLV kcz`=zacVab<6jUrE翦VXka$( v1Xؐ6#bc# vM)6cm rslAؚ>dKv-&c+ےUm0ha{NB):ExS 3d|> / w%"wHfbO &ӱ7|_ױfbYq!8M!H,)f8F688whNI_5dL{L8 v3Kp68c.HV86܀p=.<܈qnb>j\kp- W*܍px ,B<0xOI|4b<^kx˱S1/RVkٓٷoe?x~'{)bRtR ϟ=R=R<S _>S+ s(siksi դSCҹ|ե<א.\SYTRHh18&`S̔Js>1[bpv_]pt;av Nl,'cOL>!U1 E endstream endobj startxref 112737 %%EOF fastcluster/inst/doc/fastcluster.Rtex0000644000176000001440000011630512147514412017575 0ustar ripleyusers\def\fastclusterversion{1.1.11} \documentclass[fontsize=10pt,paper=letter,BCOR=-6mm]{scrartcl} \usepackage[utf8]{inputenc} \usepackage{lmodern} \normalfont \usepackage[T1]{fontenc} \usepackage{textcomp} \newcommand*\q{\textquotesingle} \usepackage{amsmath} \usepackage{amsfonts} \usepackage{xcolor} \usepackage{ifpdf} \ifpdf \newcommand*\driver{} \else \newcommand*\driver{dvipdfmx} \fi \usepackage[% pdftitle={fastcluster manual}, pdfauthor={Daniel Müllner}, % pdfsubject={}, pdfdisplaydoctitle=true, % pdfduplex=DuplexFlipLongEdge, pdfstartview=FitH, colorlinks=True, pdfhighlight=/I, % pdfborder={0 0 1}, % linkbordercolor={1 .8 .8}, % citebordercolor={.5 .9 .5}, % urlbordercolor={.5 .7 1}, % linkcolor={blue}, % citecolor={blue}, urlcolor={blue!80!black}, linkcolor={red!80!black}, % runcolor={blue}, % filecolor={blue}, pdfpagemode=UseOutlines, bookmarksopen=true, bookmarksopenlevel=1, bookmarksdepth=2, breaklinks=true, unicode=true, \driver ]{hyperref} % Optimize the PDF targets and make the PDF file smaller \ifpdf\RequirePackage{hypdestopt}\fi \renewcommand*\sectionautorefname{Section} \usepackage{typearea} \DeclareMathOperator\size{size} \DeclareMathOperator\Var{Var} 
\newcommand*\linkage{\href{http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html}{\texttt{linkage}}} \newcommand*\hierarchy{\href{http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html}{\texttt{scipy.\hskip0pt cluster.\hskip0pt hierarchy}}} \newcommand*\hclust{\href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/hclust.html}{\texttt{hclust}}} \newcommand*\stats{\href{http://stat.ethz.ch/R-manual/R-devel/library/stats/html/00Index.html}{\texttt{stats}}} \newcommand*\flashClustPack{\href{http://cran.r-project.org/web/packages/flashClust/index.html}{\texttt{flashClust}}} \newcommand*\dist{\href{http://stat.ethz.ch/R-manual/R-devel/library/stats/html/dist.html}{\texttt{dist}}} \newcommand*\print{\href{http://stat.ethz.ch/R-manual/R-patched/library/base/html/print.html}{\texttt{print}}} \newcommand*\plot{\href{http://stat.ethz.ch/R-manual/R-patched/library/graphics/html/plot.html}{\texttt{plot}}} \newcommand*\identify{\href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/identify.hclust.html}{\texttt{identify}}} \newcommand*\rect{\href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/rect.hclust.html}{\texttt{rect.hclust}}} \newcommand*\NA{\href{http://stat.ethz.ch/R-manual/R-devel/library/base/html/NA.html}{\texttt{NA}}} %\usepackage{showframe} \makeatletter \newenvironment{methods}{% \list{}{\labelwidth\z@ \itemindent-\leftmargin \let\makelabel\methodslabel}% }{% \endlist } \newcommand*{\methodslabel}[1]{% %\hspace{\labelsep}% \hbox to \textwidth{\hspace{\labelsep}% \normalfont\bfseries\ttfamily #1\hskip-\labelsep\hfill}% } \makeatother \setkomafont{descriptionlabel}{\normalfont\ttfamily\bfseries} \begin{document} %\VignetteIndexEntry{User's manual} \title{The \textit{fastcluster} package: User's manual} \author{\href{http://math.stanford.edu/~muellner}{Daniel Müllner}} \date{May 23, 2013} \subtitle{Version \fastclusterversion} \maketitle \makeatletter 
\renewenvironment{quotation}{% \list{}{\listparindent 1em% \itemindent \listparindent \leftmargin2.5em \rightmargin \leftmargin \parsep \z@ \@plus\p@ }% \item\relax }{% \endlist } \makeatother \begin{abstract}\noindent\small The fastcluster package is a C++ library for hierarchical, agglomerative clustering. It efficiently implements the seven most widely used clustering schemes: single, complete, average, weighted/mcquitty, Ward, centroid and median linkage. The library currently has interfaces to two languages: R and Python/SciPy. Part of the functionality is designed as drop-in replacement for existing routines: \linkage{} in the SciPy package \hierarchy{}, \hclust{} in R's \stats{} package, and the \flashClustPack{} package. Once the fastcluster library is loaded at the beginning of the code, every program that uses hierarchical clustering can benefit immediately and effortlessly from the performance gain. Moreover, there are memory-saving routines for clustering of vector data, which go beyond what the existing packages provide. \end{abstract} \noindent This document describes the usage for the two interfaces for R and Python and is meant as the reference document for the end user. Installation instructions are given in the file INSTALL in the source distribution and are not repeated here. The sections about the two interfaces are independent and in consequence somewhat redundant, so that users who need a reference for one interface need to consult only one section. If you use the fastcluster package for scientific work, please cite it as: \begin{quote} Daniel Müllner, \textit{fastcluster: Fast Hierarchical, Agglomerative Clustering Routines for R and Python}, Journal of Statistical Software, \textbf{53} (2013), no.~9, 1--18, \url{http://www.jstatsoft.org/v53/i09/}. \end{quote} \textbf{The fastcluster package is considered stable and will undergo few changes from now on. 
If some years from now there have not been any updates, this does not necessarily mean that the package is unmaintained but maybe it just was not necessary to correct anything. Of course, please still report potential bugs and incompatibilities to \texttt{muellner@math.stanford.edu}.} \tableofcontents \section{The R interface} Load the package with the following command: \begin{quote} \texttt{library(\q fastcluster\q)} \end{quote} The package overwrites the function \hclust{} from the \stats{} package (in the same way as the \flashClustPack{} package does). Please remove any references to the \flashClustPack{} package in your R files to not accidentally overwrite the \hclust{} function with the \flashClustPack{} version. The \hyperref[hclust]{new \texttt{hclust} function} has exactly the same calling conventions as the old one. You may just load the package and immediately and effortlessly enjoy the performance improvements. The function is also an improvement to the \texttt{flashClust} function from the \flashClustPack{} package. Just replace every call to \texttt{flashClust} by \hyperref[hclust]{\texttt{hclust}} and expect your code to work as before, only faster.\footnote{If you are using flashClust prior to version 1.01, update it! See the change log for \flashClustPack{} at \url{http://cran.r-project.org/web/packages/flashClust/ChangeLog}.} In case the data includes infinite or NaN values, see \autoref{sec:infnan}. If you need to access the old function or make sure that the right function is called, specify the package as follows: \begin{quote} \texttt{\hyperref[hclust]{fastcluster::hclust}(…)}\\ \texttt{flashClust::hclust(…)}\\ \texttt{stats::hclust(…)} \end{quote} Vector data can be clustered with a memory-saving algorithm with the command: \begin{quote} \texttt{\hyperref[hclust.vector]{hclust.vector}(…)} \end{quote} The following sections contain comprehensive descriptions of these methods. 
\begin{methods} \item [\normalfont\texttt{\textbf{hclust}}\,(\textit{d, method=\q complete\q, members=NULL})] \phantomsection\label{hclust} \addcontentsline{toc}{subsection}{\texttt{hclust}} Hierarchical, agglomerative clustering on a condensed dissimilarity matrix. This method has the same specifications as the method \hclust{} in the package \stats{} and \texttt{hclust} alias \texttt{flashClust} in the package \flashClustPack{}. In particular, the \print{}, \plot{}, \rect{} and \identify{} methods work as expected. The argument $d$ is a condensed distance matrix, as it is produced by \dist. The argument \textit{method} is one of the strings \textit{\q single\q}, \textit{\q complete\q}, \textit{\q average\q}, \textit{\q mcquitty\q}, \textit{\q centroid\q}, \textit{\q median\q}, \textit{\q ward\q}, or an unambiguous abbreviation thereof. The argument \textit{members} specifies the sizes of the initial nodes, ie.\ the number of observations in the initial clusters. The default value \texttt{NULL} says that all initial nodes are singletons, ie.\ have size 1. Otherwise, \textit{members} must be a vector whose size is the number of input points. The vector is processed as a \href{http://stat.ethz.ch/R-manual/R-patched/library/base/html/double.html}{\texttt{double}} array so that not only integer cardinalities of nodes can be accounted for but also weighted nodes with real weights. The general scheme of the agglomerative clustering procedure is as follows: \begin{enumerate} \item Start with $N$ singleton clusters (nodes) labeled $-1,\ldots, -N$, which represent the input points. \item Find a pair of nodes with minimal distance among all pairwise distances. \item Join the two nodes into a new node and remove the two old nodes. The new nodes are labeled consecutively $1,2,\ldots$ \item The distances from the new node to all other nodes are determined by the \textit{method} parameter (see below). 
\item Repeat $N-1$ times from step 2, until there is one big node, which contains all original input points. \end{enumerate} The output of \texttt{hclust} is an object of class \texttt{\q hclust\q} and represents a \emph{stepwise dendrogram}. It contains the following fields: \begin{description} \item[\normalfont\textit{merge}] This is an $(N-1)\times 2$ array. Row $i$ specifies the labels of the nodes which are joined in step $i$ of the clustering. \item[\normalfont\textit{height}] This is a vector of length $N-1$. It contains the sequence of dissimilarities at which every pair of nearest nodes is joined. \item[\normalfont\textit{order}] This is a vector of length $N$. It contains a permutation of the numbers $1,\ldots N$ for the \plot{} method. When the dendrogram is plotted, this is the order in which the singleton nodes are plotted as the leaves of a rooted tree. The order is computed so that the dendrogram is plotted without intersections (except the case when there are inversions for the \textit{\q centroid\q} and \textit{\q median\q} methods). The choice of the \textit{\q order\q} sequence follows the same scheme as the \texttt{stats} package does, only with a faster algorithm. Note that there are many valid choices to order the nodes in a dendrogram without intersections. Also, subsequent points in the \textit{\q order\q} field are not always close in the ultrametric given by the dendrogram. \item[\normalfont\textit{labels}] This copies the attribute \textit{\q Labels\q} from the first input parameter $d$. It contains the labels for the objects being clustered. \item[\normalfont\textit{method}] The (unabbreviated) string for the \textit{\q method\q} parameter. See below for a specification of all available methods. \item[\normalfont\textit{call}] The full command that produced the result. See \href{http://stat.ethz.ch/R-manual/R-patched/library/base/html/match.call.html}{\texttt{match.call}}. 
\item[\normalfont\textit{dist.method}] This is the \textit{\q method\q} attribute of the first input parameter $d$. This specifies which metric was used in the \texttt{dist} method which generated the first argument. \end{description} The parameter \textit{method} specifies which clustering scheme to use. The clustering scheme determines the distance from a new node to the other nodes. Denote the dissimilarities by $d$, the nodes to be joined by $I,J$, the new node by $K$ and any other node by $L$. The symbol $|I|$ denotes the size of the cluster $I$. \begin{description} \item [\normalfont\textit{method=\q single\q}:] $\displaystyle d(K,L) = \min(d(I,L), d(J,L))$ The distance between two clusters $A,B$ is the closest distance between any two points in each cluster: \[ d(A,B)=\min_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q complete\q}:] $\displaystyle d(K,L) = \max(d(I,L), d(J,L))$ The distance between two clusters $A,B$ is the maximal distance between any two points in each cluster: \[ d(A,B)=\max_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q average\q}:] $\displaystyle d(K,L) = \frac{|I|\cdot d(I,L)+|J|\cdot d(J,L)}{|I|+|J|}$ The distance between two clusters $A,B$ is the average distance between the points in the two clusters: \[ d(A,B)=\frac1{|A||B|}\sum_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q mcquitty\q}:] $\displaystyle d(K,L) = \tfrac12(d(I,L)+d(J,L))$ There is no global description for the distance between clusters since the distance depends on the order of the merging steps. \end{description} The following three methods are intended for Euclidean data only, ie.\ when $X$ contains the pairwise \textbf{squared} distances between vectors in Euclidean space. The algorithm will work on any input, however, and it is up to the user to make sure that applying the methods makes sense. 
\begin{description} \item [\normalfont\textit{method=\q centroid\q}:] $\displaystyle d(K,L) = \frac{|I|\cdot d(I,L)+|J|\cdot d(J,L)}{|I|+|J|}-\frac{|I|\cdot|J|\cdot d(I,J)}{(|I|+|J|)^2}$ There is a geometric interpretation: $d(A,B)$ is the distance between the centroids (ie.\ barycenters) of the clusters in Euclidean space: \[ d(A,B) = \|\vec c_A-\vec c_B\|^2, \] where $\vec c_A$ denotes the centroid of the points in cluster $A$. \item [\normalfont\textit{method=\q median\q}:] $\displaystyle d(K,L) = \tfrac12 d(I,L)+\tfrac12 d(J,L)-\tfrac14 d(I,J)$ Define the midpoint $\vec w_K$ of a cluster $K$ iteratively as $\vec w_K=k$ if $K=\{k\}$ is a singleton and as the midpoint $\frac12(\vec w_I+\vec w_J)$ if $K$ is formed by joining $I$ and $J$. Then we have \[ d(A,B)=\|\vec w_A-\vec w_B\|^2 \] in Euclidean space for all nodes $A,B$. Notice however that this distance depends on the order of the merging steps. \item [\normalfont\textit{method=\q ward\q}:] $\displaystyle d(K,L) = \frac{(|I|+|L|)\cdot d(I,L)+(|J|+|L|)\cdot d(J,L)-|L|\cdot d(I,J)}{|I|+|J|+|L|}$ The global cluster dissimilarity can be expressed as \[ d(A,B)=\frac{2|A||B|}{|A|+|B|}\cdot\|\vec c_A-\vec c_B\|^2, \] where $\vec c_A$ again denotes the centroid of the points in cluster $A$. \end{description} \item [\normalfont\texttt{\textbf{hclust.vector}}\,(\textit{X, method=\q single\q, members=NULL, metric=\q euclidean\q, p=NULL})] \phantomsection\label{hclust.vector} \addcontentsline{toc}{subsection}{\texttt{hclust.vector}} This performs hierarchical, agglomerative clustering on vector data with memory-saving algorithms. While the \hyperref[hclust]{\texttt{hclust}} method requires $\Theta(N^2)$ memory for clustering of $N$ points, this method needs $\Theta(ND)$ for $N$ points in $\mathbb R^D$, which is usually much smaller. The argument $X$ must be a two-dimensional matrix with \href{http://stat.ethz.ch/R-manual/R-patched/library/base/html/double.html}{\texttt{double}} precision values. 
It describes $N$ data points in $\mathbb R^D$ as an $(N\times D)$ matrix. The parameter \textit{\q members\q} is the same as for \hyperref[hclust]{\texttt{hclust}}. The parameter \textit{\q method\q} is one of the strings \textit{\q single\q}, \textit{\q centroid\q}, \textit{\q median\q}, \textit{\q ward\q}, or an unambiguous abbreviation thereof. If \textit{method} is \textit{\q single\q}, single linkage clustering is performed on the data points with the metric which is specified by the \textit{metric} parameter. The choices are the same as in the \href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/dist.html}{\texttt{dist}} method: \textit{\q euclidean\q}, \textit{\q maximum\q}, \textit{\q manhattan\q}, \textit{\q canberra\q}, \textit{\q binary\q} and \textit{\q minkowski\q}. Any unambiguous substring can be given. The parameter \textit{p} is used for the \textit{\q minkowski\q} metric only.\pagebreak[2] The call \begin{quote} \texttt{hclust.vector(X, method=\q single\q, metric=[...])} \end{quote} is equivalent to \begin{quote} \texttt{hclust(dist(X, metric=[...]), method=\q single\q)} \end{quote} but uses less memory and is equally fast. Ties may be resolved differently, ie.\ if two pairs of nodes have equal, minimal dissimilarity values at some point, in the specific computer's representation for floating point numbers, either pair may be chosen for the next merging step in the dendrogram. If \textit{method} is one of \textit{\q centroid\q}, \textit{\q median\q}, \textit{\q ward\q}, clustering is performed with respect to Euclidean distances. In this case, the parameter \textit{metric} must be \textit{\q euclidean\q}. 
Notice that \texttt{hclust.vector} operates on Euclidean distances for compatibility reasons with the \dist{} method, while \hyperref[hclust]{\texttt{hclust}} assumes \textbf{squared} Euclidean distances for compatibility with the \href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/hclust.html}{\texttt{stats::hclust}} method! Hence, the call \phantomsection\label{squared} \begin{quote} \texttt{hc = hclust.vector(X, method=\q ward\q)} \end{quote} is, aside from the lesser memory requirements, equivalent to \begin{quote} \texttt{d = dist(X)}\\ \texttt{hc = hclust(d\textasciicircum 2, method=\q ward\q)}\\ \texttt{hc\$height = sqrt(hc\$height)} \end{quote} The same applies to the \textit{\q centroid\q} and \textit{\q median\q} methods. Differences may arise only from the resolution of ties (which may, however, in extreme cases affect the entire clustering result due to the inherently unstable nature of the clustering schemes). \end{methods} \section{The Python interface} The fastcluster package is imported as usual by: \begin{quote} \texttt{import fastcluster} \end{quote} It provides the following functions: \begin{quote} \hyperref[linkage]{\texttt{linkage}}\,(\textit{X, method=\q single\q, metric=\q euclidean\q, preserve\_input=True})\\ \hyperref[single]{\texttt{single}}\,($X$)\\ \hyperref[complete]{\texttt{complete}}\,($X$)\\ \hyperref[average]{\texttt{average}}\,($X$)\\ \hyperref[weighted]{\texttt{weighted}}\,($X$)\\ \hyperref[ward]{\texttt{ward}}\,($X$)\\ \hyperref[centroid]{\texttt{centroid}}\,($X$)\\ \hyperref[median]{\texttt{median}}\,($X$)\\ \hyperref[linkage_vector]{\texttt{linkage\_vector}}\,(\textit{X, method=\q single\q, metric=\q euclidean\q, extraarg=None}) \end{quote} The following sections contain comprehensive descriptions of these methods. 
\begin{methods} \item [\normalfont\texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q single\q, metric=\q euclidean\q, preserve\_input=\q True\q})] \phantomsection\label{linkage} \addcontentsline{toc}{subsection}{\texttt{linkage}} Hierarchical, agglomerative clustering on a condensed dissimilarity matrix or on vector data. Apart from the argument \textit{preserve\_input}, the method has the same input parameters and output format as the function of the same name in the module \hierarchy. The argument $X$ is preferably a \href{http://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.html}{NumPy array} with floating point entries (\texttt{X.dtype\hskip0pt==\hskip0pt numpy.double}). Any other data format will be converted before it is processed. NumPy's \href{http://docs.scipy.org/doc/numpy/reference/maskedarray.html}{masked arrays} are not treated as special, and the mask is simply ignored. If $X$ is a one-dimensional array, it is considered a condensed matrix of pairwise dissimilarities in the format which is returned by \href{http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html}{\texttt{scipy.spatial.distance.pdist}}. It contains the flattened, upper-triangular part of a pairwise dissimilarity matrix. That is, if there are $N$ data points and the matrix $d$ contains the dissimilarity between the $i$-th and $j$-th observation at position $d_{i,j}$, the vector $X$ has length $\binom N2$ and is ordered as follows: \[ d = \begin{pmatrix} 0&d_{0,1}&d_{0,2}&\ldots&d_{0,n-1}\\ & 0&d_{1,2} & \ldots\\ &&0&\ldots\\ &&&\ddots\\ &&&&0 \end{pmatrix} = \begin{pmatrix} 0&X[0] &X[1]&\ldots&X[n-2]\\ & 0&X[n-1] & \ldots\\ &&0&\ldots\\ &&&\ddots\\ &&&&0 \end{pmatrix} \] The \textit{metric} argument is ignored in case of dissimilarity input. The optional argument \textit{preserve\_input} specifies whether the method makes a working copy of the dissimilarity vector or writes temporary data into the existing array. 
If the dissimilarities are generated for the clustering step only and are not needed afterward, approximately half the memory can be saved by specifying \textit{preserve\_input=False}. Note that the input array $X$ contains unspecified values after this procedure. It is therefore safer to write \begin{verbatim} linkage(X, method="...", preserve_input=False) del X \end{verbatim} to make sure that the matrix $X$ is not accessed accidentally after it has been used as scratch memory. (The single linkage algorithm does not write to the distance matrix or its copy anyway, so the \textit{preserve\_input} flag has no effect in this case.) If $X$ contains vector data, it must be a two-dimensional array with $N$ observations in $D$ dimensions as an $(N\times D)$ array. The \textit{preserve\_input} argument is ignored in this case. The specified \textit{metric} is used to generate pairwise distances from the input. The following two function calls yield equivalent output: \begin{verbatim} linkage(pdist(X, metric), method="...", preserve_input=False) linkage(X, metric=metric, method="...") \end{verbatim} The two results are identical in most cases, but differences occur if ties are resolved differently: if the minimum in step 2 below is attained for more than one pair of nodes, either pair may be chosen. It is not guaranteed that both \texttt{linkage} variants choose the same pair in this case. The general scheme of the agglomerative clustering procedure is as follows: \begin{enumerate} \item Start with $N$ singleton clusters (nodes) labeled $0,\ldots, N-1$, which represent the input points. \item Find a pair of nodes with minimal distance among all pairwise distances. \item Join the two nodes into a new node and remove the two old nodes. The new nodes are labeled consecutively $N,N+1,\ldots$ \item The distances from the new node to all other nodes are determined by the \textit{method} parameter (see below). 
\item Repeat $N-1$ times from step 2, until there is one big node, which contains all original input points. \end{enumerate} The output of \texttt{linkage} is a \emph{stepwise dendrogram}, which is represented as an $(N-1)\times 4$ \href{http://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.html}{NumPy array} with floating point entries (\texttt{dtype=numpy.double}). The first two columns contain the node indices which are joined in each step. The input nodes are labeled $0,\ldots,N-1$, and the newly generated nodes have the labels $N,\ldots, 2N-2$. The third column contains the distance between the two nodes at each step, ie.\ the current minimal distance at the time of the merge. The fourth column counts the number of points which comprise each new node. The parameter \textit{method} specifies which clustering scheme to use. The clustering scheme determines the distance from a new node to the other nodes. Denote the dissimilarities by $d$, the nodes to be joined by $I,J$, the new node by $K$ and any other node by $L$. The symbol $|I|$ denotes the size of the cluster $I$. 
\begin{description} \item [\normalfont\textit{method=\q single\q}:] $\displaystyle d(K,L) = \min(d(I,L), d(J,L))$ The distance between two clusters $A,B$ is the closest distance between any two points in each cluster: \[ d(A,B)=\min_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q complete\q}:] $\displaystyle d(K,L) = \max(d(I,L), d(J,L))$ The distance between two clusters $A,B$ is the maximal distance between any two points in each cluster: \[ d(A,B)=\max_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q average\q}:] $\displaystyle d(K,L) = \frac{|I|\cdot d(I,L)+|J|\cdot d(J,L)}{|I|+|J|}$ The distance between two clusters $A,B$ is the average distance between the points in the two clusters: \[ d(A,B)=\frac1{|A||B|}\sum_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q weighted\q}:] $\displaystyle d(K,L) = \tfrac12(d(I,L)+d(J,L))$ There is no global description for the distance between clusters since the distance depends on the order of the merging steps. \end{description} The following three methods are intended for Euclidean data only, ie.\ when $X$ contains the pairwise (non-squared!)\ distances between vectors in Euclidean space. The algorithm will work on any input, however, and it is up to the user to make sure that applying the methods makes sense. \begin{description} \item [\normalfont\textit{method=\q centroid\q}:] $\displaystyle d(K,L) = \sqrt{\frac{|I|\cdot d(I,L)+|J|\cdot d(J,L)}{|I|+|J|}-\frac{|I|\cdot|J|\cdot d(I,J)}{(|I|+|J|)^2}}$ There is a geometric interpretation: $d(A,B)$ is the distance between the centroids (ie.\ barycenters) of the clusters in Euclidean space: \[ d(A,B) = \|\vec c_A-\vec c_B\|, \] where $\vec c_A$ denotes the centroid of the points in cluster $A$. 
\item [\normalfont\textit{method=\q median\q}:] $\displaystyle d(K,L) = \sqrt{\tfrac12 d(I,L)+\tfrac12 d(J,L)-\tfrac14 d(I,J)}$ Define the midpoint $\vec w_K$ of a cluster $K$ iteratively as $\vec w_K=k$ if $K=\{k\}$ is a singleton and as the midpoint $\frac12(\vec w_I+\vec w_J)$ if $K$ is formed by joining $I$ and $J$. Then we have \[ d(A,B)=\|\vec w_A-\vec w_B\| \] in Euclidean space for all nodes $A,B$. Notice however that this distance depends on the order of the merging steps. \item [\normalfont\textit{method=\q ward\q}:] $\displaystyle d(K,L) = \sqrt{\frac{(|I|+|L|)\cdot d(I,L)+(|J|+|L|)\cdot d(J,L)-|L|\cdot d(I,J)}{|I|+|J|+|L|}}$ The global cluster dissimilarity can be expressed as \[ d(A,B)=\sqrt{\frac{2|A||B|}{|A|+|B|}}\cdot\|\vec c_A-\vec c_B\|, \] where $\vec c_A$ again denotes the centroid of the points in cluster $A$. \end{description} \item [\normalfont\texttt{fastcluster.\textbf{single}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{single}}\label{single} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q single\q}). \item [\normalfont\texttt{fastcluster.\textbf{complete}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{complete}}\label{complete} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q complete\q}). \item [\normalfont\texttt{fastcluster.\textbf{average}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{average}}\label{average} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q average\q}). \item [\normalfont\texttt{fastcluster.\textbf{weighted}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{weighted}}\label{weighted} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q weighted\q}). 
\item [\normalfont\texttt{fastcluster.\textbf{centroid}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{centroid}}\label{centroid} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q centroid\q}). \item [\normalfont\texttt{fastcluster.\textbf{median}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{median}}\label{median} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q median\q}). \item [\normalfont\texttt{fastcluster.\textbf{ward}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{ward}}\label{ward} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q ward\q}). \item [\normalfont\texttt{fastcluster.\textbf{linkage\_vector}}\,(\textit{X, method=\q single\q, metric=\q euclidean\q, extraarg=\q None\q})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{linkage\_vector}}\label{linkage_vector} This performs hierarchical, agglomerative clustering on vector data with memory-saving algorithms. While the \hyperref[linkage]{\texttt{linkage}} method requires $\Theta(N^2)$ memory for clustering of $N$ points, this method needs $\Theta(ND)$ for $N$ points in $\mathbb R^D$, which is usually much smaller. The argument $X$ has the same format as before, when $X$ describes vector data, ie.\ it is an $(N\times D)$ array. Also the output array has the same format. The parameter \textit{method} must be one of \textit{\q single\q}, \textit{\q centroid\q}, \textit{\q median\q}, \textit{\q ward\q}, ie.\ memory-saving algorithms currently exist only for these methods. If \textit{method} is one of \textit{\q centroid\q}, \textit{\q median\q}, \textit{\q ward\q}, the \textit{metric} must be \textit{\q euclidean\q}. Like the \texttt{linkage} method, \texttt{linkage\_vector} does not treat NumPy's \href{http://docs.scipy.org/doc/numpy/reference/maskedarray.html}{masked arrays} as special and simply ignores the mask. 
For single linkage clustering, any dissimilarity function may be chosen. Basically, every metric which is implemented in the method \href{http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html}{\texttt{scipy.spatial.distance.pdist}} is reimplemented here. However, the metrics differ in some instances since a number of mistakes and typos (both in the code and in the documentation) were corrected in the \textit{fastcluster} package.\footnote{Hopefully, the SciPy metric will be corrected in future versions and some day coincide with the \textit{fastcluster} definitions. See the bug reports at \url{http://projects.scipy.org/scipy/ticket/1484}, \url{http://projects.scipy.org/scipy/ticket/1486}.} Therefore, the available metrics with their definitions are listed below as a reference. The symbols $u$ and $v$ mostly denote vectors in $\mathbb R^D$ with coordinates $u_j$ and $v_j$ respectively. See below for additional metrics for Boolean vectors. Unless otherwise stated, the input array $X$ is converted to a floating point array (\texttt{X.dtype==numpy.double}) if it does not already have the required data type. Some metrics accept Boolean input; in this case this is stated explicitly below. \begin{description} \item[\normalfont\textit{\q euclidean\q}:] Euclidean metric, $L_2$ norm \[ d(u,v) = \| u-v\|_2 = \sqrt{\sum_j (u_j-v_j)^2} \] \item[\normalfont\textit{\q sqeuclidean\q}:] squared Euclidean metric \[ d(u,v) = \| u-v\|^2_2 = \sum_j (u_j-v_j)^2 \] \item[\normalfont\textit{\q seuclidean\q}:] standardized Euclidean metric \[ d(u,v) = \sqrt{\sum_j (u_j-v_j)^2 /V_j} \] The vector $V=(V_0,\ldots,V_{D-1})$ is given as the \textit{extraarg} argument. If no \textit{extraarg} is given, $V_j$ is by default the unbiased sample variance of all observations in the $j$-th coordinate, $V_j = \Var_i(X_{i,j})=\frac1{N-1}\sum_i(X_{i,j}^2-\mu(X_j)^2)$. (Here, $\mu(X_j)$ denotes as usual the mean of $X_{i,j}$ over all rows $i$.) 
\item[\normalfont\textit{\q mahalanobis\q}:] Mahalanobis distance \[ d(u,v) = \sqrt{(u-v)^{\mkern-3mu\top}V (u-v)} \] Here, $V=\textit{extraarg}$, a $(D\times D)$-matrix. If $V$ is not specified, the inverse of the covariance matrix \texttt{numpy.linalg.inv(numpy.cov(X, rowvar=False))} is used: \[ (V^{-1})_{j,k} = \frac1{N-1} \sum_i (X_{i,j}-\mu(X_j))(X_{i,k}-\mu(X_k)) \] \item[\normalfont\textit{\q cityblock\q}:] the Manhattan distance, $L_1$ norm \[ d(u,v) = \sum_j |u_j-v_j| \] \item[\normalfont\textit{\q chebychev\q}:] the supremum norm, $L_\infty$ norm \[ d(u,v) = \max_j |u_j-v_j| \] \item[\normalfont\textit{\q minkowski\q}:] the $L_p$ norm \[ d(u,v) = \left(\sum_j |u_j-v_j|^p\right)^{1/p} \] This metric coincides with the \textit{cityblock}, \textit{euclidean} and \textit{chebychev} metrics for $p=1$, $p=2$ and $p=\infty$ (\texttt{numpy.inf}), respectively. The parameter $p$ is given as the \textit{\q extraarg\q} argument. \item[\normalfont\textit{\q cosine\q}] \[ d(u,v) = 1 - \frac{\langle u,v\rangle}{\|u\|\cdot\|v\|} = 1 - \frac{\sum_j u_jv_j}{\sqrt{\sum_j u_j^2\cdot \sum_j v_j^2}} \] \item[\normalfont\textit{\q correlation\q}:] This method first mean-centers the rows of $X$ and then applies the \textit{cosine} distance. Equivalently, the \textit{correlation} distance measures $1-{}$\textrm{(Pearson's correlation coefficient)}. \[ d(u,v) = 1 - \frac{\langle u-\mu(u),v-\mu(v)\rangle}{\|u-\mu(u)\|\cdot\|v-\mu(v)\|}, \] \item[\normalfont\textit{\q canberra\q}] \[ d(u,v) = \sum_j\frac{|u_j-v_j|}{|u_j|+|v_j|} \] Summands with $u_j=v_j=0$ contribute 0 to the sum. \item[\normalfont\textit{\q braycurtis\q}] \[ d(u,v) = \frac{\sum_j |u_j-v_j|}{\sum_j |u_j+v_j|} \] \item[\textnormal{(user function):}] The parameter \textit{metric} may also be a function which accepts two NumPy floating point vectors and returns a number. 
Eg.\ the Euclidean distance could be emulated with \begin{quote} \texttt{fn = lambda u, v: numpy.sqrt(((u-v)*(u-v)).sum())}\\ \texttt{linkage\_vector(X, method=\q single\q, metric=fn)} \end{quote} This method, however, is much slower than the built-in function. \item[\normalfont\textit{\q hamming\q}:] The Hamming distance accepts a Boolean array (\texttt{X.dtype==bool}) for efficient storage. Any other data type is converted to \texttt{numpy.double}. \[ d(u,v) = |\{j\mid u_j\neq v_j\}| \] \item[\normalfont\textit{\q jaccard\q}:] The Jaccard distance accepts a Boolean array (\texttt{X.dtype\hskip0pt ==\hskip0pt bool}) for efficient storage. Any other data type is converted to \texttt{numpy.double}. \[ d(u,v) = \frac{|\{j\mid u_j\neq v_j\}|}{|\{j\mid u_j\neq 0\text{ or } v_j\neq 0\}|} \] \[ d(0,0) = 0 \] Python represents \texttt{True} by 1 and \texttt{False} by 0. In the Boolean case, the Jaccard distance is therefore: \[ d(u,v) = \frac{|\{j\mid u_j\neq v_j\}|}{|\{j\mid u_j\lor v_j\}|} \] \end{description} The following metrics are designed for Boolean vectors. The input array is converted to the \texttt{bool} data type if it is not Boolean already. Use the following abbreviations for the entries of a contingency table: \begin{align*} a &= |\{j\mid u_j\land v_j \}| & b &= |\{j\mid u_j\land(\lnot v_j)\}|\\ c &= |\{j\mid (\lnot u_j)\land v_j \}| & d &= |\{j\mid (\lnot u_j)\land(\lnot v_j)\}| \end{align*} Recall that $D$ denotes the number of dimensions, hence $D=a+b+c+d$. 
\begin{description} \item[\normalfont\textit{\q yule\q}] \[ d(u,v) = \frac{2bc}{ad+bc} \] \item[\normalfont\textit{\q dice\q}] \begin{gather*} d(u,v) = \frac{b+c}{2a+b+c}\\ d(0,0) = 0 \end{gather*} \item[\normalfont\textit{\q rogerstanimoto\q}] \[ d(u,v) = \frac{2(b+c)}{b+c+D} \] \item[\normalfont\textit{\q russellrao\q}] \[ d(u,v) = \frac{b+c+d}{D} \] \item[\normalfont\textit{\q sokalsneath\q}] \begin{gather*} d(u,v) = \frac{2(b+c)}{a+2(b+c)}\\ d(0,0) = 0 \end{gather*} \item[\normalfont\textit{\q kulsinski\q}] \[ d(u,v) = \frac 12\cdot\left(\frac b{a+b} + \frac c{a+c}\right) \] \item[\normalfont\textit{\q matching\q}] \[ d(u,v) = \frac{b+c}{D} \] Notice that when given a Boolean array, the \textit{matching} and \textit{hamming} distance are the same. The \textit{matching} distance formula, however, converts every input to Boolean first. Hence, the vectors $(0,1)$ and $(0,2)$ have zero \textit{matching} distance since they are both converted to $(\mathrm{False}, \mathrm{True})$ but the \textit{hamming} distance is $0.5$. \item[\normalfont\textit{\q sokalmichener\q}] is an alias for \textit{\q matching\q}. \end{description} \end{methods} \section{Behavior for NaN and infinite values}\label{sec:infnan} Whenever the fastcluster package encounters a NaN value as the distance between nodes, either as the initial distance or as an updated distance after some merging steps, it raises an error. This was designed intentionally, even if there might be ways to propagate NaNs through the algorithms in a more or less sensible way. Indeed, since the clustering result depends on every single distance value, the presence of NaN values usually indicates a dubious clustering result, and therefore NaN values should be eliminated in preprocessing. In the R interface for vector input, coordinates with {\NA} value are interpreted as missing data and treated in the same way as R's {\dist} function does. This results in valid output whenever the resulting distances are not NaN. 
The Python interface does not provide any way of handling missing coordinates, and data should be processed accordingly and given as pairwise distances to the clustering algorithms in this case. The fastcluster package handles node distances and coordinates with infinite values correctly, as long as the formulas for the distance updates and the metric (in case of vector input) make sense. In concordance with the statement above, an error is produced if a NaN value results from performing arithmetic with infinity. Also, the usual proviso applies: internal formulas in the code are mathematically equivalent to the formulas as stated in the documentation only for finite, real numbers but might produce different results for $\pm\infty$. Apart from obvious cases like single or complete linkage, it is therefore recommended that users think about how they want infinite values to be treated by the distance update and metric formulas and then check whether the fastcluster code does exactly what they want in these special cases. \section{Differences between the two interfaces} \begin{itemize} \item The \textit{\q mcquitty\q} method in R is called \textit{\q weighted\q} in Python. \item R and SciPy use different conventions for the ``Euclidean'' methods \textit{\q centroid\q}, \textit{\q median\q} and \textit{\q Ward\q}! R assumes that the dissimilarity matrix consists of squared Euclidean distances, while SciPy expects non-squared Euclidean distances. The fastcluster package respects these conventions and uses different formulas in the two interfaces. If the same results in both interfaces ought to be obtained, then the \hyperref[hclust]{\texttt{hclust}} function in R must be fed with the entry-wise square of the distance matrix, \verb!d^2!, for the \textit{\q ward\q}, \textit{\q centroid\q} and \textit{\q median\q} methods, and later the square root of the height field in the dendrogram must be taken. 
The \hyperref[hclust.vector]{\texttt{hclust.vector}} method calculates non-squared Euclidean distances, like R's \dist{} method and the Python interface. See the \hyperref[squared]{example} in the \hyperref[hclust.vector]{\texttt{hclust.vector}} documentation above. For the \textit{\q average\q} and \textit{\q weighted\q} alias \textit{\q mcquitty\q} methods, the same, non-squared distance matrix \texttt{d} as in the Python interface must be used for the same results. The \textit{\q single\q} and \textit{\q complete\q} methods only depend on the relative order of the distances, hence it does not make a difference whether the method operates on the distances or the squared distances. The code example in the R documentation (enter \texttt{?hclust} or \texttt{example(hclust)} in R) contains another instance where the squared distance matrix is generated from Euclidean data. \item The Python interface is not designed to deal with missing values, and NaN values in the vector data raise an error message. The \hyperref[hclust.vector]{\texttt{hclust.vector}} method in the R interface, in contrast, deals with NaN and the (R specific) {\NA} values in the same way as the \dist{} method does. Confer the documentation for \dist{} for details. \end{itemize} \section{References} \begin{trivlist} \item \textit{NumPy: Scientific computing tools for Python}, \url{http://numpy.scipy.org/}. \item Eric Jones, Travis Oliphant, Pearu Peterson et al., \textit{SciPy: Open Source Scientific Tools for Python}, 2001, \url{http://www.scipy.org}. \item \textit{R: A Language and Environment for Statistical Computing}, R Foundation for Statistical Computing, Vienna, 2011, \url{http://www.r-project.org}. 
\end{trivlist} \end{document} %%% Local variables: %%% mode: latex %%% TeX-master: "fastcluster.Rtex" %%% TeX-PDF-mode: t %%% End: fastcluster/inst/CITATION0000644000176000001440000000133312147514400014751 0ustar ripleyuserscitHeader("To cite fastcluster in publications use:") citEntry(entry = "Article", title = "{fastcluster}: Fast Hierarchical, Agglomerative Clustering Routines for {R} and {Python}", author = personList(as.person("Daniel M\\\"ullner")), journal = "Journal of Statistical Software", year = "2013", volume = "53", number = "9", pages = "1--18", url = "http://www.jstatsoft.org/v53/i09/", textVersion = paste("Daniel Müllner (2013).", "fastcluster: Fast Hierarchical, Agglomerative Clustering Routines for R and Python.", "Journal of Statistical Software, 53(9), 1-18.", "URL http://www.jstatsoft.org/v53/i09/.") ) fastcluster/README0000644000176000001440000001416612147427621013537 0ustar ripleyusersfastcluster: Fast hierarchical clustering routines for R and Python Copyright © 2011 Daniel Müllner The fastcluster package is a C++ library for hierarchical, agglomerative clustering. It efficiently implements the seven most widely used clustering schemes: single, complete, average, weighted/McQuitty, Ward, centroid and median linkage. The library currently has interfaces to two languages: R and Python/NumPy. Part of the functionality is designed as drop-in replacement for existing routines: “linkage” in the SciPy package “scipy.cluster.hierarchy”, “hclust” in R's “stats” package, and the “flashClust” package. Once the fastcluster library is loaded at the beginning of the code, every program that uses hierarchical clustering can benefit immediately and effortlessly from the performance gain. Moreover, there are memory-saving routines for clustering of vector data, which go beyond what the existing packages provide. See the author's home page for more information, in particular a performance comparison with other clustering packages. 
The User's manual is the file inst/doc/fastcluster.pdf in the source distribution. The fastcluster package is distributed under the BSD license. See the file LICENSE in the source distribution or . Installation ‾‾‾‾‾‾‾‾‾‾‾‾ See the file INSTALL in the source distribution. Usage ‾‾‾‾‾ 1. R ‾‾‾‾ In R, load the package with the following command: library('fastcluster') The package overwrites the function hclust from the “stats” package (in the same way as the flashClust package does). Please remove any references to the flashClust package in your R files to not accidentally overwrite the hclust function with the flashClust version. The new hclust function has exactly the same calling conventions as the old one. You may just load the package and immediately and effortlessly enjoy the performance improvements. The function is also an improvement to the flashClust function from the “flashClust” package. Just replace every call to flashClust by hclust and expect your code to work as before, only faster. (If you are using flashClust prior to version 1.01, update it! See the change log for flashClust: http://cran.r-project.org/web/packages/flashClust/ChangeLog ) If you need to access the old function or make sure that the right function is called, specify the package as follows: fastcluster::hclust(…) flashClust::hclust(…) stats::hclust(…) Vector data can be clustered with a memory-saving algorithm with the command hclust.vector(…) See the User's manual inst/doc/fastcluster.pdf for further details. WARNING ‾‾‾‾‾‾‾ R and Matlab/SciPy use different conventions for the “Ward”, “centroid” and “median” methods. R assumes that the dissimilarity matrix consists of squared Euclidean distances, while Matlab and SciPy expect non-squared Euclidean distances. The fastcluster package respects these conventions and uses different formulas in the two interfaces. 
If you want the same results in both interfaces, then feed the hclust function in R with the entry-wise square of the distance matrix, D^2, for the “Ward”, “centroid” and “median” methods and later take the square root of the height field in the dendrogram. For the “average” and “weighted” alias “mcquitty” methods, you must still take the same distance matrix D as in the Python interface for the same results. The “single” and “complete” methods only depend on the relative order of the distances, hence it does not make a difference whether the method operates on the distances or the squared distances. The code example in the R documentation (enter ?hclust or example(hclust) in R) contains an instance where the squared distance matrix is generated from Euclidean data. 2. Python ‾‾‾‾‾‾‾‾‾ The fastcluster package is imported as usual by import fastcluster It provides the following functions: linkage(X, method='single', metric='euclidean', preserve_input=True) single(X) complete(X) average(X) weighted(X) ward(X) centroid(X) median(X) linkage_vector(X, method='single', metric='euclidean', extraarg=None) The argument X is either a compressed distance matrix or a collection of n observation vectors in d dimensions as an (n×d) array. Apart from the argument preserve_input, the methods have the same input and output as the functions of the same name in the package scipy.cluster.hierarchy. The additional, optional argument preserve_input specifies whether the fastcluster package first copies the distance matrix or writes into the existing array. If the dissimilarities are generated for the clustering step only and are not needed afterward, approximately half the memory can be saved by specifying preserve_input=False. Note that the input array X contains unspecified values after this procedure. You may want to write linkage(X, method='…', preserve_input=False) del X to make sure that the matrix X is not accessed accidentally after it has been used as scratch memory. 
The method linkage_vector(X, method='single', metric='euclidean', extraarg=None) provides memory-saving clustering for vector data. It also accepts a collection of n observation vectors in d dimensions as an (n×d) array as the first parameter. The parameter 'method' is either 'single', 'ward', 'centroid' or 'median'. The 'ward', 'centroid' and 'median' methods require the Euclidean metric. In case of single linkage, the 'metric' parameter can be chosen from all metrics which are implemented in scipy.spatial.dist.pdist. There may be differences between linkage(scipy.spatial.dist.pdist(X, metric='…')) and linkage_vector(X, metric='…') since there have been made a few corrections compared to the pdist function. Please consult the the User's manual inst/doc/fastcluster.pdf for comprehensive details. fastcluster/R/0000755000176000001440000000000012147324442013045 5ustar ripleyusersfastcluster/R/fastcluster.R0000644000176000001440000000361011727523223015530 0ustar ripleyusers# fastcluster: Fast hierarchical clustering routines for R and Python # # Copyright © 2011 Daniel Müllner # hclust <- function(d, method="complete", members=NULL) { # Hierarchical clustering, on raw input data. METHODS <- c("single", "complete", "average", "mcquitty", "ward", "centroid", "median") method <- pmatch(method, METHODS) if (is.na(method)) stop("Invalid clustering method.") if (method == -1) stop("Ambiguous clustering method.") dendrogram <- c( .Call(fastcluster, attr(d, "Size"), method, d, members), list( labels = attr(d, "Labels") ,method = METHODS[method] ,call = match.call() ,dist.method = attr(d, "method") ) ) class(dendrogram) <- "hclust" return (dendrogram) } hclust.vector <- function(X, method='single', members=NULL, metric='euclidean', p=NULL) { # Hierarchical clustering, on vector data. 
METHODS <- c("single", "ward", "centroid", "median") methodidx <- pmatch(method, METHODS) if (is.na(methodidx)) stop(paste("Invalid clustering method '", method, "' for vector data.", sep='')) if (methodidx == -1) stop("Ambiguous clustering method.") METRICS <- c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski") metric = pmatch(metric, METRICS) if (is.na(metric)) stop("Invalid metric.") if (metric == -1) stop("Ambiguous metric.") if (methodidx!=1 && metric!=1) stop("The Euclidean methods 'ward', 'centroid' and 'median' require the 'euclidean' metric.") X <- as.matrix(X) dendrogram <- c( .Call(fastcluster_vector, methodidx, metric, X, members, p), list( labels = dimnames(X)[[1L]] ,method = METHODS[methodidx] ,call = match.call() ,dist.method = METRICS[metric] ) ) class(dendrogram) <- "hclust" return (dendrogram) } fastcluster/NEWS0000644000176000001440000001076512147427516013362 0ustar ripleyusersfastcluster: Fast hierarchical clustering routines for R and Python Copyright © 2011 Daniel Müllner Version history ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ Version 1.0.0, 03/14/2011: • Initial release, dependent on Rcpp. Not available on CRAN. Version 1.0.1, 03/15/2011: • Removed the dependence on Rcpp; only R's original C interface is used. Version 1.0.2, 03/17/2011: • File DESCRIPTION: Fixed a typo Version 1.0.3, 03/20/2011: • File README: Removed the warning about false results from the flashClust package since the new flashClust version 1.01 has this error corrected. • Cleaned the test file fastcluster_test.R up. (No dependence on the MASS package any more) Version 1.0.4, 03/21/2011: • Changed the name of the external function from the outdated "Rcpp_linkage" to "fastcluster". • Registered the external function "fastcluster" in R. • Configured the C header inclusions to work on Fedora (thanks to Peter Langfelder). Version 1.1.0, 08/21/2011 • Routines for clustering vector data. 
• Added a User's manual • Revision of all files Version 1.1.1, 10/08/2011 • Fixed test scripts, which indicated an error on some architectures, even if results were correct. (The assumption was that ties in single linkage clustering are resolved in the same way, both for dissimilarity input and for vector input. This is not necessarily true if the floating point unit uses "excess precision". Now the test scripts are content with arbitrary resolution of ties and do not assume a specific scheme.) • Bug fix: uninitialized function pointer in Version 1.1.0 Version 1.1.2, 10/11/2011 • Fix for Solaris: replaced ssize_t by ptrdiff_t in the C++ code. • Removed the NN-chain algorithm for vector input: it was not clear that it would work under all circumstances with the intricacies of floating- point arithmetic. Especially the effects of the excess precision on the x87 are impossible to control in a portable way. Now, the memory-saving routines for the “Ward” linkage use the generic algorithm, as “centroid” and “median” linkage do. Version 1.1.3, 12/10/2011 • Replaced ptrdiff_t by std::ptrdiff_t, as GCC 4.6.1 complains about this. Version 1.1.4, 02/01/2012 • Release the GIL in the Python package, so that it can be used efficiently in multithreaded applications. • Improved performance for the "Ward" method with vector input. • The "members" parameter in the R interface is now treated as a double array, not an integer array as before. This was a slight incompatibility with the stats::hclust function. Thanks to Matthias Studer, University of Geneva, for pointing this out. Version 1.1.5, 02/14/2012 • Updated the "members" specification in the User's manual to reflect the recent change. Version 1.1.6, 03/12/2012 • Bug fix related to GIL release in the Python wrapper. Thanks to Massimo Di Stefano for the bug report. • Small compatibility changes in the Python test scripts (again thanks to Massimo Di Stefano for the report). 
Version 1.1.7, 09/17/2012 • Scipy import is now optional (suggested by Forest Gregg) • Compatibility fix for NumPy 1.7. Thanks to Semihcan Doken for the bug report. Version 1.1.8, 08/28/2012 • Test for NaN dissimilarity values: Now the algorithms produce an error message instead of silently giving false results. The documentation was updated accordingly. This is the final design as intended: the fastcluster package handles infinity values correctly but complains about NaNs. • The Python interface now works with both Python 2 and Python 3. • Changed the license to BSD. Version 1.1.9, 03/15/2013 • Compatibility fix for the MSVC compilers on Windows. • Simplified GIL release in the Python interface. Version 1.1.10, 05/22/2013 • Updated citation information (JSS paper). • Suppress warnings where applicable. Compilation with GCC should not produce any warning at all, even if all compiler warnings are enabled. (The switch -pedantic still does not work, but this is due to the Python headers.) • Optimization: Hidden symbols. Only the interface functions are exported to the symbol table with GCC. Version 1.1.11, 05/23/2013 • Compatibility fix for Solaris. fastcluster/NAMESPACE0000644000176000001440000000011711727523223014063 0ustar ripleyusersuseDynLib(fastcluster, .registration=TRUE) export('hclust', 'hclust.vector') fastcluster/LICENSE0000644000176000001440000000241612076370772013664 0ustar ripleyusersCopyright © 2011, Daniel Müllner All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fastcluster/INSTALL0000644000176000001440000000733012147520402013672 0ustar ripleyusersfastcluster: Fast hierarchical clustering routines for R and Python Copyright © 2011 Daniel Müllner Installation ‾‾‾‾‾‾‾‾‾‾‾‾ Installation procedures were tested under 64-bit Ubuntu. CRAN also hosts precompiled binaries (of the R library, not the Python module) for Windows and MacOS X. In principle, it should be possible to install the fastcluster package on any system that has a C++ compiler and R respectively Python with NumPy. There are no unusual libraries needed to compile the package, only the STL library, which every C++ compiler should have by default. Please send me feedback if you accomplish to install the fastcluster package on a certain platform but needed to tweak the configuration! I will update the installation instructions and modify the package if needed (eg. include the right compiler flags for various operating systems). Installation for R ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ Enter the command install.packages("fastcluster") in R, and R will download the package automatically, then install it. That's it! 
If this does not work, please consult R's help function by typing ?INSTALL from within R or read the “R installation and administration” manual: http://cran.r-project.org/doc/manuals/R-admin.html#Installing-packages For manual download, you can get the fastcluster package from the download page at CRAN: http://cran.r-project.org/web/packages/fastcluster/ You may need to start R with administrator rights to be able to install packages. There are ways to install R packages without administrator privileges in your user directories. See this help page for example: http://csg.sph.umich.edu/docs/R/localpackages.html Installation for Python ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ Make sure that you have both Python and NumPy installed. 1. With setuptools ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ If setuptools are installed, type easy_install --upgrade --user fastcluster in a terminal, which automatically downloads the latest version from PyPI, compiles the C++ library and installs the package for a single user without administrator rights. If you cannot make this work, use method 2. Also, this method gives you access to all accompanying information, in particular the documentation in docs/fastcluster.pdf. Moreover, it installs (insignificantly) fewer files. 2. Without setuptools ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ If you have not done so already, download the fastcluster package from PyPI here: http://pypi.python.org/pypi/fastcluster/ Open a terminal, go to the directory with the downloaded file and extract the contents of the archive with: tar -xvf fastcluster-(version).tar.gz Alternatively, use your favorite archive manager for unpacking, eg. on Windows. This will generate a new directory “fastcluster-(version)”. Switch to this subdirectory: cd fastcluster-(...) Now compile and install the Python module by: python setup.py install You may need to precede this command with sudo or install the package in your home directory. 
For example, the line python setup.py install --user installs the package in the path lib/python/ under your home directory. See the chapter “Installing Python modules” in the Python documentation for further help: http://docs.python.org/install/index.html 3. Microsoft Windows ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ Christoph Gohlke provides installation files for Windows on his web page: http://www.lfd.uci.edu/~gohlke/pythonlibs/#fastcluster fastcluster/DESCRIPTION0000644000176000001440000000262712147576001014361 0ustar ripleyusersPackage: fastcluster Encoding: UTF-8 Type: Package Version: 1.1.11 Date: 2013-05-23 Title: Fast hierarchical clustering routines for R and Python Authors@R: person("Daniel", "Müllner", email = "muellner@math.stanford.edu", role = c("aut", "cph", "cre")) Author: Daniel Müllner, http://math.stanford.edu/~muellner Maintainer: Daniel Müllner Enhances: stats, flashClust Description: This is a two-in-one package which provides interfaces to both R and Python. It implements fast hierarchical, agglomerative clustering routines. Part of the functionality is designed as drop-in replacement for existing routines: “linkage” in the SciPy package “scipy.cluster.hierarchy”, “hclust” in R's “stats” package, and the “flashClust” package. It provides the same functionality with the benefit of a much faster implementation. Moreover, there are memory-saving routines for clustering of vector data, which go beyond what the existing packages provide. For information on how to install the Python files, see the file INSTALL in the source distribution. License: FreeBSD | GPL-2 | file LICENSE URL: http://math.stanford.edu/~muellner/fastcluster.html BuildVignettes: False Packaged: 2013-05-23 23:13:18 UTC; muellner NeedsCompilation: yes Repository: CRAN Date/Publication: 2013-05-24 01:24:06