genie/ 0000755 0001762 0000144 00000000000 15113354772 011351 5 ustar ligges users genie/tests/ 0000755 0001762 0000144 00000000000 14700200755 012503 5 ustar ligges users genie/tests/testthat/ 0000755 0001762 0000144 00000000000 15113354772 014353 5 ustar ligges users genie/tests/testthat/test-single.R 0000644 0001762 0000144 00000001376 14700200755 016733 0 ustar ligges users library("testthat")
library("genie")
library("stats")
context("hclust2 vs single linkage")
test_that("single_iris_distmat", {
   # hclust2() is documented to give the standard single linkage when
   # thresholdGini is 1; here both algorithms get the same precomputed `dist`.
   library("datasets")
   data("iris")
   measurements <- as.matrix(iris[, 1:4])
   # break ties between observations -- otherwise the solution is not unique
   measurements[, ] <- jitter(measurements)
   distances <- dist(measurements)

   genie_tree <- hclust2(distances, thresholdGini = 1.0)
   single_tree <- hclust(distances, method = "single")

   expect_equal(genie_tree[["merge"]], single_tree[["merge"]])
   expect_equal(genie_tree[["order"]], single_tree[["order"]])
})
test_that("single_iris_defaultdist", {
   # Same comparison as above, but hclust2() receives the raw numeric matrix
   # via `objects` and uses its default dissimilarity, while the reference
   # result comes from stats::hclust() on dist() of the same matrix.
   library("datasets")
   data("iris")
   measurements <- as.matrix(iris[, 2:3])
   # break ties between observations -- otherwise the solution is not unique
   measurements[, ] <- jitter(measurements)

   genie_tree <- hclust2(objects = measurements, thresholdGini = 1.0)
   single_tree <- hclust(dist(measurements), method = "single")

   expect_equal(genie_tree[["merge"]], single_tree[["merge"]])
   expect_equal(genie_tree[["order"]], single_tree[["order"]])
})
genie/tests/testthat.R 0000644 0001762 0000144 00000000071 14700200755 014464 0 ustar ligges users library("testthat")
# Attach the package under test, then hand control to testthat, which is
# expected to run the test files shipped under tests/testthat/.
library("genie")
test_check("genie")
genie/MD5 0000644 0001762 0000144 00000003406 15113354772 011664 0 ustar ligges users e029c6b50a0b2f3fcd6375f62b6d7926 *DESCRIPTION
0cce106eda8a0102d80da3495233fff5 *NAMESPACE
27f47ab4b58c0edf97d3add4deac51ea *NEWS
1d29001a8b2960673856de5c93b19fc8 *R/RcppExports.R
ebd61f78f6aa3f40520b2245cd77f2e4 *R/genie-package.R
885bd658b3dbe4868767b57319e793fc *R/hclust2.R
13e66c864ae44f36588e4ba53abdf5fa *inst/CITATION
b515038389d74fe9d8cfed472ea32b35 *man/genie-package.Rd
9e7e26bb241a59479ed35cf804a6b5d9 *man/hclust2.Rd
e2d347f9ad519d45dd6bcf3703472671 *src/Makevars
e2d347f9ad519d45dd6bcf3703472671 *src/Makevars.win
6ab5e681b92cf54e2577bb5855d717e1 *src/RcppExports.cpp
f75e43cdbffbed5a88833191821e4277 *src/defs.h
19da696feb39964eaddf74753fbcc9e1 *src/disjoint_sets.cpp
a0f2862b8ccca8df68b6b5ab19cbf801 *src/disjoint_sets.h
49aaf1500a53ad7c16c916969781ac98 *src/hclust2_common.cpp
425449d3460b875451245a9e47d57aa0 *src/hclust2_common.h
26767a736d26bca88ff22d99e8d96023 *src/hclust2_distance.cpp
99e1c70992b7ac410d411d4430710e97 *src/hclust2_distance.h
10c7d13868fdc278ab6d1013160dc98c *src/hclust2_mstbased_gini.cpp
c35e21e035c416d3034b7cdf9588786a *src/hclust2_mstbased_gini.h
6b2dc47728ae00ef9d7f9ee4dff2d99b *src/hclust2_nnbased_gini.h
b97f1eee13606a015e4303287dcd6c59 *src/hclust2_nnbased_single.cpp
466ca75c790c4484d2353fbdd65af2f5 *src/hclust2_nnbased_single.h
20e177cb965249d834f5ec343e6bff8b *src/hclust2_rcpp_gini.cpp
81e2f743988de6587d1f4604aab57d54 *src/hclust2_result.cpp
34a422f0c54d17bc456006ef326c1438 *src/hclust2_result.h
bd1f6a05d45a343aaa1f6e2535adc769 *src/hclust2_vptree_gini.h
7e6c2b8f4a40908627519dbc51d48c66 *src/hclust2_vptree_single.cpp
eda081a40e264ca13aa79255bf43bfea *src/hclust2_vptree_single.h
f42c9bc4435bb983274895a0f4fc9255 *src/init.cpp
ac0b48e83e47e2efe2d9e15b791155ac *tests/testthat.R
5a2370f089507fe281c93e3c512a0314 *tests/testthat/test-single.R
genie/R/ 0000755 0001762 0000144 00000000000 15113346026 011543 5 ustar ligges users genie/R/RcppExports.R 0000644 0001762 0000144 00000000376 15113346026 014165 0 ustar ligges users # Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
# Internal wrapper: passes `distance`, `objects` and `control` unchanged to the
# native routine `_genie_hclust2_gini` through .Call() and returns its result.
.hclust2_gini <- function(distance, objects, control = NULL) {
.Call(`_genie_hclust2_gini`, distance, objects, control)
}
genie/R/genie-package.R 0000644 0001762 0000144 00000000351 15113324536 014347 0 ustar ligges users #' @title The Genie Package
#'
#' @description
#' See \code{\link{hclust2}()} for details.
#'
#' @author Marek Gagolewski, Maciej Bartoszuk, Anna Cena
#'
#' @useDynLib genie, .registration=TRUE
#' @importFrom Rcpp evalCpp
"_PACKAGE"
genie/R/hclust2.R 0000644 0001762 0000144 00000013306 15015566476 013273 0 ustar ligges users #' @title
#' Fast Hierarchical Clustering in Spaces Equipped With
#' a Dissimilarity Measure
#'
#' @description
#' The reference implementation of the fast, robust and outlier resistant
#' Genie algorithm described in (Gagolewski, Bartoszuk, Cena, 2016).
#' Note that the \code{genie} package has been superseded by \code{genieclust},
#' see \code{\link[genieclust]{gclust}} and \code{\link[genieclust]{genie}}
#' for more details.
#'
#' @param d an object of class \code{\link[stats]{dist}},
#' \code{NULL}, or a single string, see below
#' @param objects \code{NULL}, numeric matrix, a list, or a character vector
#' @param thresholdGini single numeric value in [0,1],
#' threshold for the Gini index, 1 gives the standard single linkage algorithm
#' @param ... internal parameters used to tune up the algorithm
#'
#' @details
#' The time needed to apply a hierarchical clustering algorithm
#' is most often dominated by the number of computations of a pairwise
#' dissimilarity measure. Such a constraint, for larger data sets,
#' puts at a disadvantage the use of all the classical linkage
#' criteria but the single linkage one. However, it is known that the single
#' linkage clustering algorithm is very sensitive to outliers, produces highly
#' skewed dendrograms, and therefore usually does not reflect the true
#' underlying data structure -- unless the clusters are well-separated.
#'
#' To overcome its limitations, in (Gagolewski, Bartoszuk, Cena, 2016)
#' we proposed a new hierarchical clustering linkage
#' criterion. Namely, our algorithm links two clusters in such a way that a chosen
#' economic inequity measure (here, the Gini index) of the cluster
#' sizes does not increase drastically above a given threshold. The
#' benchmarks indicate a high practical usefulness of the introduced method:
#' it most often outperforms the Ward or average linkage in terms of the
#' clustering quality while retaining the single linkage speed.
#' The algorithm can be run in parallel (via OpenMP) on multiple threads
#' to speed up its execution even further.
#' Its memory overhead is small: there is no need to precompute the complete
#' distance matrix to perform the computations in order to obtain a desired
#' clustering.
#'
#' For compatibility with \code{\link[stats]{hclust}}, \code{d} may be an object
#' of class \code{\link[stats]{dist}}. In such a case, the \code{objects}
#' argument is ignored. Note that such an object requires ca. \emph{8n(n-1)/2}
#' bytes of computer's memory, where \emph{n} is the number of objects to cluster,
#' and therefore this setting can be used to analyse data sets of sizes
#' up to about 10,000-50,000.
#'
#' If \code{objects} is a character vector or a list, then \code{d}
#' should be a single string, one of: \code{levenshtein} (or \code{NULL}),
#' \code{hamming}, \code{dinu} (Dinu, Sgarro, 2006),
#' or \code{euclinf} (Cena et al., 2015).
#' Note that the list must consist
#' either of integer or of numeric vectors only (depending on the dissimilarity
#' measure of choice). On the other hand, each string must be in ASCII,
#' but you can always convert it to UTF-32 with
#' \code{\link[stringi]{stri_enc_toutf32}}.
#'
#' Otherwise, if \code{objects} is a numeric matrix (here, each row
#' denotes a distinct observation), then \code{d} should be
#' a single string, one of: \code{euclidean_squared} (or \code{NULL}),
#' \code{euclidean} (which yields the same results as \code{euclidean_squared}),
#' \code{manhattan}, \code{maximum}, or \code{hamming}.
#'
#' @return
#' A named list of class \code{hclust}, see \code{\link[stats]{hclust}},
#' with additional components:
#' \itemize{
#' \item \code{stats} - performance statistics
#' \item \code{control} - internal parameters used
#' }
#'
#' @examples
#' library("datasets")
#' data("iris")
#' h <- hclust2(objects=as.matrix(iris[,2:3]), thresholdGini=0.2)
#' plot(iris[,2], iris[,3], col=cutree(h, 3), pch=as.integer(iris[,5]), asp=1, las=1)
#'
#' @references
#' Cena A., Gagolewski M., Mesiar R., Problems and challenges of information
#' resources producers' clustering, \emph{Journal of Informetrics} 9(2), 2015,
#' pp. 273-284.
#'
#' Dinu L.P., Sgarro A., A Low-complexity Distance for DNA Strings,
#' \emph{Fundamenta Informaticae} 73(3), 2006, pp. 361-372.
#'
#' Gagolewski M., Bartoszuk M., Cena A.,
#' Genie: A new, fast, and outlier-resistant hierarchical clustering algorithm,
#' \emph{Information Sciences} 363, 2016, pp. 8-23.
#'
#' Gagolewski M., Cena A., Bartoszuk M.,
#' \emph{Hierarchical clustering via penalty-based aggregation and the Genie
#' approach}, In: Torra V. et al. (Eds.), \emph{Modeling Decisions for
#' Artificial Intelligence} (\emph{Lecture Notes in Artificial Intelligence}
#' 9880), Springer, 2016, pp. 191-202.
#'
#' @importFrom stats approx
#' @importFrom genieclust gclust
#' @importFrom genieclust genie
#' @export
hclust2 <- function(d=NULL, objects=NULL, thresholdGini=0.3, ...)
{
   # Collect the tuning options handed over to the compiled routine.
   opts <- list(thresholdGini=thresholdGini, useVpTree=FALSE, ...)
   result <- .hclust2_gini(d, objects, opts)
   result[["call"]] <- match.call()
   result[["method"]] <- "gini"

   height <- result[["height"]]
   if (any(height < 0)) {
      # Corrections for departures from ultrametricity:
      # negative heights denote forced Genie merges.
      # We could have just used cummax(), but then we would get multiple
      # merges at the same level; instead we linearly interpolate
      # between the non-negative heights.
      nonNegative <- which(height >= 0)

      # approx() needs at least two known points; with fewer there is nothing
      # to interpolate between, so leave that part of the vector untouched
      # instead of failing.
      if (length(nonNegative) >= 2L) {
         lastNonNegative <- nonNegative[length(nonNegative)]
         span <- seq_len(lastNonNegative)
         height[span] <- approx(nonNegative,          # linear interpolation
                                height[nonNegative],
                                span)$y
      }

      # Whatever is still negative (e.g. the forced merges that follow the
      # last regular one) gets a non-decreasing sequence built from the
      # absolute values.
      stillNegative <- height < 0
      height[stillNegative] <- cummax(-height[stillNegative])

      result[["height"]] <- height
   }

   result
}
genie/NEWS 0000644 0001762 0000144 00000002257 15113315764 012054 0 ustar ligges users genie package NEWS and CHANGELOG
===============================================================================
## 1.0.6 (2025-12-01)
* 'useVpTree' has been removed.
## 1.0.5 (2020-08-02)
* Updated documentation and package metadata.
* This package has been superseded by `genieclust`, which is faster and
more feature-rich (and also available for Python).
## 1.0.4 (2017-04-27)
* Invalid DOI corrected.
## 1.0.3 (2017-04-27)
* [BUILD TIME] Registering native routines and disabling symbol search.
## 1.0.1 (2016-05-25)
* Updated documentation and package metadata.
The algorithm's description can now be found in:
Gagolewski M., Bartoszuk M., Cena A., Genie: A new, fast, and outlier-resistant
hierarchical clustering algorithm, Information Sciences 363, 2016, pp. 8-23,
doi:10.1016/j.ins.2016.05.003
See also:
Gagolewski M., Cena A., Bartoszuk M., Hierarchical clustering via penalty-based
aggregation and the Genie approach, In: Torra V. et al. (Eds.),
Modeling Decisions for Artificial Intelligence (Lecture Notes in Artificial
Intelligence 9880), Springer, 2016, pp. 191-202,
doi:10.1007/978-3-319-45656-0_16.
## 1.0.0 (2016-03-07)
* Initial release.
genie/src/ 0000755 0001762 0000144 00000000000 15113346031 012125 5 ustar ligges users genie/src/disjoint_sets.h 0000644 0001762 0000144 00000010656 15113316206 015170 0 ustar ligges users /* ************************************************************************* *
* This file is part of the `genie` package for R. *
* *
* Copyright 2015-2025 Marek Gagolewski, Maciej Bartoszuk, Anna Cena *
* *
* 'genie' is free software: you can redistribute it and/or *
* modify it under the terms of the GNU General Public License *
* as published by the Free Software Foundation, either version 3 *
* of the License, or (at your option) any later version. *
* *
* 'genie' is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with 'genie'. If not, see <http://www.gnu.org/licenses/>. *
* ************************************************************************* */
#ifndef __DISJOINT_SETS_H
#define __DISJOINT_SETS_H
#include "defs.h"
#include
#include
#include
#include
/* see defs.h */
/* DISJOINT_SETS_DEBUG_CONST is appended to the accessor declarations below.
 * Without DISJOINT_SETS_DEBUG it expands to `const`. With DISJOINT_SETS_DEBUG
 * the accessors run sanity checks that call find_set(), which modifies the
 * parent table (path compression), so the qualifier is dropped.
 */
#ifndef DISJOINT_SETS_DEBUG
#define DISJOINT_SETS_DEBUG_CONST const
#else
#define DISJOINT_SETS_DEBUG_CONST /* const */
#endif
namespace grup {
/** Disjoint-sets (union-find) structure with path compression.
 *
 *  Every element x stores a parent in clusterParent; an element that is its
 *  own parent is the representative of its set.
 */
class DisjointSets {
private:
   std::vector< std::size_t > clusterParent;

protected:
   std::size_t n;

public:
   DisjointSets(std::size_t n);
   virtual ~DisjointSets();

   virtual std::size_t link(std::size_t x, std::size_t y, std::size_t z);
   virtual std::size_t link(std::size_t x, std::size_t y);
   std::size_t union_set(std::size_t x, std::size_t y);

   /** Returns the representative of the set containing x and re-attaches
    *  every element visited on the way directly to that representative.
    */
   inline std::size_t find_set(std::size_t x) {
      // first pass: walk up to the representative
      std::size_t root = x;
      while (clusterParent[root] != root)
         root = clusterParent[root];

      // second pass: compress the path that has just been walked
      while (clusterParent[x] != root) {
         std::size_t up = clusterParent[x];
         clusterParent[x] = root;
         x = up;
      }
      return root;
   }
};
/* DisjointSets extended with per-cluster bookkeeping: sizes, member arrays,
 * prev/next links between clusters, the number of clusters and the smallest
 * cluster size. All accessors below take a set representative x; with
 * DISJOINT_SETS_DEBUG defined they verify that find_set(x) == x.
 */
class PhatDisjointSets : public DisjointSets {
private:
std::vector< std::size_t > clusterSize;
std::vector< std::size_t* > clusterMembers;
std::vector< std::size_t > clusterNext;
std::vector< std::size_t > clusterPrev;
std::size_t clusterCount;
std::size_t minClusterSize;
// presumably the number of clusters of size minClusterSize -- TODO confirm in disjoint_sets.cpp
std::size_t minClusterCount;
void recomputeMinClusterSize();
public:
PhatDisjointSets(std::size_t n);
virtual ~PhatDisjointSets();
virtual std::size_t link(std::size_t x, std::size_t y);
virtual std::size_t link(std::size_t x, std::size_t y, std::size_t z);
inline std::size_t getClusterCount() const { return clusterCount; }
inline std::size_t getMinClusterSize() const { return minClusterSize; }
/* Returns the raw member array stored for representative x. */
inline const std::size_t* getClusterMembers(std::size_t x) DISJOINT_SETS_DEBUG_CONST {
#ifdef DISJOINT_SETS_DEBUG
STOPIFNOT(find_set(x) == x);
STOPIFNOT(clusterMembers[x]);
#endif
return clusterMembers[x];
}
/* Returns the size recorded for representative x. */
inline std::size_t getClusterSize(std::size_t x) DISJOINT_SETS_DEBUG_CONST {
#ifdef DISJOINT_SETS_DEBUG
STOPIFNOT(find_set(x) == x);
STOPIFNOT(clusterSize[x] == 0 || clusterMembers[x] != NULL);
#endif
return clusterSize[x];
}
/* Returns the representative linked before x in the list of clusters. */
inline std::size_t getClusterPrev(std::size_t x) DISJOINT_SETS_DEBUG_CONST {
#ifdef DISJOINT_SETS_DEBUG
STOPIFNOT(find_set(x) == x);
STOPIFNOT(find_set(clusterPrev[x]) == clusterPrev[x]);
STOPIFNOT(find_set(clusterNext[x]) == clusterNext[x]);
#endif
return clusterPrev[x];
}
/* Returns the representative linked after x in the list of clusters. */
inline std::size_t getClusterNext(std::size_t x) DISJOINT_SETS_DEBUG_CONST {
/*
to iterate over all the other clusters starting from x, use something like:
for (size_t nx = ds.getClusterNext(x); nx != x; nx = ds.getClusterNext(nx)) {
// e.g. (getClusterMembers returns a plain array; it presumably holds
// getClusterSize(nx) entries -- confirm in disjoint_sets.cpp):
const size_t* members = ds.getClusterMembers(nx);
for (size_t j = 0; j < ds.getClusterSize(nx); ++j)
// play with members[j]
}
*/
#ifdef DISJOINT_SETS_DEBUG
STOPIFNOT(find_set(x) == x);
STOPIFNOT(find_set(clusterPrev[x]) == clusterPrev[x]);
STOPIFNOT(find_set(clusterNext[x]) == clusterNext[x]);
#endif
return clusterNext[x];
}
};
} /* namespace grup */
#endif
genie/src/hclust2_nnbased_single.cpp 0000644 0001762 0000144 00000015006 15113316206 017253 0 ustar ligges users /* ************************************************************************* *
* This file is part of the `genie` package for R. *
* *
* Copyright 2015-2025 Marek Gagolewski, Maciej Bartoszuk, Anna Cena *
* *
* 'genie' is free software: you can redistribute it and/or *
* modify it under the terms of the GNU General Public License *
* as published by the Free Software Foundation, either version 3 *
* of the License, or (at your option) any later version. *
* *
* 'genie' is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with 'genie'. If not, see <http://www.gnu.org/licenses/>. *
* ************************************************************************* */
#include "hclust2_nnbased_single.h"
using namespace grup;
// constructor (OK, we all know what this is, but I label it for faster in-code search)
// Sizes all per-object state by dist->getObjectCount(): no neighbours found
// yet (neighborsCount = 0), every search radius starts at -INFINITY, and every
// object is still to be searched (shouldFind = true). Neither `dist` nor `opts`
// is copied; only the pointers are stored.
HClustNNbasedSingle::HClustNNbasedSingle(Distance* dist, HClustOptions* opts) :
opts(opts),
n(dist->getObjectCount()),
distance(dist),
indices(dist->getObjectCount()),
neighborsCount(dist->getObjectCount(), 0),
minRadiuses(dist->getObjectCount(), -INFINITY),
shouldFind(dist->getObjectCount(), true),
#ifdef GENERATE_STATS
stats(),
#endif
ds(dist->getObjectCount())
{
// starting indices: random permutation of {0,1,...,_n-1}
// (position i is swapped with a position drawn via unif_rand() from 0..i)
for (size_t i=0;i= 1; i--)
swap(indices[i], indices[(size_t)(unif_rand()*(i+1))]);
// the lock guards the priority queue and is released again in the destructor
#ifdef _OPENMP
omp_init_lock(&pqwritelock);
#endif
}
// Destructor: releases the OpenMP lock initialised in the constructor.
// Nothing else is freed here.
HClustNNbasedSingle::~HClustNNbasedSingle() {
#ifdef _OPENMP
omp_destroy_lock(&pqwritelock);
#endif
}
/** Fetches the next batch of nearest neighbours of the point `index` and
 *  pushes them onto `pq` as merge candidates.
 *
 *  The search resumes from minRadiuses[index]; that radius is then raised to
 *  the largest distance pushed in this call. If the search is to be resumed
 *  later, a marker item (index, SIZE_MAX, radius) is pushed as well; otherwise
 *  shouldFind[index] is cleared and subsequent calls return immediately.
 *
 *  @param pq     queue of candidate pairs; written only while holding
 *                pqwritelock (when compiled with OpenMP)
 *  @param index  the point whose neighbours are requested
 */
void HClustNNbasedSingle::getNearestNeighbors(
   std::priority_queue< HeapHierarchicalItem > & pq,
   size_t index)
{
   if (!shouldFind[index])
      return;   // the search for this point has already been closed

   size_t clusterIndex = ds.find_set(index);

#ifdef GENERATE_STATS
#ifdef _OPENMP
#pragma omp atomic
#endif
   ++stats.nnCals;
#endif

   // the (potentially expensive) search itself runs outside the lock
   NNHeap nnheap;
   getNearestNeighborsFromMinRadius(index, clusterIndex, minRadiuses[index], nnheap);

   size_t newNeighborsCount = 0;

#ifdef _OPENMP
   omp_set_lock(&pqwritelock);
#endif
   while (!nnheap.empty()) {
      // skip entries without a finite distance or without a neighbour index
      if (isfinite(nnheap.top().dist) && nnheap.top().index != SIZE_MAX) {
         ++newNeighborsCount;
         pq.push(HeapHierarchicalItem(index, nnheap.top().index, nnheap.top().dist));
         minRadiuses[index] = std::max(minRadiuses[index], nnheap.top().dist);
      }
      nnheap.pop();
   }
   neighborsCount[index] += newNeighborsCount;

#ifdef GENERATE_STATS
   stats.nnCount += newNeighborsCount;
#endif

   // stop searching when nothing new was found or when more than n - index
   // neighbours have been collected for this point
   if (neighborsCount[index] > n - index || newNeighborsCount == 0)
      shouldFind[index] = false;
   else {
      pq.push(HeapHierarchicalItem(index, SIZE_MAX, minRadiuses[index])); // to be continued...
   }
#ifdef _OPENMP
   omp_unset_lock(&pqwritelock);
#endif
}
// Fills `pq` with an initial set of nearest-neighbour candidates.
void HClustNNbasedSingle::computePrefetch(std::priority_queue< HeapHierarchicalItem > & pq)
{
// INIT: Pre-fetch a few nearest neighbors for each point
MESSAGE_2("[%010.3f] prefetching NNs\n", clock()/(float)CLOCKS_PER_SEC);
#ifdef _OPENMP
omp_set_dynamic(0); /* the runtime will not dynamically adjust the number of threads */
#pragma omp parallel for schedule(dynamic)
#endif
for (size_t i=0; i & pq,
HClustResult& res)
{
// Merge phase: repeatedly takes the top candidate off `pq` and links the two
// clusters it joins, until n-1 merges have been recorded in `res`.
MESSAGE_2("[%010.3f] merging clusters\n", clock()/(float)CLOCKS_PER_SEC);
// `go` and `i` are declared before the parallel region below, so all
// threads see the same loop flag and merge counter
volatile bool go=true;
volatile size_t i = 0;
#ifdef _OPENMP
#pragma omp parallel
#endif
while (go)
{
// the queue is only inspected while holding pqwritelock
#ifdef _OPENMP
omp_set_lock(&pqwritelock);
#endif
STOPIFNOT(!pq.empty())
HeapHierarchicalItem hhi = pq.top();
// index2 == SIZE_MAX is the "to be continued" marker pushed by
// getNearestNeighbors(): fetch more neighbours of index1 and retry
if (hhi.index2 == SIZE_MAX) {
pq.pop();
#ifdef _OPENMP
omp_unset_lock(&pqwritelock);
#endif
getNearestNeighbors(pq, hhi.index1);
continue;
}
size_t s1 = ds.find_set(hhi.index1);
size_t s2 = ds.find_set(hhi.index2);
// both endpoints are already in the same cluster: drop the candidate
if (s1 == s2)
{
pq.pop();
#ifdef _OPENMP
omp_unset_lock(&pqwritelock);
#endif
continue;
}
// the top item joins two different clusters: one thread performs the merge
#ifdef _OPENMP
omp_unset_lock(&pqwritelock); //different threads will be unable to put data into pq without it
#pragma omp barrier
#pragma omp single
#endif
{
hhi = pq.top(); //it can change, because other threads can push something
pq.pop();
s1 = ds.find_set(hhi.index1);
s2 = ds.find_set(hhi.index2);
STOPIFNOT(s1 != s2);
STOPIFNOT(s2 != SIZE_MAX);
STOPIFNOT(hhi.index1 < hhi.index2);
// the result is recorded via the `indices` permutation, whereas the
// disjoint sets are linked by their representatives
res.link(indices[hhi.index1], indices[hhi.index2], hhi.dist);
ds.link(s1, s2);
++i;
if (i == n-1)
go = false;/* avoids computing unnecessary nn */
} // #pragma omp single
if (MASTER_OR_SINGLE_THREAD) {
if (i % 512 == 0) MESSAGE_7("\r merge clusters: %d / %d", i+1, n-1);
Rcpp::checkUserInterrupt(); // may throw an exception, fast op, not thread safe
}
}
MESSAGE_7("\r merge clusters: %d / %d \n", n-1, n-1);
Rcpp::checkUserInterrupt();
}
/** Runs the whole clustering: a prefetch phase that seeds the candidate queue,
 *  followed by the merge phase that fills in the result.
 *
 *  @param lite  forwarded unchanged to the HClustResult constructor
 *  @return the result object populated by computeMerge()
 */
HClustResult HClustNNbasedSingle::compute(bool lite)
{
   HClustResult result(n, distance, lite);
   std::priority_queue< HeapHierarchicalItem > candidates;

#if VERBOSE >= 5
   distance->getStats().print();
#endif

   // phase 1 -- the `prefetch` flag is set only for the duration of this call
   prefetch = true;
   computePrefetch(candidates);
   prefetch = false;

#if VERBOSE >= 5
   distance->getStats().print();
#endif

   // phase 2 -- consume the candidates and record the merges
   computeMerge(candidates, result);

   return result;
}
genie/src/hclust2_vptree_gini.h 0000644 0001762 0000144 00000011155 15113316206 016261 0 ustar ligges users /* ************************************************************************* *
* This file is part of the `genie` package for R. *
* *
* Copyright 2015-2025 Marek Gagolewski, Maciej Bartoszuk, Anna Cena *
* *
* 'genie' is free software: you can redistribute it and/or *
* modify it under the terms of the GNU General Public License *
* as published by the Free Software Foundation, either version 3 *
* of the License, or (at your option) any later version. *
* *
* 'genie' is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with 'genie'. If not, see <http://www.gnu.org/licenses/>. *
* ************************************************************************* */
#ifndef __HCLUST2_VPTREE_GINI_H
#define __HCLUST2_VPTREE_GINI_H
// ************************************************************************
#include "hclust2_nnbased_gini.h"
namespace grup
{
/** A node of the vantage-point tree.
 *
 *  A node whose vpindex equals SIZE_MAX is treated as a leaf by the tree
 *  traversal. Each node owns its two children: destroying a node destroys
 *  the whole subtree below it.
 */
struct HClustVpTreeGiniNode
{
   size_t vpindex;      // vantage point; SIZE_MAX when there is none (leaf)
   size_t left;         // bounds of the node's index range -- presumably
   size_t right;        //   half-open [left, right); confirm against the builder
   double radius;       // -INFINITY until a radius is supplied
   bool sameCluster;    // lets the search skip the node (see the traversal code)
   size_t maxindex;
   HClustVpTreeGiniNode* childL;
   HClustVpTreeGiniNode* childR;

   /** Empty node: every index is SIZE_MAX, no radius, no children. */
   HClustVpTreeGiniNode()
      : vpindex(SIZE_MAX),
        left(SIZE_MAX),
        right(SIZE_MAX),
        radius(-INFINITY),
        sameCluster(false),
        maxindex(SIZE_MAX),
        childL(NULL),
        childR(NULL)
   { }

   /** Node covering the given range, without a vantage point. */
   HClustVpTreeGiniNode(size_t left, size_t right)
      : vpindex(SIZE_MAX),
        left(left),
        right(right),
        radius(-INFINITY),
        sameCluster(false),
        maxindex(SIZE_MAX),
        childL(NULL),
        childR(NULL)
   { }

   /** Node with a vantage point, a range and a radius. */
   HClustVpTreeGiniNode(size_t vpindex, size_t left, size_t right, double radius)
      : vpindex(vpindex),
        left(left),
        right(right),
        radius(radius),
        sameCluster(false),
        maxindex(SIZE_MAX),
        childL(NULL),
        childR(NULL)
   { }

   /** Recursively frees both subtrees (deleting a null pointer is a no-op). */
   ~HClustVpTreeGiniNode() {
      delete childL;
      delete childR;
   }
};
class HClustVpTreeGini : public HClustNNbasedGini
{
protected:
HClustVpTreeGiniNode* root;
// bool visitAll; // for testing only
size_t chooseNewVantagePoint(size_t left, size_t right);
HClustVpTreeGiniNode* buildFromPoints(size_t left, size_t right, std::vector& distances);
inline void getNearestNeighborsFromMinRadiusRecursive(HClustVpTreeGiniNode* node,
size_t index, size_t clusterIndex, double minR, std::priority_queue& bestR, double& maxR, NNHeap& nnheap)
{
// search within (minR, maxR]
STOPIFNOT(node != NULL);
#ifdef GENERATE_STATS
#ifdef _OPENMP
#pragma omp atomic
#endif
++stats.nodeVisit;
#endif
if (!prefetch && node->sameCluster && clusterIndex == ds.find_set(node->left))
return;
if (node->vpindex == SIZE_MAX) { // leaf
getNearestNeighborsFromMinRadiusRecursiveLeaf(node, index, clusterIndex,
minR, bestR, maxR, nnheap);
}
else {
getNearestNeighborsFromMinRadiusRecursiveNonLeaf(node, index, clusterIndex,
minR, bestR, maxR, nnheap);
}
}
void getNearestNeighborsFromMinRadiusRecursiveLeaf(HClustVpTreeGiniNode* node,
size_t index, size_t clusterIndex, double minR, std::priority_queue& bestR, double& maxR, NNHeap& nnheap);
void getNearestNeighborsFromMinRadiusRecursiveNonLeaf(HClustVpTreeGiniNode* node,
size_t index, size_t clusterIndex, double minR, std::priority_queue& bestR, double& maxR, NNHeap& nnheap);
virtual void getNearestNeighborsFromMinRadius(size_t index, size_t clusterIndex, double minR, double& maxR, NNHeap& nnheap) {
std::priority_queue bestR;
size_t minNN = (prefetch)?opts->minNNPrefetch:opts->minNNMerge;
for (size_t i=0; i