densityClust/0000755000176200001440000000000013173633643012755 5ustar liggesusersdensityClust/tests/0000755000176200001440000000000013173617741014120 5ustar liggesusersdensityClust/tests/testthat.R0000644000176200001440000000005013173617741016076 0ustar liggesuserslibrary(testthat) library(densityClust) densityClust/tests/testthat/0000755000176200001440000000000013173633643015757 5ustar liggesusersdensityClust/tests/testthat/generateReference.R0000644000176200001440000000656113173617741021524 0ustar liggesusers# This code generates expected results from the initial reference implementation # (i.e. https://github.com/thomasp85/densityClust/commit/b038fb30ea6f59d60a3a4b45eaa3ac9a504951f6) # It is called by the testing code to compare the old results with the new. set.seed(123) dists <- list( dist(matrix(rnorm(1000), ncol = 4)), dist(matrix(rnorm(1000), ncol = 20)), dist(matrix(rnorm(10000), ncol = 40)), dist(matrix(sample(1:100000, 1000), ncol = 4)), dist(matrix(sample(1:100000, 1000), ncol = 20)), dist(matrix(sample(1:100000, 1000), ncol = 50)) ) referenceImplementation <- function(distance, dc, gaussian=FALSE) { if(missing(dc)) { dc <- reference_estimateDc(distance) } rho <- reference_localDensity(distance, dc, gaussian=gaussian) delta <- reference_distanceToPeak(distance, rho) res <- list(rho=rho, delta=delta, distance=distance, dc=dc, threshold=c(rho=NA, delta=NA), peaks=NA, clusters=NA, halo=NA, knn_graph = NA, nearest_higher_density_neighbor = NA, nn.index = NA, nn.dist = NA) class(res) <- 'densityCluster' res } reference_estimateDc <- function(distance, neighborRateLow=0.01, neighborRateHigh=0.02) { comb <- as.matrix(distance) size <- attr(distance, 'Size') dc <- min(distance) dcMod <- as.numeric(summary(distance)['Median']*0.01) while(TRUE) { neighborRate <- mean((apply(comb < dc, 1, sum)-1)/size) if(neighborRate > neighborRateLow && neighborRate < neighborRateHigh) break if(neighborRate > neighborRateHigh) { dc <- dc - dcMod dcMod <- dcMod/2 } dc <- dc + dcMod } cat('Distance cutoff calculated to', dc, '\n') dc } reference_distanceToPeak <- function(distance, rho) { comb <- as.matrix(distance) res <- sapply(1:length(rho), function(i) { peaks <- comb[rho>rho[i], i] if(length(peaks) == 0) { max(comb[,i]) } else { min(peaks) } }) names(res) <- names(rho) res } reference_localDensity <- function(distance, dc, gaussian=FALSE) { comb <- as.matrix(distance) if(gaussian) { res <- apply(exp(-(comb/dc)^2), 1, sum)-1 } else { res <- apply(comb < dc, 1, sum)-1 } if(is.null(attr(distance, 'Labels'))) { names(res) <- NULL } else { names(res) <- attr(distance, 'Labels') } res } # Because the new implementation of estimateDc does not maintain equality with # the previous implementation, calculate the cutoffs using the new # implementation. Then pass the calculated cutoffs into the reference # implementation and the new implementation to test that the rest of the # implementations are the same. dcs <- lapply(dists, estimateDc) # Reference DCs for comparison referenceDcs <- lapply(dists, reference_estimateDc) # non-Gaussian densityClustReference <- Map(referenceImplementation, dists, dcs) # convenient for debugging, but calling non-exported functions not allowed in CRAN # localDensityReference <- Map(reference_localDensity, dists, dcs) # # distanceToPeakReference <- Map(reference_distanceToPeak, dists, localDensityReference) # Gaussian gaussianDensityClustReference <- Map(referenceImplementation, dists, dcs, TRUE) # convenient for debugging, but calling non-exported functions not allowed in CRAN # gaussianLocalDensityReference <- Map(f = function(x, y) reference_localDensity(x, y, gaussian = TRUE), dists, estimateDcReference) densityClust/tests/testthat/testEquivalenceToReferenceImplementation.R0000644000176200001440000000550213173617741026276 0ustar liggesusersset.seed(123) dists <- list( dist(matrix(stats::rnorm(1000), ncol = 4)), dist(matrix(stats::rnorm(1000), ncol = 20)), dist(matrix(stats::rnorm(10000), ncol = 40)), dist(matrix(sample(1:100000, 1000), ncol = 4)), dist(matrix(sample(1:100000, 1000), ncol = 20)), dist(matrix(sample(1:100000, 1000), ncol = 50)) ) context("Reference implementation") # get dcs and reference targets source("generateReference.R") dcComparison <- simplify2array(Map(function(x, y) abs(1 - x / y) <= 0.15, dcs, referenceDcs)) test_that("Reference DCs and new DCs are within 15% of each other", { expect_true(all(dcComparison)) }) densityClustNewImp <- lapply(dists, densityClust) test_that("Test equivalence to reference implementation of densityClust", { expect_equal(densityClustReference, densityClustNewImp) }) # convenient for debugging, but calling non-exported functions not allowed in CRAN # localDensityNewImp <- Map(densityClust:::localDensity, dists, estimateDcNewImp) # test_that("Test equivalence to reference implementation of localDensity", { # expect_equal(localDensityReference, localDensityNewImp) # }) # # distanceToPeakNewImp <- Map(densityClust:::distanceToPeak, dists, localDensityNewImp) # test_that("Test equivalence to reference implementation of distanc eToPeak", { # expect_equal(distanceToPeakReference, distanceToPeakNewImp) # }) gaussianDensityClustNewImp <- lapply(dists, FUN = function(x) densityClust(x, gaussian = TRUE)) test_that("Test equivalence to reference implementation of gaussianDensityClust", { expect_equal(gaussianDensityClustReference, gaussianDensityClustNewImp) }) # convenient for debugging, but calling non-exported functions not allowed in CRAN # gaussianLocalDensityNewImp <- Map(f = function(x, y) densityClust:::localDensity(x, y, gaussian = TRUE), dists, estimateDcReference) # test_that("Test equivalence to reference implementation of localDensity", { # expect_equal(gaussianLocalDensityReference, gaussianLocalDensityNewImp) # }) #check the findDistValueByRowColInd return the correct index as desired: test_that("Test equivalence to reference implementation of gaussianDensityClust", { test <- dist(c(1:100)) test_mat <- as.matrix(test) cluster <- test_mat[, 1] newImp_res <- densityClust:::findDistValueByRowColInd(test, attr(test, 'Size'), which(cluster == 1), which(cluster != 1)) <= 4 oriImp_res <- as.vector(test_mat[cluster == 1, cluster != 1] <= 4) expect_equal(newImp_res, oriImp_res ) newImp_res <- densityClust:::findDistValueByRowColInd(test, attr(test, 'Size'), which(cluster == 4), which(cluster == 5)) oriImp_re <- as.vector(test_mat[cluster == 4, cluster == 5]) expect_equal(newImp_res, oriImp_re) dist_vals <- densityClust:::findDistValueByRowColInd(test, attr(test, 'Size'), 1:100, 1:100) expect_equal(dist_vals, as.vector(test_mat)) }) densityClust/src/0000755000176200001440000000000013173624332013537 5ustar liggesusersdensityClust/src/distanceToPeak.cpp0000644000176200001440000000216313173624332017143 0ustar liggesusers #include using namespace Rcpp; // [[Rcpp::export]] NumericVector distanceToPeakCpp(NumericVector distance, NumericVector rho) { int size = rho.size(); NumericVector peaks(size); NumericVector maximum(size); int i = 0; for (int col = 0; col < size; col++) { for (int row = col + 1; row < size; row++) { double newValue = distance[i]; double rhoRow = rho[row]; double rhoCol = rho[col]; if (rhoRow > rhoCol) { double peaksCol = peaks[col]; if (newValue < peaksCol || peaksCol == 0) { peaks[col] = newValue; } } else if (newValue > maximum[col]) { maximum[col] = newValue; } if (rhoCol > rhoRow) { double peaksRow = peaks[row]; if (newValue < peaksRow || peaksRow == 0) { peaks[row] = newValue; } } else if (newValue > maximum[row]) { maximum[row] = newValue; } i++; } } for (int j = 0; j < size; j++) { if (peaks[j] == 0) { peaks[j] = maximum[j]; } else { // do nothing, peaks is already min } } return peaks; } densityClust/src/findDistValueByRowColInd.cpp0000644000176200001440000001436013173624332021064 0ustar liggesusers#include using namespace Rcpp; // This is a simple example of exporting a C++ function to R. You can // source this function into an R session using the Rcpp::sourceCpp // function (or via the Source button on the editor toolbar). Learn // more about Rcpp at: // // http://www.rcpp.org/ // http://adv-r.had.co.nz/Rcpp.html // http://gallery.rcpp.org/ // // [[Rcpp::export]] NumericVector findDistValueByRowColInd(NumericVector distance, int num_row, NumericVector row_inds, NumericVector col_inds) { int row_inds_len = row_inds.size(); int col_inds_len = col_inds.size(); NumericVector res(row_inds_len * col_inds_len); // Rcout << "distance is " << distance << "col_inds index is " << col_inds << std::endl; // Rcout << "row_inds is " << row_inds << "col_inds index is " << col_inds << std::endl; // Rcout << "col_inds_len is " << col_inds_len << "col_inds_len index is " << col_inds_len << "res length is " << res.size() << std::endl; int i = 0; int dist_ind; for (int row = 0; row < row_inds_len; row++) { int row_ind = row_inds[row]; for (int col = 0; col < col_inds_len; col++) { int col_ind = col_inds[col]; // Rcout << "col is " << col << "row is " << row << std::endl; if(row_ind == col_ind){ res[i] = 0; } else{ int row_ind_new; int col_ind_new; if(col_ind > row_ind) { int row_ind_tmp = row_ind; int col_ind_tmp = col_ind; row_ind_new = col_ind_tmp; col_ind_new = row_ind_tmp; } else{ row_ind_new = row_ind; col_ind_new = col_ind; } dist_ind = num_row * (col_ind_new - 1) + row_ind_new - 0.5 * (1 + col_ind_new) * col_ind_new - 1; // if(row_ind == 3 && col_ind == 2){ // Rcout << "num_row * (col_ind_new - 1) is " << num_row * (col_ind_new - 1) << " 1/2 * (1 + col_ind_new) * col_ind_new is " << 0.5 * (1 + 1) * col_ind_new << " num_row is " << num_row << " dist_ind is " << num_row * (col_ind_new - 1) + row_ind_new - 1/2 * (1 + col_ind_new) * col_ind_new - 1 << std::endl; // Rcout << "col_ind_new is " << col_ind_new << " row_ind_new index is " << row_ind_new << " num_row is " << num_row << " dist_ind is " << dist_ind << " distance under the current index " << distance[dist_ind] << std::endl; // } res[i] = distance[dist_ind]; } i++; } } return res; } // You can include R code blocks in C++ files processed with sourceCpp // (useful for testing and development). The R code will be automatically // run after the compilation. // NumericVector all_finite(NumericVector x) { return x[x < R_PosInf]; } // [[Rcpp::export]] List smallest_dist_rho_order_coords(NumericVector ordered_rho, NumericVector ordered_coords) { int sample_size = ordered_rho.size(); int dim_num = ordered_coords.size() / sample_size; NumericVector smallest_dist(sample_size); NumericVector nearest_higher_density_sample(sample_size); // Rcout << "sample_size is " << sample_size << "dim_num is " << dim_num << std::endl; // Rcout << "ordered_coords is " << ordered_coords << std::endl; // Rcout << "smallest distances across cells are " << smallest_dist << std::endl; double current_dist; for (int cell_ind = 0; cell_ind < sample_size; cell_ind ++) { smallest_dist[cell_ind] = R_PosInf; nearest_higher_density_sample[cell_ind] = cell_ind; if(cell_ind == sample_size - 1) { // assign the last distance to the highest density peak cell NumericVector all_finite_vals(sample_size - 1); all_finite_vals = all_finite(smallest_dist); // Rcout << "all_finite_vals are " << all_finite_vals << std::endl; NumericVector::iterator maximal = std::max_element(all_finite_vals.begin(), all_finite_vals.end()); // Rcout << "maximal index is " << all_finite_vals[maximal - all_finite_vals.begin()] << std::endl; smallest_dist[cell_ind] = all_finite_vals[maximal - all_finite_vals.begin()]; nearest_higher_density_sample[cell_ind] = maximal - all_finite_vals.begin(); } for (int higher_local_density_cell_ind = cell_ind + 1; higher_local_density_cell_ind < sample_size; higher_local_density_cell_ind ++) { // Rcout << "current cell ind is " << cell_ind << "current cell ind with higher density is " << higher_local_density_cell_ind << std::endl; NumericVector source_coord(dim_num); NumericVector target_coord(dim_num); current_dist = 0; double tmp; for(int dim_num_tmp = 0; dim_num_tmp < dim_num; dim_num_tmp ++) { source_coord[dim_num_tmp] = ordered_coords[cell_ind + dim_num_tmp * sample_size]; target_coord[dim_num_tmp] = ordered_coords[higher_local_density_cell_ind + dim_num_tmp * sample_size]; tmp = source_coord[dim_num_tmp] - target_coord[dim_num_tmp]; // tmp = (ordered_coords[cell_ind + 1 + cell_ind * sample_size] - ordered_coords[higher_local_density_cell_ind + 1 + higher_local_density_cell_ind * sample_size]); current_dist += tmp * tmp; } current_dist = sqrt(current_dist); // Rcout << "current source cell coord is " << source_coord << "current target cell coord is " << target_coord << std::endl; // Rcout << "current_dist is " << current_dist << std::endl; if(smallest_dist[cell_ind] > current_dist) { smallest_dist[cell_ind] = current_dist; nearest_higher_density_sample[cell_ind] = higher_local_density_cell_ind; } } } // Rcout << "smallest distances across cells are " << smallest_dist << std::endl; // Rcout << "smallest distances across cells are " << nearest_higher_density_sample << std::endl; // Rcout << "smallest distances across cells are " << current_dist << std::endl; // return List::create(Named("smallest_dist") = smallest_dist, // Named("nearest_higher_density_sample") = nearest_higher_density_sample, // Named("current_dist") = current_dist); return List::create(Named("smallest_dist") = smallest_dist, Named("nearest_higher_density_sample") = nearest_higher_density_sample); } /*** R findDistValueByRowColInd(test, attr(test, 'Size'), 1:100, 1:100) smallest_dist_rho_order_coords(rho, master_tsne$Y) */ densityClust/src/localDensity.cpp0000644000176200001440000000306213173624332016676 0ustar liggesusers#include using namespace Rcpp; // [[Rcpp::export]] NumericVector gaussianLocalDensity(NumericVector distance, int nrow, double dc) { int size = distance.size(); NumericVector half(size); for (int i = 0; i < size; i++) { double combOver = distance[i] / dc; double negSq = pow(combOver, 2) * -1; half[i] = exp(negSq); } int ncol = nrow; NumericVector result(nrow); int i = 0; for (int col = 0; col < ncol; col++) { for (int row = col + 1; row < nrow; row++) { double temp = half[i]; result[row] += temp; result[col] += temp; i++; } } return result; } // [[Rcpp::export]] NumericVector nonGaussianLocalDensity(NumericVector distance, int nrow, double dc) { int ncol = nrow; NumericVector result(nrow); int i = 0; for (int col = 0; col < ncol; col++) { for (int row = col + 1; row < nrow; row++) { if((i % 10000) == 0){ // if(verbose){ // Rcout << "index is " << i << " distance under the current index " << distance[i] << std::endl; // } } if(i > distance.size()){ // Rcout << "Warning: index is larger than the length of the distance vector" << distance[i] << std::endl; } if (distance[i] < dc) { result[row] += 1; result[col] += 1; } else { // do nothing } i++; } } // if(verbose){ // Rcout << "last index is " << i << " length of distance is " << distance.size() << "number of rows is " << nrow << "number of columns is " << ncol << std::endl; // } return result; } densityClust/src/RcppExports.cpp0000644000176200001440000000776713173624332016555 0ustar liggesusers// Generated by using Rcpp::compileAttributes() -> do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #include using namespace Rcpp; // distanceToPeakCpp NumericVector distanceToPeakCpp(NumericVector distance, NumericVector rho); RcppExport SEXP densityClust_distanceToPeakCpp(SEXP distanceSEXP, SEXP rhoSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericVector >::type distance(distanceSEXP); Rcpp::traits::input_parameter< NumericVector >::type rho(rhoSEXP); rcpp_result_gen = Rcpp::wrap(distanceToPeakCpp(distance, rho)); return rcpp_result_gen; END_RCPP } // findDistValueByRowColInd NumericVector findDistValueByRowColInd(NumericVector distance, int num_row, NumericVector row_inds, NumericVector col_inds); RcppExport SEXP densityClust_findDistValueByRowColInd(SEXP distanceSEXP, SEXP num_rowSEXP, SEXP row_indsSEXP, SEXP col_indsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericVector >::type distance(distanceSEXP); Rcpp::traits::input_parameter< int >::type num_row(num_rowSEXP); Rcpp::traits::input_parameter< NumericVector >::type row_inds(row_indsSEXP); Rcpp::traits::input_parameter< NumericVector >::type col_inds(col_indsSEXP); rcpp_result_gen = Rcpp::wrap(findDistValueByRowColInd(distance, num_row, row_inds, col_inds)); return rcpp_result_gen; END_RCPP } // smallest_dist_rho_order_coords List smallest_dist_rho_order_coords(NumericVector ordered_rho, NumericVector ordered_coords); RcppExport SEXP densityClust_smallest_dist_rho_order_coords(SEXP ordered_rhoSEXP, SEXP ordered_coordsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericVector >::type ordered_rho(ordered_rhoSEXP); Rcpp::traits::input_parameter< NumericVector >::type ordered_coords(ordered_coordsSEXP); rcpp_result_gen = Rcpp::wrap(smallest_dist_rho_order_coords(ordered_rho, ordered_coords)); return rcpp_result_gen; END_RCPP } // gaussianLocalDensity NumericVector gaussianLocalDensity(NumericVector distance, int nrow, double dc); RcppExport SEXP densityClust_gaussianLocalDensity(SEXP distanceSEXP, SEXP nrowSEXP, SEXP dcSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericVector >::type distance(distanceSEXP); Rcpp::traits::input_parameter< int >::type nrow(nrowSEXP); Rcpp::traits::input_parameter< double >::type dc(dcSEXP); rcpp_result_gen = Rcpp::wrap(gaussianLocalDensity(distance, nrow, dc)); return rcpp_result_gen; END_RCPP } // nonGaussianLocalDensity NumericVector nonGaussianLocalDensity(NumericVector distance, int nrow, double dc); RcppExport SEXP densityClust_nonGaussianLocalDensity(SEXP distanceSEXP, SEXP nrowSEXP, SEXP dcSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericVector >::type distance(distanceSEXP); Rcpp::traits::input_parameter< int >::type nrow(nrowSEXP); Rcpp::traits::input_parameter< double >::type dc(dcSEXP); rcpp_result_gen = Rcpp::wrap(nonGaussianLocalDensity(distance, nrow, dc)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { {"densityClust_distanceToPeakCpp", (DL_FUNC) &densityClust_distanceToPeakCpp, 2}, {"densityClust_findDistValueByRowColInd", (DL_FUNC) &densityClust_findDistValueByRowColInd, 4}, {"densityClust_smallest_dist_rho_order_coords", (DL_FUNC) &densityClust_smallest_dist_rho_order_coords, 2}, {"densityClust_gaussianLocalDensity", (DL_FUNC) &densityClust_gaussianLocalDensity, 3}, {"densityClust_nonGaussianLocalDensity", (DL_FUNC) &densityClust_nonGaussianLocalDensity, 3}, {NULL, NULL, 0} }; RcppExport void R_init_densityClust(DllInfo *dll) { R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); } densityClust/NAMESPACE0000644000176200001440000000230513173617741014175 0ustar liggesusers# Generated by roxygen2: do not edit by hand S3method(clustered,densityCluster) S3method(clusters,densityCluster) S3method(findClusters,densityCluster) S3method(labels,densityCluster) S3method(plot,densityCluster) S3method(plotMDS,densityCluster) S3method(plotTSNE,densityCluster) S3method(print,densityCluster) export(clustered) export(clusters) export(densityClust) export(estimateDc) export(findClusters) export(plotDensityClust) export(plotMDS) export(plotTSNE) importFrom(FNN,get.knn) importFrom(RColorBrewer,brewer.pal) importFrom(Rcpp,sourceCpp) importFrom(Rtsne,Rtsne) importFrom(ggplot2,aes_string) importFrom(ggplot2,geom_label) importFrom(ggplot2,geom_line) importFrom(ggplot2,geom_point) importFrom(ggplot2,geom_segment) importFrom(ggplot2,geom_text) importFrom(ggplot2,ggplot) importFrom(ggplot2,labs) importFrom(ggplot2,scale_color_manual) importFrom(ggplot2,theme) importFrom(ggplot2,theme_bw) importFrom(ggrepel,geom_label_repel) importFrom(grDevices,rainbow) importFrom(graphics,legend) importFrom(graphics,locator) importFrom(graphics,plot) importFrom(graphics,points) importFrom(gridExtra,grid.arrange) importFrom(stats,cmdscale) importFrom(stats,dist) importFrom(stats,rnorm) useDynLib(densityClust) densityClust/R/0000755000176200001440000000000013173617741013157 5ustar liggesusersdensityClust/R/densityClust.R0000644000176200001440000006443513173617741016010 0ustar liggesusers#' Clustering by fast search and find of density peaks #' #' This package implement the clustering algorithm described by Alex Rodriguez #' and Alessandro Laio (2014). It provides the user with tools for generating #' the initial rho and delta values for each observation as well as using these #' to assign observations to clusters. This is done in two passes so the user is #' free to reassign observations to clusters using a new set of rho and delta #' thresholds, without needing to recalculate everything. #' #' @section Plotting: #' Two types of plots are supported by this package, and both mimics the types of #' plots used in the publication for the algorithm. The standard plot function #' produces a decision plot, with optional colouring of cluster peaks if these #' are assigned. Furthermore [plotMDS()] performs a multidimensional #' scaling of the distance matrix and plots this as a scatterplot. If clusters #' are assigned observations are coloured according to their assignment. #' #' @section Cluster detection: #' The two main functions for this package are [densityClust()] and #' [findClusters()]. The former takes a distance matrix and optionally #' a distance cutoff and calculates rho and delta for each observation. The #' latter takes the output of [densityClust()] and make cluster #' assignment for each observation based on a user defined rho and delta #' threshold. If the thresholds are not specified the user is able to supply #' them interactively by clicking on a decision plot. #' #' @examples #' irisDist <- dist(iris[,1:4]) #' irisClust <- densityClust(irisDist, gaussian=TRUE) #' plot(irisClust) # Inspect clustering attributes to define thresholds #' #' irisClust <- findClusters(irisClust, rho=2, delta=2) #' plotMDS(irisClust) #' split(iris[,5], irisClust$clusters) #' #' @seealso [densityClust()], [findClusters()], [plotMDS()] #' @aliases NULL #' #' @references Rodriguez, A., & Laio, A. (2014). *Clustering by fast search and find of density peaks.* Science, **344**(6191), 1492-1496. doi:10.1126/science.1242072 #' #' @useDynLib densityClust #' @importFrom Rcpp sourceCpp #' '_PACKAGE' #' Computes the local density of points in a distance matrix #' #' This function takes a distance matrix and a distance cutoff and calculate the #' local density for each point in the matrix. The computation can either be #' done using a simple summation of the points with the distance cutoff for each #' observation, or by applying a gaussian kernel scaled by the distance cutoff #' (more robust for low-density data) #' #' @param distance A distance matrix #' #' @param dc A numeric value specifying the distance cutoff #' #' @param gaussian Logical. Should a gaussian kernel be used to estimate the #' density (defaults to `FALSE`) #' #' @return A vector of local density values, the index matching row and column #' indexes in the distance matrix #' #' @noRd #' localDensity <- function(distance, dc, gaussian = FALSE) { # These implementations are faster by virtue of being written in C++ # They also avoid the need to convert `distance` to a matrix. if (gaussian) { res <- gaussianLocalDensity(distance, attr(distance, "Size"), dc) } else { res <- nonGaussianLocalDensity(distance, attr(distance, "Size"), dc) } if (is.null(attr(distance, 'Labels'))) { names(res) <- NULL } else { names(res) <- attr(distance, 'Labels') } res } #' Calculate distance to closest observation of higher density #' #' This function finds, for each observation, the minimum distance to an #' observation of higher local density. #' #' @param distance A distance matrix #' #' @param rho A vector of local density values as outputted by [localDensity()] #' #' @return A vector of distances with index matching the index in rho #' #' @noRd #' distanceToPeak <- function(distance, rho) { # This implementation is faster by virtue of being written in C++. # It also avoids the need to convert `distance` to a matrix. res <- distanceToPeakCpp(distance, rho); names(res) <- names(rho) res } #' Estimate the distance cutoff for a specified neighbor rate #' #' This function calculates a distance cutoff value for a specific distance #' matrix that makes the average neighbor rate (number of points within the #' distance cutoff value) fall between the provided range. The authors of the #' algorithm suggests aiming for a neighbor rate between 1 and 2 percent, but #' also states that the algorithm is quite robust with regards to more extreme #' cases. #' #' @note If the number of points is larger than 448 (resulting in 100,128 #' pairwise distances), 100,128 distance pairs will be randomly selected to #' speed up computation time. Use [set.seed()] prior to calling #' `estimateDc` in order to ensure reproducable results. #' #' @param distance A distance matrix #' #' @param neighborRateLow The lower bound of the neighbor rate #' #' @param neighborRateHigh The upper bound of the neighbor rate #' #' @return A numeric value giving the estimated distance cutoff value #' #' @examples #' irisDist <- dist(iris[,1:4]) #' estimateDc(irisDist) #' #' @references Rodriguez, A., & Laio, A. (2014). *Clustering by fast search and find of density peaks.* Science, **344**(6191), 1492-1496. doi:10.1126/science.1242072 #' #' @export #' estimateDc <- function(distance, neighborRateLow = 0.01, neighborRateHigh = 0.02) { # This implementation uses binary search instead of linear search. size <- attr(distance, 'Size') # If size is greater than 448, there will be >100000 elements in the distance # object. Subsampling to 100000 elements will speed performance for very # large dist objects while retaining good accuracy in estimating the cutoff if (size > 448) { distance <- distance[sample.int(length(distance), 100128)] size <- 448 } low <- min(distance) high <- max(distance) dc <- 0 while (TRUE) { dc <- (low + high) / 2 # neighborRate = average of number of elements of comb per row that are # less than dc minus 1 divided by size. # This implementation avoids converting `distance` to a matrix. The matrix is # symmetrical, so doubling the result from `distance` (half of the matrix) is # equivalent. The diagonal of the matrix will always be 0, so as long as dc # is greater than 0, we add 1 for every element of the diagonal, which is # the same as size neighborRate <- (((sum(distance < dc) * 2 + (if (0 <= dc) size)) / size - 1)) / size if (neighborRate >= neighborRateLow && neighborRate <= neighborRateHigh) break if (neighborRate < neighborRateLow) { low <- dc } else { high <- dc } } cat('Distance cutoff calculated to', dc, '\n') dc } #' Calculate clustering attributes based on the densityClust algorithm #' #' This function takes a distance matrix and optionally a distance cutoff and #' calculates the values necessary for clustering based on the algorithm #' proposed by Alex Rodrigues and Alessandro Laio (see references). The actual #' assignment to clusters are done in a later step, based on user defined #' threshold values. If a distance matrix is passed into `distance` the #' original algorithm described in the paper is used. If a matrix or data.frame #' is passed instead it is interpretted as point coordinates and rho will be #' estimated based on k-nearest neighbors of each point (rho is estimated as #' `exp(-mean(x))` where `x` is the distance to the nearest #' neighbors). This can be useful when data is so large that calculating the #' full distance matrix can be prohibitive. #' #' @details #' The function calculates rho and delta for the observations in the provided #' distance matrix. If a distance cutoff is not provided this is first estimated #' using [estimateDc()] with default values. #' #' The information kept in the densityCluster object is: #' \describe{ #' \item{`rho`}{A vector of local density values} #' \item{`delta`}{A vector of minimum distances to observations of higher density} #' \item{`distance`}{The initial distance matrix} #' \item{`dc`}{The distance cutoff used to calculate rho} #' \item{`threshold`}{A named vector specifying the threshold values for rho and delta used for cluster detection} #' \item{`peaks`}{A vector of indexes specifying the cluster center for each cluster} #' \item{`clusters`}{A vector of cluster affiliations for each observation. The clusters are referenced as indexes in the peaks vector} #' \item{`halo`}{A logical vector specifying for each observation if it is considered part of the halo} #' \item{`knn_graph`}{kNN graph constructed. It is only applicable to the case where coordinates are used as input. Currently it is set as NA.} #' \item{`nearest_higher_density_neighbor`}{index for the nearest sample with higher density. It is only applicable to the case where coordinates are used as input.} #' \item{`nn.index`}{indices for each cell's k-nearest neighbors. It is only applicable for the case where coordinates are used as input.} #' \item{`nn.dist`}{distance to each cell's k-nearest neighbors. It is only applicable for the case where coordinates are used as input.} #' } #' Before running findClusters the threshold, peaks, clusters and halo data is #' `NA`. #' #' @param distance A distance matrix or a matrix (or data.frame) for the #' coordinates of the data. If a matrix or data.frame is used the distances and #' local density will be estimated using a fast k-nearest neighbor approach. #' #' @param dc A distance cutoff for calculating the local density. If missing it #' will be estimated with `estimateDc(distance)` #' #' @param gaussian Logical. Should a gaussian kernel be used to estimate the #' density (defaults to FALSE) #' #' @param verbose Logical. Should the running details be reported #' #' @param ... Additional parameters passed on to [get.knn][FNN::get.knn] #' #' @return A densityCluster object. See details for a description. #' #' @examples #' irisDist <- dist(iris[,1:4]) #' irisClust <- densityClust(irisDist, gaussian=TRUE) #' plot(irisClust) # Inspect clustering attributes to define thresholds #' #' irisClust <- findClusters(irisClust, rho=2, delta=2) #' plotMDS(irisClust) #' split(iris[,5], irisClust$clusters) #' #' @seealso [estimateDc()], [findClusters()] #' #' @references Rodriguez, A., & Laio, A. (2014). *Clustering by fast search and find of density peaks.* Science, **344**(6191), 1492-1496. doi:10.1126/science.1242072 #' #' @export #' densityClust <- function(distance, dc, gaussian=FALSE, verbose = FALSE, ...) { if (class(distance) %in% c('data.frame', 'matrix')) { dp_knn_args <- list(mat = distance, verbose = verbose, ...) res <- do.call(densityClust.knn, dp_knn_args) } else { if (missing(dc)) { if (verbose) message('Calculating the distance cutoff') dc <- estimateDc(distance) } if (verbose) message('Calculating the local density for each sample based on distance cutoff') rho <- localDensity(distance, dc, gaussian = gaussian) if (verbose) message('Calculating the minimal distance of a sample to another sample with higher density') delta <- distanceToPeak(distance, rho) if (verbose) message('Returning result...') res <- list( rho = rho, delta = delta, distance = distance, dc = dc, threshold = c(rho = NA, delta = NA), peaks = NA, clusters = NA, halo = NA, knn_graph = NA, nearest_higher_density_neighbor = NA, nn.index = NA, nn.dist = NA ) class(res) <- 'densityCluster' } res } #' @export #' @importFrom graphics plot points #' plot.densityCluster <- function(x, ...) { plot(x$rho, x$delta, main = 'Decision graph', xlab = expression(rho), ylab = expression(delta)) if (!is.na(x$peaks[1])) { points(x$rho[x$peaks], x$delta[x$peaks], col = 2:(1 + length(x$peaks)), pch = 19) } } #' Plot observations using multidimensional scaling and colour by cluster #' #' This function produces an MDS scatterplot based on the distance matrix of the #' densityCluster object (if there is only the coordinates information, a distance #' matrix will be calculate first), and, if clusters are defined, colours each #' observation according to cluster affiliation. Observations belonging to a cluster #' core is plotted with filled circles and observations belonging to the halo with #' hollow circles. This plotting is not suitable for running large datasets (for example #' datasets with > 1000 samples). Users are suggested to use other methods, for example #' tSNE, etc. to visualize their clustering results too. #' #' @param x A densityCluster object as produced by [densityClust()] #' #' @param ... Additional parameters. Currently ignored #' #' @examples #' irisDist <- dist(iris[,1:4]) #' irisClust <- densityClust(irisDist, gaussian=TRUE) #' plot(irisClust) # Inspect clustering attributes to define thresholds #' #' irisClust <- findClusters(irisClust, rho=2, delta=2) #' plotMDS(irisClust) #' split(iris[,5], irisClust$clusters) #' #' @seealso [densityClust()] for creating `densityCluster` #' objects, and [plotTSNE()] for an alternative plotting approach. #' #' @export #' plotMDS <- function(x, ...) { UseMethod('plotMDS') } #' @export #' @importFrom stats cmdscale #' @importFrom graphics plot points legend #' @importFrom stats dist plotMDS.densityCluster <- function(x, ...) { if (class(x$distance) %in% c('data.frame', 'matrix')) { mds <- cmdscale(dist(x$distance)) } else { mds <- cmdscale(x$distance) } plot(mds[,1], mds[,2], xlab = '', ylab = '', main = 'MDS plot of observations') if (!is.na(x$peaks[1])) { for (i in 1:length(x$peaks)) { ind <- which(x$clusters == i) points(mds[ind, 1], mds[ind, 2], col = i + 1, pch = ifelse(x$halo[ind], 1, 19)) } legend('topright', legend = c('core', 'halo'), pch = c(19, 1), horiz = TRUE) } } #' Plot observations using t-distributed neighbor embedding and colour by cluster #' #' This function produces an t-SNE scatterplot based on the distance matrix of the #' densityCluster object (if there is only the coordinates information, a distance #' matrix will be calculate first), and, if clusters are defined, colours each #' observation according to cluster affiliation. Observations belonging to a cluster #' core is plotted with filled circles and observations belonging to the halo with #' hollow circles. #' #' @param x A densityCluster object as produced by [densityClust()] #' #' @param ... Additional parameters. Currently ignored #' #' @examples #' irisDist <- dist(iris[,1:4]) #' irisClust <- densityClust(irisDist, gaussian=TRUE) #' plot(irisClust) # Inspect clustering attributes to define thresholds #' #' irisClust <- findClusters(irisClust, rho=2, delta=2) #' plotTSNE(irisClust) #' split(iris[,5], irisClust$clusters) #' #' @seealso [densityClust()] for creating `densityCluster` #' objects, and [plotMDS()] for an alternative plotting approach. #' #' @export #' plotTSNE <- function(x, ...) { UseMethod('plotTSNE') } #' @export #' @importFrom graphics plot points legend #' @importFrom stats dist #' @importFrom stats rnorm #' @importFrom Rtsne Rtsne plotTSNE.densityCluster <- function(x, max_components = 2, ...) { if (class(x$distance) %in% c('data.frame', 'matrix')) { data <- as.matrix(dist(x$distance)) } else { data <- as.matrix(x$distance) } # avoid issues related to repetitions dup_id <- which(duplicated(data)) if (length(dup_id) > 0) { data[dup_id, ] <- data[dup_id, ] + rnorm(length(dup_id) * ncol(data), sd = 1e-10) } tsne_res <- Rtsne::Rtsne(as.matrix(data), dims = max_components, pca = T) tsne_data <- tsne_res$Y[, 1:max_components] plot(tsne_data[,1], tsne_data[,2], xlab = '', ylab = '', main = 'tSNE plot of observations') if (!is.na(x$peaks[1])) { for (i in 1:length(x$peaks)) { ind <- which(x$clusters == i) points(tsne_data[ind, 1], tsne_data[ind, 2], col = i + 1, pch = ifelse(x$halo[ind], 1, 19)) } legend('topright', legend = c('core', 'halo'), pch = c(19, 1), horiz = TRUE) } } #' @export #' print.densityCluster <- function(x, ...) { if (is.na(x$peaks[1])) { cat('A densityCluster object with no clusters defined\n\n') cat('Number of observations:', length(x$rho), '\n') } else { cat('A densityCluster object with', length(x$peaks), 'clusters defined\n\n') cat('Number of observations:', length(x$rho), '\n') cat('Observations in core: ', sum(!x$halo), '\n\n') cat('Parameters:\n') cat('dc (distance cutoff) rho threshold delta threshold\n') cat(formatC(x$dc, width = -22), formatC(x$threshold[1], width = -22), x$threshold[2]) } } #' Detect clusters in a densityCluster obejct #' #' This function uses the supplied rho and delta thresholds to detect cluster #' peaks and assign the rest of the observations to one of these clusters. #' Furthermore core/halo status is calculated. If either rho or delta threshold #' is missing the user is presented with a decision plot where they are able to #' click on the plot area to set the treshold. If either rho or delta is set, #' this takes presedence over the value found by clicking. #' #' @param x A densityCluster object as produced by [densityClust()] #' #' @param ... Additional parameters passed on #' #' @return A densityCluster object with clusters assigned to all observations #' #' @examples #' irisDist <- dist(iris[,1:4]) #' irisClust <- densityClust(irisDist, gaussian=TRUE) #' plot(irisClust) # Inspect clustering attributes to define thresholds #' #' irisClust <- findClusters(irisClust, rho=2, delta=2) #' plotMDS(irisClust) #' split(iris[,5], irisClust$clusters) #' #' @references Rodriguez, A., & Laio, A. (2014). *Clustering by fast search and find of density peaks.* Science, **344**(6191), 1492-1496. doi:10.1126/science.1242072 #' #' @export #' findClusters <- function(x, ...) { UseMethod("findClusters") } #' @rdname findClusters #' #' @param rho The threshold for local density when detecting cluster peaks #' #' @param delta The threshold for minimum distance to higher density when detecting cluster peaks #' #' @param plot Logical. Should a decision plot be shown after cluster detection #' #' @param peaks A numeric vector indicates the index of density peaks used for clustering. This vector should be retrieved from the decision plot with caution. No checking involved. #' #' @param verbose Logical. Should the running details be reported #' #' @export #' @importFrom graphics plot locator findClusters.densityCluster <- function(x, rho, delta, plot = FALSE, peaks = NULL, verbose = FALSE, ...) { if (class(x$distance) %in% c('data.frame', 'matrix')) { peak_ind <- which(x$rho > rho & x$delta > delta) x$peaks <- peak_ind # Assign observations to clusters runOrder <- order(x$rho, decreasing = TRUE) cluster <- rep(NA, length(x$rho)) for (i in x$peaks) { cluster[i] <- match(i, x$peaks) } for (ind in setdiff(runOrder, x$peaks)) { target_lower_density_samples <- which(x$nearest_higher_density_neighbor == ind) #all the target cells should have the same cluster id as current higher density cell cluster[ind] <- cluster[x$nearest_higher_density_neighbor[ind]] } potential_duplicates <- which(is.na(cluster)) for (ind in potential_duplicates) { res <- as.integer(names(which.max(table(cluster[x$nn.index[ind, ]])))) if (length(res) > 0) { cluster[ind] <- res #assign NA samples to the majority of its clusters } else { message('try to increase the number of kNN (through argument k) at step of densityClust.') cluster[ind] <- NA } } x$clusters <- factor(cluster) # Calculate core/halo status of observation border <- rep(0, length(x$peaks)) if (verbose) message('Identifying core and halo for each cluster') for (i in 1:length(x$peaks)) { if (verbose) message('the current index of the peak is ', i) connect_samples_ind <- intersect(unique(x$nn.index[cluster == i, ]), which(cluster != i)) averageRho <- outer(x$rho[cluster == i], x$rho[connect_samples_ind], '+') / 2 if (any(connect_samples_ind)) border[i] <- max(averageRho[connect_samples_ind]) } x$halo <- x$rho < border[cluster] x$threshold['rho'] <- rho x$threshold['delta'] <- delta } else { # Detect cluster peaks if (!is.null(peaks)) { if (verbose) message('peaks are provided, clustering will be performed based on them') x$peaks <- peaks } else { if (missing(rho) || missing(delta)) { x$peaks <- NA plot(x) cat('Click on plot to select thresholds\n') threshold <- locator(1) if (missing(rho)) rho <- threshold$x if (missing(delta)) delta <- threshold$y plot = TRUE } x$peaks <- which(x$rho > rho & x$delta > delta) x$threshold['rho'] <- rho x$threshold['delta'] <- delta } if (plot) { plot(x) } # Assign observations to clusters runOrder <- order(x$rho, decreasing = TRUE) cluster <- rep(NA, length(x$rho)) if (verbose) message('Assigning each sample to a cluster based on its nearest density peak') for (i in runOrder) { if ((i %% round(length(runOrder) / 25)) == 0) { if (verbose) message(paste('the runOrder index is', i)) } if (i %in% x$peaks) { cluster[i] <- match(i, x$peaks) } else { higherDensity <- which(x$rho > x$rho[i]) cluster[i] <- cluster[higherDensity[which.min(findDistValueByRowColInd(x$distance, attr(x$distance, 'Size'), i, higherDensity))]] } } x$clusters <- cluster # Calculate core/halo status of observation border <- rep(0, length(x$peaks)) if (verbose) message('Identifying core and halo for each cluster') for (i in 1:length(x$peaks)) { if (verbose) message('the current index of the peak is ', i) averageRho <- outer(x$rho[cluster == i], x$rho[cluster != i], '+')/2 index <- findDistValueByRowColInd(x$distance, attr(x$distance, 'Size'), which(cluster == i), which(cluster != i)) <= x$dc if (any(index)) border[i] <- max(averageRho[index]) } x$halo <- x$rho < border[cluster] } x$halo <- x$rho < border[cluster] # Sort cluster designations by gamma (= rho * delta) gamma <- x$rho * x$delta pk.ordr <- order(gamma[x$peaks], decreasing = TRUE) x$peaks <- x$peaks[pk.ordr] x$clusters <- match(x$clusters, pk.ordr) x } #' Extract cluster membership from a densityCluster object #' #' This function allows the user to extract the cluster membership of all the #' observations in the given densityCluster object. The output can be formatted #' in two ways as described below. Halo observations can be chosen to be removed #' from the output. #' #' @details #' Two formats for the output are available. Either a vector of integers #' denoting for each observation, which cluster the observation belongs to. If #' halo observations are removed, these are set to NA. The second format is a #' list with a vector for each group containing the index for the member #' observations in the group. If halo observations are removed their indexes are #' omitted. The list format correspond to the following transform of the vector #' format `split(1:length(clusters), clusters)`, where `clusters` are #' the cluster information in vector format. #' #' @param x The densityCluster object. [findClusters()] must have #' been performed prior to this call to avoid throwing an error. #' #' @param ... Currently ignored #' #' @return A vector or list with cluster memberships for the observations in the #' initial distance matrix #' #' @export #' clusters <- function(x, ...) { UseMethod("clusters") } #' @rdname clusters #' #' @param as.list Should the output be in the list format. Defaults to FALSE #' #' @param halo.rm Logical. should halo observations be removed. Defaults to TRUE #' #' @export #' clusters.densityCluster <- function(x, as.list = FALSE, halo.rm = TRUE, ...) { if (!clustered(x)) stop('x must be clustered prior to cluster extraction') res <- x$clusters if (halo.rm) { res[x$halo] <- NA } if (as.list) { res <- split(1:length(res), res) } res } #' Check whether a densityCluster object have been clustered #' #' This function checks whether [findClusters()] has been performed on #' the given object and returns a boolean depending on the outcome #' #' @param x A densityCluster object #' #' @return `TRUE` if [findClusters()] have been performed, otherwise #' `FALSE` #' #' @export #' clustered <- function(x) { UseMethod("clustered") } #' @rdname clustered #' #' @export #' clustered.densityCluster <- function(x) { !any(is.na(x$peaks[1]), is.na(x$clusters[1]), is.na(x$halo[1])) } #' Extract labels #' #' @noRd #' #' @export #' labels.densityCluster <- function(object, ...) { labels(object$distance) } #' Fast knn version of densityClust #' #' This function will be called by densityClust if a matrix or data.frame is #' passed in rather than a distance object #' #' @noRd #' #' @importFrom FNN get.knn densityClust.knn <- function(mat, k = NULL, verbose = F, ...) { if (is.null(k)) { k <- round(sqrt(nrow(mat)) / 2) # empirical way to select the number of neighbor points k <- max(10, k) # ensure k is at least 10 } if (verbose) message('Finding kNN using FNN with ', k, ' neighbors') dx <- get.knn(mat, k = k, ...) nn.index <- dx$nn.index nn.dist <- dx$nn.dist N <- nrow(nn.index) knn_graph <- NULL if (verbose) message('Calculating the local density for each sample based on kNNs ...') rho <- apply(nn.dist, 1, function(x) { exp(-mean(x)) }) if (verbose) message('Calculating the minimal distance of a sample to another sample with higher density ...') rho_order <- order(rho) delta <- vector(mode = 'integer', length = N) nearest_higher_density_neighbor <- vector(mode = 'integer', length = N) delta_neighbor_tmp <- smallest_dist_rho_order_coords(rho[rho_order], as.matrix(mat[rho_order, ])) delta[rho_order] <- delta_neighbor_tmp$smallest_dist nearest_higher_density_neighbor[rho_order] <- rho_order[delta_neighbor_tmp$nearest_higher_density_sample + 1] if (verbose) message('Returning result...') res <- list( rho = rho, delta = delta, distance = mat, dc = NULL, threshold = c(rho = NA, delta = NA), peaks = NA, clusters = NA, halo = NA, knn_graph = knn_graph, nearest_higher_density_neighbor = nearest_higher_density_neighbor, nn.index = nn.index, nn.dist = nn.dist ) class(res) <- 'densityCluster' res } densityClust/R/RcppExports.R0000644000176200001440000000167413173620001015562 0ustar liggesusers# Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 distanceToPeakCpp <- function(distance, rho) { .Call('densityClust_distanceToPeakCpp', PACKAGE = 'densityClust', distance, rho) } findDistValueByRowColInd <- function(distance, num_row, row_inds, col_inds) { .Call('densityClust_findDistValueByRowColInd', PACKAGE = 'densityClust', distance, num_row, row_inds, col_inds) } smallest_dist_rho_order_coords <- function(ordered_rho, ordered_coords) { .Call('densityClust_smallest_dist_rho_order_coords', PACKAGE = 'densityClust', ordered_rho, ordered_coords) } gaussianLocalDensity <- function(distance, nrow, dc) { .Call('densityClust_gaussianLocalDensity', PACKAGE = 'densityClust', distance, nrow, dc) } nonGaussianLocalDensity <- function(distance, nrow, dc) { .Call('densityClust_nonGaussianLocalDensity', PACKAGE = 'densityClust', distance, nrow, dc) } densityClust/R/plotDensityClust.R0000644000176200001440000001512713173617741016641 0ustar liggesusers#' @name plotDensityClust #' @title Plot densityCluster results #' @description Generate a single panel of up to three diagnostic plots for a #' \code{densityClust} object. #' #' @param x A densityCluster object as produced by \code{\link{densityClust}} #' @param type A character vector designating which figures to produce. Valid #' options include \code{"dg"} for a decision graph of \eqn{\delta} vs. #' \eqn{\rho}, \code{"gg"} for a gamma graph depicting the decrease of #' \eqn{\gamma} (= \eqn{\delta} * \eqn{\rho}) across samples, and \code{"mds"}, #' for a Multi-Dimensional Scaling (MDS) plot of observations. Any combination #' of these three can be included in the vector, or to produce all plots, #' specify \code{type = "all"}. #' @param n Number of observations to plot in the gamma graph. #' @param mds A matrix of scores for observations from a Principal Components #' Analysis or MDS. If omitted, and a MDS plot has been requested, one will #' be calculated. #' @param dim.x,dim.y The numbers of the dimensions to plot on the x and y #' axes of the MDS plot. #' @param col Vector of colors for clusters. #' @param alpha Value in \code{0:1} controlling transparency of points in the #' decision graph and MDS plot. #' #' @return A panel of the figures specified in \code{type} are produced. #' If designated, clusters are color-coded and labelled. If present in #' \code{x}, the rho and delta thresholds are designated in the #' decision graph by a set of solid black lines. #' #' @author Eric Archer \email{eric.archer@@noaa.gov} #' #' @examples #' data(iris) #' data.dist <- dist(iris[, 1:4]) #' pca <- princomp(iris[, 1:4]) #' #' # Run initial density clustering #' dens.clust <- densityClust(data.dist) # #' op <- par(ask = TRUE) #' #' # Show the decision graph #' plotDensityClust(dens.clust, type = "dg") #' #' # Show the decision graph and the gamma graph #' plotDensityClust(dens.clust, type = c("dg", "gg")) #' #' # Cluster based on rho and delta #' new.clust <- findClusters(dens.clust, rho = 4, delta = 2) #' #' # Show all graphs with clustering #' plotDensityClust(new.clust, mds = pca$scores) #' #' par(op) #' #' @importFrom RColorBrewer brewer.pal #' @importFrom ggplot2 ggplot aes_string geom_text geom_point geom_segment labs #' theme_bw theme scale_color_manual geom_line geom_label #' @importFrom ggrepel geom_label_repel #' @importFrom gridExtra grid.arrange #' @importFrom grDevices rainbow #' @export #' plotDensityClust <- function(x, type = "all", n = 20, mds = NULL, dim.x = 1, dim.y = 2, col = NULL, alpha = 0.8) { type <- tolower(type) if(any(pmatch(type, "all", nomatch = 0))) type <- c("dg", "gg", "mds") df <- data.frame( rho = x$rho, delta = x$delta, gamma = x$rho * x$delta, peaks = FALSE, cluster = factor(x$clusters), halo = x$halo ) df$peaks[x$peaks] <- TRUE if(is.null(col)) { num.cols <- max(nlevels(df$cluster), 3) col <- if(num.cols <= 8) { brewer.pal(num.cols, "Set2") } else if(num.cols <= 12) { brewer.pal(num.cols, "Set3") } else rainbow(num.cols + 1)[1:num.cols] } plots <- list(dg = NULL, gg = NULL, mds = NULL) # Plot decision graph (dg) if(any(pmatch(type, "dg", nomatch = 0))) { plots$dg <- ggplot(df, aes_string(x = "rho", y = "delta")) if(!any(is.na(x$threshold))) { rho <- x$threshold["rho"] delta <- x$threshold["delta"] thresh.df <- data.frame( x = c(rho, rho), y = c(delta, delta), xend = c(rho, Inf), yend = c(Inf, delta) ) plots$dg <- plots$dg + geom_segment( aes_string(x = "x", xend = "xend", y = "y", yend = "yend"), data = thresh.df, inherit.aes = F, lineend = "butt" ) } if(any(df$peaks)) { plots$dg <- plots$dg + geom_label( aes_string(label = "cluster", color = "cluster"), data = df[df$peaks, ], fontface = "bold", alpha = alpha ) + scale_color_manual(values = col) } plots$dg <- plots$dg + geom_point( data = df[!df$peaks, ], size = 3, color = "gray50", alpha = alpha ) + labs(x = expression(rho), y = expression(delta), color = "Cluster") + theme(legend.position = "none") } # Plot gamma graph (gg) if(any(pmatch(type, "gg", nomatch = 0))) { gg.df <- df[order(df$gamma, decreasing = TRUE), ] gg.df <- gg.df[1:n, , drop = FALSE] gg.df$Sample <- 1:nrow(gg.df) plots$gg <- ggplot(gg.df, aes_string(x = "Sample", y = "gamma")) + geom_line() if(any(gg.df$peaks)) { plots$gg <- plots$gg + geom_label( aes_string(label = "cluster", color = "cluster"), data = gg.df[gg.df$peaks, , drop = FALSE], fontface = "bold", alpha = alpha ) + scale_color_manual(values = col) } plots$gg <- plots$gg + geom_point( data = gg.df[!gg.df$peaks, , drop = FALSE], size = 3, color = "gray50" ) + labs(y = expression(gamma), color = "Cluster") + theme(legend.position = "none") } # Plot MDS (mds) if(any(pmatch(type, "mds", nomatch = 0))) { if(is.null(mds)) mds <- cmdscale(x$distance, k = max(dim.x, dim.y)) df$x <- mds[, dim.x] df$y <- mds[, dim.y] plots$mds <- ggplot() plots$mds <- if(all(is.na(df$cluster))) { plots$mds + geom_point( aes_string(x = "x", y = "y"), data = df, size = 3, color = "gray50", alpha = alpha ) } else { plots$mds + geom_point( aes_string(x = "x", y = "y", color = "cluster"), data = df[df$halo, , drop = FALSE], shape = 21, size = 3 ) + geom_point( aes_string(x = "x", y = "y", color = "cluster"), data = df[!df$halo, , drop = FALSE], size = 3, alpha = alpha ) + geom_label_repel( aes_string(x = "x", y = "y", label = "cluster", color = "cluster"), data = df[df$peaks, , drop = FALSE], size = 6, fontface = "bold", alpha = alpha ) + scale_color_manual(values = col, na.value = "gray50") } plots$mds <- plots$mds + labs(x = paste("Dimension", dim.x), y = paste("Dimension", dim.y)) + theme(legend.position = "none") } has.plot <- !sapply(plots, is.null) switch( sum(has.plot), print(plots[[which(has.plot)]]), { plots <- plots[has.plot] if("mds" %in% names(plots)) plots$nrow <- 2 else plots$ncol <-2 do.call(grid.arrange, plots) }, { plots$layout_matrix <- matrix(c(1, 3, 2, 3), nrow = 2) do.call(grid.arrange, plots) } ) }densityClust/README.md0000644000176200001440000000551413173617741014242 0ustar liggesusersClustering by fast search and find of density peaks ============ [![Travis-CI Build Status](https://travis-ci.org/thomasp85/densityClust.svg?branch=master)](https://travis-ci.org/thomasp85/densityClust) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/thomasp85/densityClust?branch=master&svg=true)](https://ci.appveyor.com/project/thomasp85/densityClust) [![CRAN\_Release\_Badge](http://www.r-pkg.org/badges/version-ago/densityClust)](https://CRAN.R-project.org/package=densityClust) [![CRAN\_Download\_Badge](http://cranlogs.r-pkg.org/badges/densityClust)](https://CRAN.R-project.org/package=densityClust) [![Coverage Status](https://img.shields.io/codecov/c/github/thomasp85/densityClust/master.svg)](https://codecov.io/github/thomasp85/densityClust?branch=master) This package implement the clustering algorithm described by Alex Rodriguez and Alessandro Laio (2014). It provides the user with tools for generating the initial rho and delta values for each observation as well as using these to assign observations to clusters. This is done in two passes so the user is free to reassign observations to clusters using a new set of rho and delta thresholds, without needing to recalculate everything. Plotting ------------ Two types of plots are supported by this package, and both mimics the types of plots used in the publication for the algorithm. The standard plot function produces a decision plot, with optional colouring of cluster peaks if these are assigned. Furthermore `plotMDS()` performs a multidimensional scaling of the distance matrix and plots this as a scatterplot. If clusters are assigned observations are coloured according to their assignment. Cluster detection ------------ The two main functions for this package are `densityClust()` and `findClusters()`. The former takes a distance matrix and optionally a distance cutoff and calculates rho and delta for each observation. The latter takes the output of `densityClust()` and make cluster assignment for each observation based on a user defined rho and delta threshold. If the thresholds are not specified the user is able to supply them interactively by clicking on a decision plot. Usage ------------ ```R irisDist <- dist(iris[,1:4]) irisClust <- densityClust(irisDist, gaussian=TRUE) plot(irisClust) # Inspect clustering attributes to define thresholds irisClust <- findClusters(irisClust, rho=2, delta=2) plotMDS(irisClust) split(iris[,5], irisClust$clusters) ``` Note that while the iris dataset contains information on three different species of iris, only two clusters are detected by the algorithm. This is because two of the species (versicolor and virginica) are not clearly seperated by their data. Refences ------------ Rodriguez, A., & Laio, A. (2014). Clustering by fast search and find of density peaks. Science, 344(6191), 1492-1496. doi:10.1126/science.1242072 densityClust/MD50000644000176200001440000000227713173633643013275 0ustar liggesusers1ab269386aff4aeb528238611823b5ee *DESCRIPTION 61844946d8093955863c0a91d3dd8705 *NAMESPACE 30c23fdd0c2c702f96a661278d7a5e4c *R/RcppExports.R ff4698819ff59d77100c7a06c6485d55 *R/densityClust.R 630a945038bf37962f37efcc07b99108 *R/plotDensityClust.R 024cd59461d3888628526adf8365d818 *README.md 42be68192407d5489d6a2b74f7040922 *man/clustered.Rd 082efd30afc73ebd651d4492c2159e84 *man/clusters.Rd a18007c60288ef1ac16f9020867dfdaa *man/densityClust-package.Rd 711ae585690ba33eaffde4bebd17de7a *man/densityClust.Rd 4dd8395b14598aa5a338ff6d83bcbe80 *man/estimateDc.Rd ee83bbf7642b2872eb82552fce1b9146 *man/findClusters.Rd 83472650d192a2b26d1595c5c10d5298 *man/plotDensityClust.Rd 16ac60de9a02992eb1673d6ffe9d333a *man/plotMDS.Rd 91b2473f7511045df290d8c8be1d1236 *man/plotTSNE.Rd cdfa9919625ec6d4e4216756b4d2b76f *src/RcppExports.cpp 3ce3f7f78180b7945ced8da583931283 *src/distanceToPeak.cpp 33a3cd8bb3b103104f7d246fb1e1823b *src/findDistValueByRowColInd.cpp db8aca722baac26b4f3e9c34ac6d3058 *src/localDensity.cpp 45a9550a0d9fb5e170e463c3c03a7f5b *tests/testthat.R f8e7596ac614b2ab51c1fc76d8e233f0 *tests/testthat/generateReference.R 92e64d8c4280f014ff75f720b7aeab62 *tests/testthat/testEquivalenceToReferenceImplementation.R densityClust/DESCRIPTION0000644000176200001440000000230113173633643014457 0ustar liggesusersPackage: densityClust Type: Package Title: Clustering by Fast Search and Find of Density Peaks Version: 0.3 Date: 2017-10-24 Authors@R: c( person("Thomas Lin", "Pedersen", email = "thomasp85@gmail.com", role = c("aut", "cre")), person("Sean", "Hughes", email = "", role = c("aut")), person("Xiaojie", "Qiu", email = "xqiu@uw.edu", role = c("aut"))) Maintainer: Thomas Lin Pedersen Description: An improved implementation (based on k-nearest neighbors) of the density peak clustering algorithm, originally described by Alex Rodriguez and Alessandro Laio (Science, 2014 vol. 344). It can handle large datasets (> 100, 000 samples) very efficiently. It was initially implemented by Thomas Lin Pedersen, with inputs from Sean Hughes and later improved by Xiaojie Qiu to handle large datasets with kNNs. License: GPL (>= 2) Suggests: testthat LinkingTo: Rcpp Imports: Rcpp, FNN, Rtsne, ggplot2, ggrepel, grDevices, gridExtra, RColorBrewer RoxygenNote: 6.0.1 NeedsCompilation: yes Packaged: 2017-10-24 11:49:46 UTC; w22066 Author: Thomas Lin Pedersen [aut, cre], Sean Hughes [aut], Xiaojie Qiu [aut] Repository: CRAN Date/Publication: 2017-10-24 12:52:51 UTC densityClust/man/0000755000176200001440000000000013173617741013531 5ustar liggesusersdensityClust/man/clustered.Rd0000644000176200001440000000116113173617741016011 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/densityClust.R \name{clustered} \alias{clustered} \alias{clustered.densityCluster} \title{Check whether a densityCluster object have been clustered} \usage{ clustered(x) \method{clustered}{densityCluster}(x) } \arguments{ \item{x}{A densityCluster object} } \value{ \code{TRUE} if \code{\link[=findClusters]{findClusters()}} have been performed, otherwise \code{FALSE} } \description{ This function checks whether \code{\link[=findClusters]{findClusters()}} has been performed on the given object and returns a boolean depending on the outcome } densityClust/man/plotMDS.Rd0000644000176200001440000000270513173617741015346 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/densityClust.R \name{plotMDS} \alias{plotMDS} \title{Plot observations using multidimensional scaling and colour by cluster} \usage{ plotMDS(x, ...) } \arguments{ \item{x}{A densityCluster object as produced by \code{\link[=densityClust]{densityClust()}}} \item{...}{Additional parameters. Currently ignored} } \description{ This function produces an MDS scatterplot based on the distance matrix of the densityCluster object (if there is only the coordinates information, a distance matrix will be calculate first), and, if clusters are defined, colours each observation according to cluster affiliation. Observations belonging to a cluster core is plotted with filled circles and observations belonging to the halo with hollow circles. This plotting is not suitable for running large datasets (for example datasets with > 1000 samples). Users are suggested to use other methods, for example tSNE, etc. to visualize their clustering results too. } \examples{ irisDist <- dist(iris[,1:4]) irisClust <- densityClust(irisDist, gaussian=TRUE) plot(irisClust) # Inspect clustering attributes to define thresholds irisClust <- findClusters(irisClust, rho=2, delta=2) plotMDS(irisClust) split(iris[,5], irisClust$clusters) } \seealso{ \code{\link[=densityClust]{densityClust()}} for creating \code{densityCluster} objects, and \code{\link[=plotTSNE]{plotTSNE()}} for an alternative plotting approach. } densityClust/man/densityClust-package.Rd0000644000176200001440000000464013173617741020107 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/densityClust.R \docType{package} \name{densityClust-package} \title{Clustering by fast search and find of density peaks} \description{ This package implement the clustering algorithm described by Alex Rodriguez and Alessandro Laio (2014). It provides the user with tools for generating the initial rho and delta values for each observation as well as using these to assign observations to clusters. This is done in two passes so the user is free to reassign observations to clusters using a new set of rho and delta thresholds, without needing to recalculate everything. } \section{Plotting}{ Two types of plots are supported by this package, and both mimics the types of plots used in the publication for the algorithm. The standard plot function produces a decision plot, with optional colouring of cluster peaks if these are assigned. Furthermore \code{\link[=plotMDS]{plotMDS()}} performs a multidimensional scaling of the distance matrix and plots this as a scatterplot. If clusters are assigned observations are coloured according to their assignment. } \section{Cluster detection}{ The two main functions for this package are \code{\link[=densityClust]{densityClust()}} and \code{\link[=findClusters]{findClusters()}}. The former takes a distance matrix and optionally a distance cutoff and calculates rho and delta for each observation. The latter takes the output of \code{\link[=densityClust]{densityClust()}} and make cluster assignment for each observation based on a user defined rho and delta threshold. If the thresholds are not specified the user is able to supply them interactively by clicking on a decision plot. } \examples{ irisDist <- dist(iris[,1:4]) irisClust <- densityClust(irisDist, gaussian=TRUE) plot(irisClust) # Inspect clustering attributes to define thresholds irisClust <- findClusters(irisClust, rho=2, delta=2) plotMDS(irisClust) split(iris[,5], irisClust$clusters) } \references{ Rodriguez, A., & Laio, A. (2014). \emph{Clustering by fast search and find of density peaks.} Science, \strong{344}(6191), 1492-1496. doi:10.1126/science.1242072 } \seealso{ \code{\link[=densityClust]{densityClust()}}, \code{\link[=findClusters]{findClusters()}}, \code{\link[=plotMDS]{plotMDS()}} } \author{ \strong{Maintainer}: Thomas Lin Pedersen \email{thomasp85@gmail.com} Authors: \itemize{ \item Sean Hughes \item Xiaojie Qiu \email{xqiu@uw.edu} } } densityClust/man/clusters.Rd0000644000176200001440000000312713173617741015667 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/densityClust.R \name{clusters} \alias{clusters} \alias{clusters.densityCluster} \title{Extract cluster membership from a densityCluster object} \usage{ clusters(x, ...) \method{clusters}{densityCluster}(x, as.list = FALSE, halo.rm = TRUE, ...) } \arguments{ \item{x}{The densityCluster object. \code{\link[=findClusters]{findClusters()}} must have been performed prior to this call to avoid throwing an error.} \item{...}{Currently ignored} \item{as.list}{Should the output be in the list format. Defaults to FALSE} \item{halo.rm}{Logical. should halo observations be removed. Defaults to TRUE} } \value{ A vector or list with cluster memberships for the observations in the initial distance matrix } \description{ This function allows the user to extract the cluster membership of all the observations in the given densityCluster object. The output can be formatted in two ways as described below. Halo observations can be chosen to be removed from the output. } \details{ Two formats for the output are available. Either a vector of integers denoting for each observation, which cluster the observation belongs to. If halo observations are removed, these are set to NA. The second format is a list with a vector for each group containing the index for the member observations in the group. If halo observations are removed their indexes are omitted. The list format correspond to the following transform of the vector format \code{split(1:length(clusters), clusters)}, where \code{clusters} are the cluster information in vector format. } densityClust/man/densityClust.Rd0000644000176200001440000000761313173617741016521 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/densityClust.R \name{densityClust} \alias{densityClust} \title{Calculate clustering attributes based on the densityClust algorithm} \usage{ densityClust(distance, dc, gaussian = FALSE, verbose = FALSE, ...) } \arguments{ \item{distance}{A distance matrix or a matrix (or data.frame) for the coordinates of the data. If a matrix or data.frame is used the distances and local density will be estimated using a fast k-nearest neighbor approach.} \item{dc}{A distance cutoff for calculating the local density. If missing it will be estimated with \code{estimateDc(distance)}} \item{gaussian}{Logical. Should a gaussian kernel be used to estimate the density (defaults to FALSE)} \item{verbose}{Logical. Should the running details be reported} \item{...}{Additional parameters passed on to \link[FNN:get.knn]{get.knn}} } \value{ A densityCluster object. See details for a description. } \description{ This function takes a distance matrix and optionally a distance cutoff and calculates the values necessary for clustering based on the algorithm proposed by Alex Rodrigues and Alessandro Laio (see references). The actual assignment to clusters are done in a later step, based on user defined threshold values. If a distance matrix is passed into \code{distance} the original algorithm described in the paper is used. If a matrix or data.frame is passed instead it is interpretted as point coordinates and rho will be estimated based on k-nearest neighbors of each point (rho is estimated as \code{exp(-mean(x))} where \code{x} is the distance to the nearest neighbors). This can be useful when data is so large that calculating the full distance matrix can be prohibitive. } \details{ The function calculates rho and delta for the observations in the provided distance matrix. If a distance cutoff is not provided this is first estimated using \code{\link[=estimateDc]{estimateDc()}} with default values. The information kept in the densityCluster object is: \describe{ \item{\code{rho}}{A vector of local density values} \item{\code{delta}}{A vector of minimum distances to observations of higher density} \item{\code{distance}}{The initial distance matrix} \item{\code{dc}}{The distance cutoff used to calculate rho} \item{\code{threshold}}{A named vector specifying the threshold values for rho and delta used for cluster detection} \item{\code{peaks}}{A vector of indexes specifying the cluster center for each cluster} \item{\code{clusters}}{A vector of cluster affiliations for each observation. The clusters are referenced as indexes in the peaks vector} \item{\code{halo}}{A logical vector specifying for each observation if it is considered part of the halo} \item{\code{knn_graph}}{kNN graph constructed. It is only applicable to the case where coordinates are used as input. Currently it is set as NA.} \item{\code{nearest_higher_density_neighbor}}{index for the nearest sample with higher density. It is only applicable to the case where coordinates are used as input.} \item{\code{nn.index}}{indices for each cell's k-nearest neighbors. It is only applicable for the case where coordinates are used as input.} \item{\code{nn.dist}}{distance to each cell's k-nearest neighbors. It is only applicable for the case where coordinates are used as input.} } Before running findClusters the threshold, peaks, clusters and halo data is \code{NA}. } \examples{ irisDist <- dist(iris[,1:4]) irisClust <- densityClust(irisDist, gaussian=TRUE) plot(irisClust) # Inspect clustering attributes to define thresholds irisClust <- findClusters(irisClust, rho=2, delta=2) plotMDS(irisClust) split(iris[,5], irisClust$clusters) } \references{ Rodriguez, A., & Laio, A. (2014). \emph{Clustering by fast search and find of density peaks.} Science, \strong{344}(6191), 1492-1496. doi:10.1126/science.1242072 } \seealso{ \code{\link[=estimateDc]{estimateDc()}}, \code{\link[=findClusters]{findClusters()}} } densityClust/man/estimateDc.Rd0000644000176200001440000000264613173617741016112 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/densityClust.R \name{estimateDc} \alias{estimateDc} \title{Estimate the distance cutoff for a specified neighbor rate} \usage{ estimateDc(distance, neighborRateLow = 0.01, neighborRateHigh = 0.02) } \arguments{ \item{distance}{A distance matrix} \item{neighborRateLow}{The lower bound of the neighbor rate} \item{neighborRateHigh}{The upper bound of the neighbor rate} } \value{ A numeric value giving the estimated distance cutoff value } \description{ This function calculates a distance cutoff value for a specific distance matrix that makes the average neighbor rate (number of points within the distance cutoff value) fall between the provided range. The authors of the algorithm suggests aiming for a neighbor rate between 1 and 2 percent, but also states that the algorithm is quite robust with regards to more extreme cases. } \note{ If the number of points is larger than 448 (resulting in 100,128 pairwise distances), 100,128 distance pairs will be randomly selected to speed up computation time. Use \code{\link[=set.seed]{set.seed()}} prior to calling \code{estimateDc} in order to ensure reproducable results. } \examples{ irisDist <- dist(iris[,1:4]) estimateDc(irisDist) } \references{ Rodriguez, A., & Laio, A. (2014). \emph{Clustering by fast search and find of density peaks.} Science, \strong{344}(6191), 1492-1496. doi:10.1126/science.1242072 } densityClust/man/findClusters.Rd0000644000176200001440000000367413173617741016477 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/densityClust.R \name{findClusters} \alias{findClusters} \alias{findClusters.densityCluster} \title{Detect clusters in a densityCluster obejct} \usage{ findClusters(x, ...) \method{findClusters}{densityCluster}(x, rho, delta, plot = FALSE, peaks = NULL, verbose = FALSE, ...) } \arguments{ \item{x}{A densityCluster object as produced by \code{\link[=densityClust]{densityClust()}}} \item{...}{Additional parameters passed on} \item{rho}{The threshold for local density when detecting cluster peaks} \item{delta}{The threshold for minimum distance to higher density when detecting cluster peaks} \item{plot}{Logical. Should a decision plot be shown after cluster detection} \item{peaks}{A numeric vector indicates the index of density peaks used for clustering. This vector should be retrieved from the decision plot with caution. No checking involved.} \item{verbose}{Logical. Should the running details be reported} } \value{ A densityCluster object with clusters assigned to all observations } \description{ This function uses the supplied rho and delta thresholds to detect cluster peaks and assign the rest of the observations to one of these clusters. Furthermore core/halo status is calculated. If either rho or delta threshold is missing the user is presented with a decision plot where they are able to click on the plot area to set the treshold. If either rho or delta is set, this takes presedence over the value found by clicking. } \examples{ irisDist <- dist(iris[,1:4]) irisClust <- densityClust(irisDist, gaussian=TRUE) plot(irisClust) # Inspect clustering attributes to define thresholds irisClust <- findClusters(irisClust, rho=2, delta=2) plotMDS(irisClust) split(iris[,5], irisClust$clusters) } \references{ Rodriguez, A., & Laio, A. (2014). \emph{Clustering by fast search and find of density peaks.} Science, \strong{344}(6191), 1492-1496. doi:10.1126/science.1242072 } densityClust/man/plotDensityClust.Rd0000644000176200001440000000432313173617741017353 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/plotDensityClust.R \name{plotDensityClust} \alias{plotDensityClust} \title{Plot densityCluster results} \usage{ plotDensityClust(x, type = "all", n = 20, mds = NULL, dim.x = 1, dim.y = 2, col = NULL, alpha = 0.8) } \arguments{ \item{x}{A densityCluster object as produced by \code{\link{densityClust}}} \item{type}{A character vector designating which figures to produce. Valid options include \code{"dg"} for a decision graph of \eqn{\delta} vs. \eqn{\rho}, \code{"gg"} for a gamma graph depicting the decrease of \eqn{\gamma} (= \eqn{\delta} * \eqn{\rho}) across samples, and \code{"mds"}, for a Multi-Dimensional Scaling (MDS) plot of observations. Any combination of these three can be included in the vector, or to produce all plots, specify \code{type = "all"}.} \item{n}{Number of observations to plot in the gamma graph.} \item{mds}{A matrix of scores for observations from a Principal Components Analysis or MDS. If omitted, and a MDS plot has been requested, one will be calculated.} \item{dim.x, dim.y}{The numbers of the dimensions to plot on the x and y axes of the MDS plot.} \item{col}{Vector of colors for clusters.} \item{alpha}{Value in \code{0:1} controlling transparency of points in the decision graph and MDS plot.} } \value{ A panel of the figures specified in \code{type} are produced. If designated, clusters are color-coded and labelled. If present in \code{x}, the rho and delta thresholds are designated in the decision graph by a set of solid black lines. } \description{ Generate a single panel of up to three diagnostic plots for a \code{densityClust} object. } \examples{ data(iris) data.dist <- dist(iris[, 1:4]) pca <- princomp(iris[, 1:4]) # Run initial density clustering dens.clust <- densityClust(data.dist) op <- par(ask = TRUE) # Show the decision graph plotDensityClust(dens.clust, type = "dg") # Show the decision graph and the gamma graph plotDensityClust(dens.clust, type = c("dg", "gg")) # Cluster based on rho and delta new.clust <- findClusters(dens.clust, rho = 4, delta = 2) # Show all graphs with clustering plotDensityClust(new.clust, mds = pca$scores) par(op) } \author{ Eric Archer \email{eric.archer@noaa.gov} } densityClust/man/plotTSNE.Rd0000644000176200001440000000240013173617741015464 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/densityClust.R \name{plotTSNE} \alias{plotTSNE} \title{Plot observations using t-distributed neighbor embedding and colour by cluster} \usage{ plotTSNE(x, ...) } \arguments{ \item{x}{A densityCluster object as produced by \code{\link[=densityClust]{densityClust()}}} \item{...}{Additional parameters. Currently ignored} } \description{ This function produces an t-SNE scatterplot based on the distance matrix of the densityCluster object (if there is only the coordinates information, a distance matrix will be calculate first), and, if clusters are defined, colours each observation according to cluster affiliation. Observations belonging to a cluster core is plotted with filled circles and observations belonging to the halo with hollow circles. } \examples{ irisDist <- dist(iris[,1:4]) irisClust <- densityClust(irisDist, gaussian=TRUE) plot(irisClust) # Inspect clustering attributes to define thresholds irisClust <- findClusters(irisClust, rho=2, delta=2) plotTSNE(irisClust) split(iris[,5], irisClust$clusters) } \seealso{ \code{\link[=densityClust]{densityClust()}} for creating \code{densityCluster} objects, and \code{\link[=plotMDS]{plotMDS()}} for an alternative plotting approach. }