uwot/0000755000176200001440000000000014757010752011260 5ustar liggesusersuwot/tests/0000755000176200001440000000000014445654522012425 5ustar liggesusersuwot/tests/testthat/0000755000176200001440000000000014757010752014262 5ustar liggesusersuwot/tests/testthat/test_neighbors.R0000644000176200001440000005473314730166740017440 0ustar liggesuserslibrary(uwot) library(RSpectra) context("neighbors") i10nn4dist <- matrix(c( 0, 0.1414214, 0.1732051, 0.4690416, 0, 0.1732051, 0.3000000, 0.3316625, 0, 0.2449490, 0.2645751, 0.3000000, 0, 0.2449490, 0.3000000, 0.3162278, 0, 0.1414214, 0.2236068, 0.4582576, 0, 0.6164414, 0.6164414, 0.7000000, 0, 0.2645751, 0.3316625, 0.4242641, 0, 0.1732051, 0.2236068, 0.3316625, 0, 0.3000000, 0.4358899, 0.5099020, 0, 0.1732051, 0.3162278, 0.3162278 ), nrow = 10, byrow = TRUE) i10nn4idx <- matrix(c( 1, 5, 8, 10, 2, 10, 3, 4, 3, 4, 7, 2, 4, 3, 9, 10, 5, 1, 8, 7, 6, 1, 5, 8, 7, 3, 4, 8, 8, 1, 5, 10, 9, 4, 3, 2, 10, 2, 3, 4 ), nrow = 10, byrow = TRUE) ## Test specialized functions res <- FNN_nn(iris10, k = 4, include_self = TRUE) expect_equal(res$dist, i10nn4dist, tol = 1e-6) expect_equal(res$idx[-6, ], i10nn4idx[-6, ]) res <- dist_nn(diris10, k = 4, include_self = TRUE) expect_equal(res$dist, i10nn4dist, tol = 1e-6) expect_equal(res$idx[-6, ], i10nn4idx[-6, ]) res <- sparse_nn(dmiris10z, k = 4, include_self = TRUE) expect_equal(res$dist, i10nn4dist, tol = 1e-6) expect_equal(res$idx[-6, ], i10nn4idx[-6, ]) sparse_to_tri <- function(m, lower = TRUE) { sm <- summary(m) if (lower) { subtri <- subset(sm, i >= j) } else { subtri <- subset(sm, i <= j) } Matrix::sparseMatrix(i = subtri$i, j = subtri$j, x = subtri$x, dims = dim(m)) } dmiris10zu <- sparse_to_tri(dmiris10z, lower = FALSE) res <- sparse_tri_nn(dmiris10zu, k = 4, include_self = TRUE) expect_equal(res$dist, i10nn4dist, tol = 1e-6) expect_equal(res$idx[-6, ], i10nn4idx[-6, ]) dmiris10zl <- sparse_to_tri(dmiris10z, lower = TRUE) res <- sparse_tri_nn(dmiris10zl, k = 4, include_self = TRUE) expect_equal(res$dist, i10nn4dist, tol = 1e-6) expect_equal(res$idx[-6, ], i10nn4idx[-6, ]) # Test overall function res <- find_nn(iris10, k = 4, include_self = TRUE) expect_equal(res$dist, i10nn4dist, tol = 1e-6) expect_equal(res$idx[-6, ], i10nn4idx[-6, ]) res <- find_nn(diris10, k = 4, include_self = TRUE) expect_equal(res$dist, i10nn4dist, tol = 1e-6) expect_equal(res$idx[-6, ], i10nn4idx[-6, ]) res <- find_nn(dmiris10z, k = 4, include_self = TRUE) expect_equal(res$dist, i10nn4dist, tol = 1e-6) expect_equal(res$idx[-6, ], i10nn4idx[-6, ]) res <- find_nn(dmiris10z, k = 4, include_self = TRUE) expect_equal(res$dist, i10nn4dist, tol = 1e-6) expect_equal(res$idx[-6, ], i10nn4idx[-6, ]) res <- find_nn(dmiris10zu, k = 4, include_self = TRUE) expect_equal(res$dist, i10nn4dist, tol = 1e-6) expect_equal(res$idx[-6, ], i10nn4idx[-6, ]) res <- find_nn(dmiris10zl, k = 4, include_self = TRUE) expect_equal(res$dist, i10nn4dist, tol = 1e-6) expect_equal(res$idx[-6, ], i10nn4idx[-6, ]) # Test Annoy res <- annoy_nn(ui10, k = 4, n_threads = 0) expect_equal(res$idx, self_unn4$idx, check.attributes = FALSE) expect_equal(res$dist, self_unn4$dist, check.attributes = FALSE, tol = 1e-6) res <- annoy_nn(ui10, k = 4, n_threads = 0, ret_index = TRUE) expect_equal(res$idx, self_unn4$idx, check.attributes = FALSE) expect_equal(res$dist, self_unn4$dist, check.attributes = FALSE, tol = 1e-6) expect_true(!is.null(res$index)) expect_is(res$index, "list") expect_is(res$index$ann, "Rcpp_AnnoyEuclidean") expect_equal(res$index$metric, "euclidean") res <- 
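# The assignment continued below repeats the Annoy checks with n_threads = 1,
# exercising the threaded search path; indices and distances should match the
# serial (n_threads = 0) run above. A minimal standalone sketch of the same
# equivalence check, assuming `ui10` and `self_unn4` from helper_data.R
# (kept as a comment so it does not interrupt the assignment in progress):
# res1 <- annoy_nn(ui10, k = 4, n_threads = 1)
# stopifnot(all(res1$idx == self_unn4$idx))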
annoy_nn(ui10, k = 4, n_threads = 1) expect_equal(res$idx, self_unn4$idx, check.attributes = FALSE) expect_equal(res$dist, self_unn4$dist, check.attributes = FALSE, tol = 1e-6) res <- annoy_nn(ui10, k = 4, n_threads = 1, ret_index = TRUE) expect_equal(res$idx, self_unn4$idx, check.attributes = FALSE) expect_equal(res$dist, self_unn4$dist, check.attributes = FALSE, tol = 1e-6) expect_true(!is.null(res$index)) expect_is(res$index, "list") expect_is(res$index$ann, "Rcpp_AnnoyEuclidean") expect_equal(res$index$metric, "euclidean") cos_index <- matrix( c( 1, 2, 7, 3, 2, 1, 7, 3, 3, 6, 4, 7, 4, 3, 5, 7, 5, 8, 4, 3, 6, 3, 9, 4, 7, 3, 1, 4, 8, 5, 4, 3, 9, 6, 10, 3, 10, 9, 6, 3 ), byrow = TRUE, ncol = 4, nrow = 10 ) # Cosine distances from HNSW cos_dist <- matrix( c( 0, 0.000131368637084961, 0.00048297643661499, 0.000737011432647705, 0, 0.000131368637084961, 0.000680804252624512, 0.000909507274627686, 0, 0.000168740749359131, 0.000244021415710449, 0.000422179698944092, 0, 0.000244021415710449, 0.000383198261260986, 0.000549376010894775, 0, 7.09891319274902e-05, 0.000383198261260986, 0.000682294368743896, 0, 0.000168740749359131, 0.000265955924987793, 0.000767052173614502, 0, 0.000422179698944092, 0.00048297643661499, 0.000549376010894775, 0, 7.09891319274902e-05, 0.000611364841461182, 0.000812351703643799, 0, 0.000265955924987793, 0.00078284740447998, 0.000819146633148193, 0, 0.00078284740447998, 0.00160372257232666, 0.00279802083969116 ), byrow = TRUE, ncol = 4, nrow = 10 ) res <- annoy_nn(ui10, k = 4, n_threads = 0, ret_index = TRUE, metric = "cosine") expect_equal(res$idx, cos_index, check.attributes = FALSE) expect_equal(res$dist, cos_dist, check.attributes = FALSE, tol = 1e-6) expect_true(!is.null(res$index)) expect_is(res$index, "list") expect_is(res$index$ann, "Rcpp_AnnoyAngular") expect_equal(res$index$metric, "cosine") # Correlation distances from sklearn cor_index <- matrix( c( 1, 3, 6, 8, 5, 7, 4, 9, 10, 2, 2, 10, 9, 4, 8, 3, 1, 6, 5, 7, 3, 1, 8, 6, 5, 7, 9, 4, 10, 2, 4, 9, 8, 10, 3, 1, 6, 2, 5, 7, 5, 7, 6, 1, 3, 8, 4, 9, 10, 2, 6, 5, 7, 1, 3, 8, 4, 9, 10, 2, 7, 5, 6, 1, 3, 8, 4, 9, 10, 2, 8, 3, 4, 1, 9, 6, 5, 10, 7, 2, 9, 4, 10, 8, 2, 3, 1, 6, 5, 7, 10, 9, 4, 2, 8, 3, 1, 6, 5, 7 ), byrow = TRUE, ncol = 10, nrow = 10 ) cor_dist <- matrix( c( 0.00000000e+00, 2.60889537e-05, 4.13946853e-04, 4.61852891e-04, 6.52668500e-04, 1.18880691e-03, 1.83154822e-03, 1.92335771e-03, 3.44803882e-03, 4.00133876e-03, 0.00000000e+00, 9.67144550e-04, 1.45365862e-03, 2.60336976e-03, 2.88194472e-03, 3.39291433e-03, 4.00133876e-03, 6.40810993e-03, 7.76732119e-03, 9.27944638e-03, 0.00000000e+00, 2.60889537e-05, 3.95491414e-04, 6.22703492e-04, 9.38868047e-04, 1.56232107e-03, 1.64390757e-03, 1.66652079e-03, 3.01440442e-03, 3.39291433e-03, 0.00000000e+00, 1.66690505e-04, 4.54440488e-04, 6.93152364e-04, 1.66652079e-03, 1.83154822e-03, 2.16742062e-03, 2.60336976e-03, 3.28117923e-03, 3.86063303e-03, 0.00000000e+00, 8.60443273e-05, 1.16680316e-04, 6.52668500e-04, 9.38868047e-04, 1.49684289e-03, 3.28117923e-03, 3.96913822e-03, 6.23882524e-03, 7.76732119e-03, 0.00000000e+00, 1.16680316e-04, 2.77394147e-04, 4.13946853e-04, 6.22703492e-04, 8.21174669e-04, 2.16742062e-03, 2.78434617e-03, 4.73942264e-03, 6.40810993e-03, 1.11022302e-16, 8.60443273e-05, 2.77394147e-04, 1.18880691e-03, 1.56232107e-03, 2.04787836e-03, 3.86063303e-03, 4.78602797e-03, 7.27277830e-03, 9.27944638e-03, 0.00000000e+00, 3.95491414e-04, 4.54440488e-04, 4.61852891e-04, 5.93789371e-04, 8.21174669e-04, 1.49684289e-03, 1.62634825e-03, 2.04787836e-03, 
2.88194472e-03, 0.00000000e+00, 1.66690505e-04, 2.60225275e-04, 5.93789371e-04, 1.45365862e-03, 1.64390757e-03, 1.92335771e-03, 2.78434617e-03, 3.96913822e-03, 4.78602797e-03, 0.00000000e+00, 2.60225275e-04, 6.93152364e-04, 9.67144550e-04, 1.62634825e-03, 3.01440442e-03, 3.44803882e-03, 4.73942264e-03, 6.23882524e-03, 7.27277830e-03 ), byrow = TRUE, ncol = 10, nrow = 10 ) res <- annoy_nn(iris10, k = 10, n_threads = 0, ret_index = TRUE, metric = "correlation") expect_equal(res$idx, cor_index, check.attributes = FALSE) expect_equal(res$dist, cor_dist, check.attributes = FALSE, tol = 1e-6) expect_true(!is.null(res$index)) expect_is(res$index, "list") expect_is(res$index$ann, "Rcpp_AnnoyAngular") expect_equal(res$index$metric, "correlation") test_that("hnsw gives correct euclidean neighbor results", { testthat::skip_if_not_installed("RcppHNSW") iris10_annoy <- umap( iris10, n_neighbors = 4, nn_method = "annoy", ret_extra = c("nn"), ret_model = TRUE, n_epochs = 0 ) iris10_hnsw <- umap( iris10, n_neighbors = 4, nn_method = "hnsw", ret_extra = c("nn"), ret_model = TRUE, n_epochs = 0 ) expect_equal(iris10_annoy$nn$euclidean$idx, iris10_hnsw$nn$euclidean$idx, check.attributes = FALSE) expect_equal(iris10_annoy$nn$euclidean$dist, iris10_hnsw$nn$euclidean$dist, check.attributes = FALSE, tol = 1e-7) iris10_transform_hnsw <- umap_transform(iris10, iris10_hnsw, n_epochs = 0, ret_extra = c("nn")) expect_equal( iris10_hnsw$nn$euclidean$idx, iris10_transform_hnsw$nn$euclidean$idx, check.attributes = FALSE ) expect_equal( iris10_hnsw$nn$euclidean$dist, iris10_transform_hnsw$nn$euclidean$dist, check.attributes = FALSE ) iris10_transform_annoy <- umap_transform(iris10, iris10_annoy, n_epochs = 0, ret_extra = c("nn")) expect_equal( iris10_transform_annoy$nn$euclidean$idx, iris10_transform_hnsw$nn$euclidean$idx, check.attributes = FALSE ) expect_equal( iris10_transform_annoy$nn$euclidean$dist, iris10_transform_hnsw$nn$euclidean$dist, check.attributes = FALSE, tol = 1e-6 ) }) test_that("hnsw gives correct cosine neighbor results", { testthat::skip_if_not_installed("RcppHNSW") iris10_annoy <- umap( iris10, n_neighbors = 4, nn_method = "annoy", ret_extra = c("nn"), ret_model = TRUE, n_epochs = 0, metric = "cosine" ) iris10_hnsw <- umap( iris10, n_neighbors = 4, nn_method = "hnsw", ret_extra = c("nn"), ret_model = TRUE, n_epochs = 0, metric = "cosine" ) expect_equal(iris10_annoy$nn$cosine$idx, iris10_hnsw$nn$cosine$idx, check.attributes = FALSE) expect_equal(iris10_annoy$nn$cosine$dist, iris10_hnsw$nn$cosine$dist, check.attributes = FALSE, tol = 1e-6) iris10_transform_hnsw <- umap_transform(iris10, iris10_hnsw, n_epochs = 0, ret_extra = c("nn")) expect_equal( iris10_hnsw$nn$cosine$idx, iris10_transform_hnsw$nn$cosine$idx, check.attributes = FALSE ) expect_equal( iris10_hnsw$nn$cosine$dist, iris10_transform_hnsw$nn$cosine$dist, check.attributes = FALSE ) iris10_transform_annoy <- umap_transform(iris10, iris10_annoy, n_epochs = 0, ret_extra = c("nn")) expect_equal( iris10_transform_annoy$nn$cosine$idx, iris10_transform_hnsw$nn$cosine$idx, check.attributes = FALSE ) expect_equal( iris10_transform_annoy$nn$cosine$dist, iris10_transform_hnsw$nn$cosine$dist, check.attributes = FALSE, tol = 1e-6 ) }) test_that("hnsw gives correct correlation neighbor results", { testthat::skip_if_not_installed("RcppHNSW") iris10_annoy <- umap( iris10, n_neighbors = 4, nn_method = "annoy", ret_extra = c("nn"), ret_model = TRUE, n_epochs = 0, metric = "correlation" ) iris10_hnsw <- umap( iris10, n_neighbors = 4, nn_method = "hnsw", 
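# Note: the "correlation" metric for the HNSW backend is (presumably, as with
# the Rcpp_AnnoyAngular index used by Annoy above) handled as cosine distance
# on row-centered data, since HNSW itself only provides spaces such as
# euclidean, cosine and inner product.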
ret_extra = c("nn"), ret_model = TRUE, n_epochs = 0, metric = "correlation" ) expect_equal(iris10_annoy$nn$correlation$idx, iris10_hnsw$nn$correlation$idx, check.attributes = FALSE) expect_equal(iris10_annoy$nn$correlation$dist, iris10_hnsw$nn$correlation$dist, check.attributes = FALSE, tol = 1e-6) iris10_transform_hnsw <- umap_transform(iris10, iris10_hnsw, n_epochs = 0, ret_extra = c("nn")) expect_equal( iris10_hnsw$nn$correlation$idx, iris10_transform_hnsw$nn$correlation$idx, check.attributes = FALSE ) expect_equal( iris10_hnsw$nn$correlation$dist, iris10_transform_hnsw$nn$correlation$dist, check.attributes = FALSE ) iris10_transform_annoy <- umap_transform(iris10, iris10_annoy, n_epochs = 0, ret_extra = c("nn")) expect_equal( iris10_transform_annoy$nn$correlation$idx, iris10_transform_hnsw$nn$correlation$idx, check.attributes = FALSE ) expect_equal( iris10_transform_annoy$nn$correlation$dist, iris10_transform_hnsw$nn$correlation$dist, check.attributes = FALSE, tol = 1e-6 ) }) test_that("hnsw gives correct correlation neighbor results and multiple threads", { testthat::skip_if_not_installed("RcppHNSW") iris10_annoy <- umap( iris10, n_neighbors = 4, nn_method = "annoy", ret_extra = c("nn"), ret_model = TRUE, n_epochs = 0, metric = "correlation" ) iris10_hnsw <- umap( iris10, n_neighbors = 4, nn_method = "hnsw", ret_extra = c("nn"), ret_model = TRUE, n_epochs = 0, metric = "correlation", n_threads = 2 ) expect_equal(iris10_annoy$nn$correlation$idx, iris10_hnsw$nn$correlation$idx, check.attributes = FALSE) expect_equal(iris10_annoy$nn$correlation$dist, iris10_hnsw$nn$correlation$dist, check.attributes = FALSE, tol = 1e-6) iris10_transform_hnsw <- umap_transform(iris10, iris10_hnsw, n_epochs = 0, ret_extra = c("nn"), n_threads = 2) expect_equal( iris10_hnsw$nn$correlation$idx, iris10_transform_hnsw$nn$correlation$idx, check.attributes = FALSE ) expect_equal( iris10_hnsw$nn$correlation$dist, iris10_transform_hnsw$nn$correlation$dist, check.attributes = FALSE ) iris10_transform_annoy <- umap_transform(iris10, iris10_annoy, n_epochs = 0, ret_extra = c("nn")) expect_equal( iris10_transform_annoy$nn$correlation$idx, iris10_transform_hnsw$nn$correlation$idx, check.attributes = FALSE ) expect_equal( iris10_transform_annoy$nn$correlation$dist, iris10_transform_hnsw$nn$correlation$dist, check.attributes = FALSE, tol = 1e-6 ) }) # rnndescent test_that("nndescent gives correct euclidean neighbor results", { testthat::skip_if_not_installed("rnndescent") iris10_annoy <- umap( iris10, n_neighbors = 4, nn_method = "annoy", ret_extra = c("nn"), ret_model = TRUE, n_epochs = 0 ) iris10_nnd_no_model <- umap( iris10, n_neighbors = 4, nn_method = "nndescent", ret_extra = c("nn"), ret_model = FALSE, n_epochs = 0 ) expect_equal(iris10_annoy$nn$euclidean$idx, iris10_nnd_no_model$nn$euclidean$idx, check.attributes = FALSE) expect_equal(iris10_annoy$nn$euclidean$dist, iris10_nnd_no_model$nn$euclidean$dist, check.attributes = FALSE, tol = 1e-7) iris10_nnd <- umap( iris10, n_neighbors = 4, nn_method = "nndescent", ret_extra = c("nn"), ret_model = TRUE, n_epochs = 0 ) expect_equal(iris10_annoy$nn$euclidean$idx, iris10_nnd$nn$euclidean$idx, check.attributes = FALSE) expect_equal(iris10_annoy$nn$euclidean$dist, iris10_nnd$nn$euclidean$dist, check.attributes = FALSE, tol = 1e-7) iris10_transform_nnd <- umap_transform(iris10, iris10_nnd, n_epochs = 0, ret_extra = c("nn")) expect_equal( iris10_nnd$nn$euclidean$idx, iris10_transform_nnd$nn$euclidean$idx, check.attributes = FALSE ) expect_equal( iris10_nnd$nn$euclidean$dist, 
iris10_transform_nnd$nn$euclidean$dist, check.attributes = FALSE ) iris10_transform_annoy <- umap_transform(iris10, iris10_annoy, n_epochs = 0, ret_extra = c("nn")) expect_equal( iris10_transform_annoy$nn$euclidean$idx, iris10_transform_nnd$nn$euclidean$idx, check.attributes = FALSE ) expect_equal( iris10_transform_annoy$nn$euclidean$dist, iris10_transform_nnd$nn$euclidean$dist, check.attributes = FALSE, tol = 1e-6 ) }) test_that("nndescent gives correct cosine neighbor results", { testthat::skip_if_not_installed("rnndescent") iris10_annoy <- umap( iris10, n_neighbors = 4, nn_method = "annoy", ret_extra = c("nn"), ret_model = TRUE, n_epochs = 0, metric = "cosine" ) iris10_nnd <- umap( iris10, n_neighbors = 4, nn_method = "nndescent", ret_extra = c("nn"), ret_model = TRUE, n_epochs = 0, metric = "cosine" ) expect_equal(iris10_annoy$nn$cosine$idx, iris10_nnd$nn$cosine$idx, check.attributes = FALSE) expect_equal(iris10_annoy$nn$cosine$dist, iris10_nnd$nn$cosine$dist, check.attributes = FALSE, tol = 1e-6) iris10_transform_nnd <- umap_transform(iris10, iris10_nnd, n_epochs = 0, ret_extra = c("nn")) expect_equal( iris10_nnd$nn$cosine$idx, iris10_transform_nnd$nn$cosine$idx, check.attributes = FALSE ) expect_equal( iris10_nnd$nn$cosine$dist, iris10_transform_nnd$nn$cosine$dist, check.attributes = FALSE ) iris10_transform_annoy <- umap_transform(iris10, iris10_annoy, n_epochs = 0, ret_extra = c("nn")) expect_equal( iris10_transform_annoy$nn$cosine$idx, iris10_transform_nnd$nn$cosine$idx, check.attributes = FALSE ) expect_equal( iris10_transform_annoy$nn$cosine$dist, iris10_transform_nnd$nn$cosine$dist, check.attributes = FALSE, tol = 1e-6 ) }) test_that("nndescent gives correct correlation neighbor results", { testthat::skip_if_not_installed("rnndescent") iris10_annoy <- umap( iris10, n_neighbors = 4, nn_method = "annoy", ret_extra = c("nn"), ret_model = TRUE, n_epochs = 0, metric = "correlation" ) iris10_nnd <- umap( iris10, n_neighbors = 4, nn_method = "nndescent", ret_extra = c("nn"), ret_model = TRUE, n_epochs = 0, metric = "correlation" ) expect_equal(iris10_annoy$nn$correlation$idx, iris10_nnd$nn$correlation$idx, check.attributes = FALSE) expect_equal(iris10_annoy$nn$correlation$dist, iris10_nnd$nn$correlation$dist, check.attributes = FALSE, tol = 1e-6) iris10_transform_nnd <- umap_transform(iris10, iris10_nnd, n_epochs = 0, ret_extra = c("nn")) expect_equal( iris10_nnd$nn$correlation$idx, iris10_transform_nnd$nn$correlation$idx, check.attributes = FALSE ) expect_equal( iris10_nnd$nn$correlation$dist, iris10_transform_nnd$nn$correlation$dist, check.attributes = FALSE ) iris10_transform_annoy <- umap_transform(iris10, iris10_annoy, n_epochs = 0, ret_extra = c("nn")) expect_equal( iris10_transform_annoy$nn$correlation$idx, iris10_transform_nnd$nn$correlation$idx, check.attributes = FALSE ) expect_equal( iris10_transform_annoy$nn$correlation$dist, iris10_transform_nnd$nn$correlation$dist, check.attributes = FALSE, tol = 1e-6 ) }) test_that("nndescent gives correct correlation neighbor results and multiple threads", { testthat::skip_if_not_installed("rnndescent") iris10_annoy <- umap( iris10, n_neighbors = 4, nn_method = "annoy", ret_extra = c("nn"), ret_model = TRUE, n_epochs = 0, metric = "correlation" ) iris10_nnd <- umap( iris10, n_neighbors = 4, nn_method = "nndescent", ret_extra = c("nn"), ret_model = TRUE, n_epochs = 0, metric = "correlation", n_threads = 2 ) expect_equal(iris10_annoy$nn$correlation$idx, iris10_nnd$nn$correlation$idx, check.attributes = FALSE) 
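# An exact-neighbour cross-check is also possible by querying rnndescent
# directly; a hedged sketch (commented out; assumes rnndescent's
# brute_force_knn() helper with this signature):
# exact <- rnndescent::brute_force_knn(iris10, k = 4, metric = "correlation")
# expect_equal(iris10_nnd$nn$correlation$idx, exact$idx,
#   check.attributes = FALSE)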
expect_equal(iris10_annoy$nn$correlation$dist, iris10_nnd$nn$correlation$dist, check.attributes = FALSE, tol = 1e-6) iris10_transform_nnd <- umap_transform(iris10, iris10_nnd, n_epochs = 0, ret_extra = c("nn"), n_threads = 2) expect_equal( iris10_nnd$nn$correlation$idx, iris10_transform_nnd$nn$correlation$idx, check.attributes = FALSE ) expect_equal( iris10_nnd$nn$correlation$dist, iris10_transform_nnd$nn$correlation$dist, check.attributes = FALSE ) iris10_transform_annoy <- umap_transform(iris10, iris10_annoy, n_epochs = 0, ret_extra = c("nn")) expect_equal( iris10_transform_annoy$nn$correlation$idx, iris10_transform_nnd$nn$correlation$idx, check.attributes = FALSE ) expect_equal( iris10_transform_annoy$nn$correlation$dist, iris10_transform_nnd$nn$correlation$dist, check.attributes = FALSE, tol = 1e-6 ) model_with_args <- umap( iris10, n_neighbors = 4, n_epochs = 2, init = "spca", metric = "euclidean", verbose = FALSE, n_threads = 0, ret_model = TRUE, ret_extra = c("nn"), nn_method = "nndescent", nn_args = list( init = "rand", prune_reverse = TRUE, epsilon = 0.0 ) ) expect_equal( model_with_args$nn_args, list( init = "rand", prune_reverse = TRUE, epsilon = 0 ) ) }) uwot/tests/testthat/test_fuzzy_simplicial_set.R0000644000176200001440000001234414730166740021720 0ustar liggesuserslibrary(uwot) context("fuzzy simplicial set") ### Various fuzzy set matrices are defined in helper_fuzzy_sets.R # matrix # same as # umap.umap_.fuzzy_simplicial_set(iris10, 4, random_state=42, metric="euclidean") # as of 0.5 (and even earlier) res <- fuzzy_simplicial_set( set_op_mix_ratio = 1, local_connectivity = 1, bandwidth = 1, verbose = FALSE, n_threads = 0, nn = nn ) expect_equal(res, V_union, tol = 1e-4) # nnsp0 <- Matrix::sparseMatrix( # i = c(0, 4, 7, 9, 1, 2, 3, 9, 1, 2, 3, 6, 2, 3, 8, 9, 0, 4, 6, 7, 0, 4, 5, 7, 2, 3, 6, 7, 0, 4, 7, 9, 1, 2, 3, 8, 1, 2, 3, 9), # p = c(0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40), # x = c(0, 0.141421356237309, 0.173205080756888, 0.469041575982343, 0, 0.3, # 0.331662479035541, 0.173205080756888, 0.3, 0, 0.244948974278318, # 0.264575131106459, 0.244948974278318, 0, 0.3, 0.316227766016839, # 0.141421356237309, 0, 0.458257569495584, 0.223606797749979, # 0.616441400296898, 0.616441400296898, 0, 0.7, 0.264575131106459, # 0.33166247903554, 0, 0.424264068711929, 0.173205080756888, # 0.223606797749979, 0, 0.33166247903554, 0.509901951359279, # 0.435889894354067, 0.3, 0, 0.173205080756888, 0.316227766016838, # 0.316227766016839, 0), # dims = c(10, 10), # index1 = FALSE # ) # res <- fuzzy_simplicial_set( # set_op_mix_ratio = 1, local_connectivity = 1, # bandwidth = 1, verbose = FALSE, n_threads = 0, nn = Matrix::drop0(nnsp0) # )$matrix # expect_equal(res, V_union, tol = 1e-4) # mix union + intersection res <- fuzzy_simplicial_set( set_op_mix_ratio = 0.5, local_connectivity = 1, bandwidth = 1, verbose = FALSE, n_threads = 0, nn = nn ) expect_equal(res, V_mix, tol = 1e-4) # intersection res <- fuzzy_simplicial_set( set_op_mix_ratio = 0, local_connectivity = 1, bandwidth = 1, verbose = FALSE, n_threads = 0, nn = nn ) expect_equal(res, V_intersect, tol = 1e-4) # Union + local_connectivity res <- fuzzy_simplicial_set( set_op_mix_ratio = 1, local_connectivity = 1.5, bandwidth = 1, verbose = FALSE, n_threads = 0, nn = nn ) expect_equal(res, V_union_local, tol = 1e-4) # use unique iris nbrs to make comparison with Python UMAP easier # Union + bandwidth res <- fuzzy_simplicial_set( set_op_mix_ratio = 1, local_connectivity = 1, bandwidth = 0.5, verbose = FALSE, n_threads = 0, nn = self_unn4 ) 
expect_equal(res, V_union_bandwidth, tol = 1e-4) # intersect + local + bandwidth res <- fuzzy_simplicial_set( set_op_mix_ratio = 0, local_connectivity = 1.5, bandwidth = 0.5, verbose = FALSE, n_threads = 0, nn = self_unn4 ) expect_equal(res, V_intersect_local_bandwidth, tol = 1e-4) # parallel code path # matrix res <- fuzzy_simplicial_set( set_op_mix_ratio = 1, local_connectivity = 1, bandwidth = 1, verbose = FALSE, n_threads = 1, nn = nn ) expect_equal(res, V_union, tol = 1e-4) # mix union + intersection res <- fuzzy_simplicial_set( set_op_mix_ratio = 0.5, local_connectivity = 1, bandwidth = 1, verbose = FALSE, n_threads = 1, nn = nn ) expect_equal(res, V_mix, tol = 1e-4) # intersection res <- fuzzy_simplicial_set( set_op_mix_ratio = 0, local_connectivity = 1, bandwidth = 1, verbose = FALSE, n_threads = 1, nn = nn ) expect_equal(res, V_intersect, tol = 1e-4) # Union + local_connectivity res <- fuzzy_simplicial_set( set_op_mix_ratio = 1, local_connectivity = 1.5, bandwidth = 1, verbose = FALSE, n_threads = 1, nn = nn ) expect_equal(res, V_union_local, tol = 1e-4) # Union + bandwidth res <- fuzzy_simplicial_set( set_op_mix_ratio = 1, local_connectivity = 1, bandwidth = 0.5, verbose = FALSE, n_threads = 1, nn = self_unn4 ) expect_equal(res, V_union_bandwidth, tol = 1e-4) # intersect + local + bandwidth res <- fuzzy_simplicial_set( set_op_mix_ratio = 0, local_connectivity = 1.5, bandwidth = 0.5, verbose = FALSE, n_threads = 1, nn = self_unn4 ) expect_equal(res, V_intersect_local_bandwidth, tol = 1e-4) # parallel code path # matrix res <- fuzzy_simplicial_set( set_op_mix_ratio = 1, local_connectivity = 1, bandwidth = 1, verbose = FALSE, n_threads = 1, nn = nn ) expect_equal(res, V_union, tol = 1e-4) # mix union + intersection res <- fuzzy_simplicial_set( set_op_mix_ratio = 0.5, local_connectivity = 1, bandwidth = 1, verbose = FALSE, n_threads = 1, nn = nn ) expect_equal(res, V_mix, tol = 1e-4) # intersection res <- fuzzy_simplicial_set( set_op_mix_ratio = 0, local_connectivity = 1, bandwidth = 1, verbose = FALSE, n_threads = 1, nn = nn ) expect_equal(res, V_intersect, tol = 1e-4) # Union + local_connectivity res <- fuzzy_simplicial_set( set_op_mix_ratio = 1, local_connectivity = 1.5, bandwidth = 1, verbose = FALSE, n_threads = 1, nn = nn ) expect_equal(res, V_union_local, tol = 1e-4) # Union + bandwidth res <- fuzzy_simplicial_set( set_op_mix_ratio = 1, local_connectivity = 1, bandwidth = 0.5, verbose = FALSE, n_threads = 1, nn = self_unn4 ) expect_equal(res, V_union_bandwidth, tol = 1e-4) # intersect + local + bandwidth res <- fuzzy_simplicial_set( set_op_mix_ratio = 0, local_connectivity = 1.5, bandwidth = 0.5, verbose = FALSE, n_threads = 1, nn = self_unn4 ) expect_equal(res, V_intersect_local_bandwidth, tol = 1e-4) uwot/tests/testthat/test_curve.R0000644000176200001440000000037213631262762016572 0ustar liggesuserslibrary(uwot) context("Curve Parameters") expect_equal(as.vector(find_ab_params(spread = 1, min_dist = 0.001)), c(1.929, 0.792), tol = 1e-3 ) expect_equal(as.vector(find_ab_params(spread = 1, min_dist = 0.1)), c(1.577, 0.895), tol = 1e-3 ) uwot/tests/testthat/test_spectral.R0000644000176200001440000000301614730166740017261 0ustar liggesuserslibrary(uwot) library(RSpectra) context("Spectral") test_that("1 dimensional output gives a matrix", { expect_ok_matrix(spectral_init(V_union, ndim = 1, verbose = FALSE), nc = 1) expect_ok_matrix(normalized_laplacian_init(V_union, ndim = 1, verbose = FALSE ), nc = 1) expect_ok_matrix(laplacian_eigenmap(V_union, ndim = 1, verbose = FALSE), nc = 
1 ) # 23: ndim was always 2 expect_ok_matrix(agspectral_init(V_union, n_neg_nbrs = 2, ndim = 1, verbose = FALSE), nc = 1 ) }) test_that("connected components", { # Example from doc of scipy.sparse.csgraph.connected_components graph <- as(Matrix::drop0(matrix( c( 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0 ), nrow = 5, byrow = TRUE )), "generalMatrix") cc_res <- connected_components(graph) expect_equal(cc_res$n_components, 2) expect_equal(cc_res$labels, c(0, 0, 0, 1, 1)) # Slightly more complicated example validated by running the Python version graph100 <- matrix(0, nrow = 10, ncol = 10) graph100[cbind(c(2, 6, 7, 8), c(5, 3, 7, 6))] <- 1 graph100 <- Matrix::drop0(graph100) g100_nc <- 7 g100_labels <- c(0, 1, 2, 3, 1, 2, 4, 2, 5, 6) cc_res <- connected_components(graph100) expect_equal(cc_res$n_components, g100_nc) expect_equal(cc_res$labels, g100_labels) # test recursive initialization of components sgraph <- graph + Matrix::t(graph) expect_ok_matrix(spectral_init(sgraph), nr = 5, nc = 2) }) uwot/tests/testthat/test_normlaplacian.R0000644000176200001440000001024714730166740020270 0ustar liggesuserslibrary(uwot) library(RSpectra) context("normalized laplacian") # this exists as a separate file only because it's easier to comment out as # part of temporarily removing any function calls to rspectra when using rhub # with sanitizers and valgrind (probably the extended compilation time with # eigen causes a preperror) # These numbers come from running UMAP Python code: # spectral_layout(pairwise_distances(iris.data[0:10, :])) # NB: # 1. iris data in scikit-learn is currently from UCI repo, which has errors # (although this doesn't affect the first ten entries) # 2. eigenvector calculation is not that converged and specifies a starting # vector that we can't supply with either RSpectra or eigen. # 3. The eigenvectors are only identical up to a sign, so we take the absolute # values. abs_expected_norm_lap <- abs( c2y( 0.7477, -0.1292, -0.03001, 0.02127, -0.563, -0.01149, 0.1402, -0.2725, -0.01241, 0.1084, -0.106, -0.5723, 0.2024, -0.3082, 0.1642, -5.549e-05, -0.04843, -0.1747, 0.1684, 0.6611 ) ) sparse_m <- Matrix::drop0(x2d(iris[1:10, ])) test_that("normalized laplacian", { res <- normalized_laplacian_init(sparse_m) expect_equal(abs(res), abs_expected_norm_lap, tolerance = 0.2) }) test_that("irlba tsvd normalized", { res <- irlba_tsvd_normalized_laplacian_init(sparse_m) expect_equal(abs(res), abs_expected_norm_lap, tolerance = 0.2) }) test_that("irlba normalized", { res <- irlba_normalized_laplacian_init(sparse_m) expect_equal(abs(res), abs_expected_norm_lap, tolerance = 0.2) }) test_that("laplacian eigenmap", { # tested via sklearn # things to note: # 1. output eigenvectors are not scaled to 1, due to the D^-1/2 transformation # from Lsym's eigenvectors back to Lrw # 2. Lsym is formed by calling # scipy.sparse.csgraph.laplacian(normed=True) on the affinity matrix, # which assumes the diagonal is zero. 
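#
# The same construction in R is, roughly (a hedged sketch; `A` is the
# zero-diagonal affinity matrix defined further down):
#   Dih <- Matrix::Diagonal(x = 1 / sqrt(Matrix::rowSums(A)))
#   Lsym <- Matrix::Diagonal(nrow(A)) - Dih %*% A %*% Dih
# If Lsym %*% u = lambda * u, then Dih %*% u is an eigenvector of the
# random-walk laplacian with the same eigenvalue, which is the D^-1/2
# transformation that note 1 above refers to.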
# # symmetrized normalized graph laplacian # from sklearn.preprocessing import normalize # from sklearn.datasets import load_digits # from sklearn.manifold import SpectralEmbedding # X, _ = load_digits(return_X_y=True) # embedding = SpectralEmbedding(n_components=3, n_neighbors=4, affinity="rbf", gamma = 1e-3) # X_transformed = embedding.fit_transform(X[:10]) # normalize(X_transformed, axis = 0) expected_lap_eig <- matrix( c( 0.21050269, -0.07732118, 0.63486516, -0.33501476, 0.11755963, -0.40229306, -0.36728785, 0.38404235, 0.020391, 0.20458482, -0.04123934, -0.44198941, -0.3841261, -0.47833969, 0.17196966, 0.3883986, -0.03743132, -0.22790212, -0.36483447, -0.32492041, 0.01860336, -0.27419176, 0.68954246, 0.34392682, 0.04537549, 0.14056785, 0.12175907, 0.39742651, -0.00077821, 0.15609656 ), ncol = 3, byrow = TRUE ) A <- matrix( c( 0, 0.0288109, 0.053397, 0.104038, 0.079341, 0.145439, 0.0946093, 0.0434563, 0.139317, 0.189191, 0.0288109, 0, 0.176753, 0.126438, 0.100761, 0.108501, 0.197306, 0.0744967, 0.0940433, 0.0614212, 0.053397, 0.176753, 0, 0.0544213, 0.0662712, 0.0462358, 0.124431, 0.0876854, 0.162838, 0.0494398, 0.104038, 0.126438, 0.0544213, 0, 0.0725848, 0.322066, 0.107206, 0.0395971, 0.164969, 0.130029, 0.079341, 0.100761, 0.0662712, 0.0725848, 0, 0.0532904, 0.255125, 0.0290714, 0.0634819, 0.0482673, 0.145439, 0.108501, 0.0462358, 0.322066, 0.0532904, 0, 0.0748701, 0.0202419, 0.187683, 0.380222, 0.0946093, 0.197306, 0.124431, 0.107206, 0.255125, 0.0748701, 0, 0.0273237, 0.142702, 0.0647643, 0.0434563, 0.0744967, 0.0876854, 0.0395971, 0.0290714, 0.0202419, 0.0273237, 0, 0.0584841, 0.0431531, 0.139317, 0.0940433, 0.162838, 0.164969, 0.0634819, 0.187683, 0.142702, 0.0584841, 0, 0.158817, 0.189191, 0.0614212, 0.0494398, 0.130029, 0.0482673, 0.380222, 0.0647643, 0.0431531, 0.158817, 0 ), nrow = 10 ) res <- laplacian_eigenmap(A, ndim = 3) expect_equal(abs(res), abs(expected_lap_eig), tolerance = 1e-4) expect_equal(abs(irlba_laplacian_eigenmap(A, ndim = 3)), abs(expected_lap_eig), tolerance = 1e-4) }) uwot/tests/testthat/test_saveload.R0000644000176200001440000002566414733074465017264 0ustar liggesuserslibrary(uwot) library(RSpectra) context("load/save model") test_that("can save and load simple model", { set.seed(1337) model <- umap(iris10, n_neighbors = 4, n_epochs = 2, init = "spca", metric = "euclidean", verbose = FALSE, n_threads = 0, ret_model = TRUE ) mod_fname <- tempfile(tmpdir = tempdir()) model <- save_uwot(model, file = mod_fname) expect_true(file.exists(mod_fname)) # Can use model after saving set.seed(1337) res_trans <- umap_transform(iris10, model) expect_ok_matrix(res_trans) # Clean up temp dir from saving expect_true(file.exists(model$mod_dir)) unload_uwot(model) expect_false(file.exists(model$mod_dir)) # Can't use transform now model is unloaded expect_error(umap_transform(iris10, model), "is unloaded") modelload <- load_uwot(file = mod_fname) set.seed(1337) resload_trans <- umap_transform(iris10, modelload) expect_ok_matrix(resload_trans) expect_equal(resload_trans, res_trans) if (file.exists(mod_fname)) { unlink(mod_fname) } # Clean up temp dir from loading expect_true(file.exists(modelload$mod_dir)) unload_uwot(modelload) expect_false(file.exists(modelload$mod_dir)) }) test_that("can save and load mixed distance model", { set.seed(1337) jiris10 <- jitter(iris10) metric2 <- list( "euclidean" = c(1, 2), "cosine" = c("Petal.Length", "Petal.Width") ) model <- umap(jiris10, n_neighbors = 4, n_epochs = 2, init = "spca", metric = metric2, verbose = FALSE, n_threads = 0, 
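# metric2 splits the input columns into two blocks: euclidean on columns 1-2
# and cosine on Petal.Length/Petal.Width. Each block gets its own Annoy
# index, so the saved model archive should contain one index per block.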
ret_nn = TRUE, ret_model = TRUE ) mod_fname <- tempfile(tmpdir = tempdir()) model <- save_uwot(model, file = mod_fname) expect_true(file.exists(mod_fname)) # Can use model after saving set.seed(1337) res_trans <- umap_transform(jiris10, model) expect_ok_matrix(res_trans) # Clean up temp dir from saving expect_true(file.exists(model$mod_dir)) unload_uwot(model) expect_false(file.exists(model$mod_dir)) # Can't use transform now model is unloaded expect_error(umap_transform(iris10, model), "is unloaded") modelload <- load_uwot(file = mod_fname) set.seed(1337) resload_trans <- umap_transform(jiris10, modelload) expect_ok_matrix(resload_trans) expect_equal(resload_trans, res_trans) if (file.exists(mod_fname)) { unlink(mod_fname) } # Clean up temp dir from loading expect_true(file.exists(modelload$mod_dir)) unload_uwot(modelload) expect_false(file.exists(modelload$mod_dir)) }) test_that("unloading a model on save", { set.seed(1337) model <- umap(iris10, n_neighbors = 4, n_epochs = 2, init = "spca", metric = "euclidean", verbose = FALSE, n_threads = 0, ret_model = TRUE ) mod_fname <- tempfile(tmpdir = tempdir()) model <- save_uwot(model, file = mod_fname, unload = TRUE) expect_false(file.exists(model$mod_dir)) # Trying to transform with a model that got unloaded won't work expect_error(umap_transform(iris10, model), "is unloaded") modelload <- load_uwot(file = mod_fname) # Clean up temp dir from loading expect_true(file.exists(modelload$mod_dir)) # Can avoid cleaning up if you really want that unload_uwot(modelload, cleanup = FALSE) expect_true(file.exists(modelload$mod_dir)) # Can unload multiple times unload_uwot(modelload, cleanup = TRUE) expect_false(file.exists(modelload$mod_dir)) }) # #88 test_that("save-load-save", { set.seed(1337) X <- matrix(rnorm(100), 10, 10) model <- uwot::umap(X, n_neighbors = 4, ret_model = TRUE) model_file <- tempfile(tmpdir = tempdir()) model <- uwot::save_uwot(model, file = model_file) model2 <- uwot::load_uwot(file = model_file) new_file <- tempfile(tmpdir = tempdir()) uwot::save_uwot(model2, file = new_file) expect_true(file.exists(new_file)) modelm <- uwot::umap(X, n_neighbors = 4, metric = list("euclidean" = 1:5, "euclidean" = 6:10), ret_model = TRUE) modelm_file <- tempfile(tmpdir = tempdir()) modelm <- uwot::save_uwot(modelm, file = modelm_file) modelm2 <- uwot::load_uwot(file = modelm_file) new_filem <- tempfile(tmpdir = tempdir()) uwot::save_uwot(modelm2, file = new_filem) expect_true(file.exists(new_filem)) }) # #117 correlation metric not correctly restored test_that("reload-correlation", { set.seed(1337) model <- umap(iris10, n_neighbors = 4, n_epochs = 2, init = "spca", metric = "correlation", verbose = FALSE, n_threads = 0, ret_model = TRUE, ret_extra = c("nn") ) expect_equal(names(model$metric), "correlation") expect_equal(model$nn_index$metric, "correlation") set.seed(1337) transformed_before_reload <- umap_transform(iris10, model, n_epochs = 2, ret_extra = c("nn")) expect_equal( transformed_before_reload$nn$correlation$dist, model$nn$correlation$dist, check.attributes = FALSE, tol = 1e-7 ) mod_fname <- tempfile(tmpdir = tempdir()) model <- save_uwot(model, file = mod_fname, unload = TRUE) modelload <- load_uwot(file = mod_fname) expect_equal(names(model$metric), "correlation") expect_equal(model$nn_index$metric, "correlation") set.seed(1337) transformed_after_reload <- umap_transform(iris10, modelload, n_epochs = 2, ret_extra = c("nn")) expect_equal( transformed_after_reload$nn$correlation$dist, model$nn$correlation$dist, check.attributes = FALSE, 
tol = 1e-7 ) if (file.exists(mod_fname)) { unlink(mod_fname) } expect_true(file.exists(modelload$mod_dir)) unload_uwot(modelload) expect_false(file.exists(modelload$mod_dir)) }) test_that("save-load hnsw", { testthat::skip_if_not_installed("RcppHNSW") set.seed(1337) model <- umap(iris10, n_neighbors = 4, n_epochs = 2, init = "spca", metric = "euclidean", verbose = FALSE, n_threads = 0, ret_model = TRUE, ret_extra = c("nn"), nn_method = "hnsw" ) expect_equal(model$nn_method, "hnsw") set.seed(1337) transformed_before_reload <- umap_transform(iris10, model, n_epochs = 2, ret_extra = c("nn") ) mod_fname <- tempfile(tmpdir = tempdir()) model <- save_uwot(model, file = mod_fname, unload = TRUE) modelload <- load_uwot(file = mod_fname) expect_equal(modelload$nn_method, "hnsw") set.seed(1337) transformed_after_reload <- umap_transform(iris10, modelload, n_epochs = 2, ret_extra = c("nn") ) if (file.exists(mod_fname)) { unlink(mod_fname) } expect_true(file.exists(modelload$mod_dir)) unload_uwot(modelload) expect_false(file.exists(modelload$mod_dir)) expect_equal(model$nn$euclidean$idx, modelload$nn$euclidean$idx) expect_equal(model$nn$euclidean$dist, modelload$nn$euclidean$dist) expect_equal( transformed_before_reload$nn$euclidean$idx, transformed_after_reload$nn$euclidean$idx, ) expect_equal( transformed_before_reload$nn$euclidean$dist, transformed_after_reload$nn$euclidean$dist, check.attributes = FALSE, tol = 1e-7 ) expect_equal( transformed_before_reload$embedding, transformed_after_reload$embedding ) }) test_that("save-load nndescent", { testthat::skip_if_not_installed("rnndescent") set.seed(1337) model <- umap(iris10, n_neighbors = 4, n_epochs = 2, init = "spca", metric = "euclidean", verbose = FALSE, n_threads = 0, ret_model = TRUE, ret_extra = c("nn"), nn_method = "nndescent" ) expect_equal(model$nn_method, "nndescent") set.seed(1337) transformed_before_reload <- umap_transform(iris10, model, n_epochs = 2, ret_extra = c("nn") ) mod_fname <- tempfile(tmpdir = tempdir()) model <- save_uwot(model, file = mod_fname, unload = TRUE) modelload <- load_uwot(file = mod_fname) expect_equal(modelload$nn_method, "nndescent") set.seed(1337) transformed_after_reload <- umap_transform(iris10, modelload, n_epochs = 2, ret_extra = c("nn") ) if (file.exists(mod_fname)) { unlink(mod_fname) } expect_true(file.exists(modelload$mod_dir)) unload_uwot(modelload) expect_false(file.exists(modelload$mod_dir)) expect_equal(model$nn$euclidean$idx, modelload$nn$euclidean$idx) expect_equal(model$nn$euclidean$dist, modelload$nn$euclidean$dist) expect_equal( transformed_before_reload$nn$euclidean$idx, transformed_after_reload$nn$euclidean$idx, ) expect_equal( transformed_before_reload$nn$euclidean$dist, transformed_after_reload$nn$euclidean$dist, check.attributes = FALSE, tol = 1e-7 ) expect_equal( transformed_before_reload$embedding, transformed_after_reload$embedding ) mod_fname2 <- tempfile(tmpdir = tempdir()) saveRDS(modelload, mod_fname2) modelload2 <- readRDS(mod_fname2) expect_equal(modelload2$nn_method, "nndescent") set.seed(1337) transformed_after_reload2 <- umap_transform(iris10, modelload2, n_epochs = 2, ret_extra = c("nn") ) expect_equal( transformed_after_reload$nn$euclidean$idx, transformed_after_reload2$nn$euclidean$idx, ) expect_equal( transformed_after_reload$nn$euclidean$dist, transformed_after_reload2$nn$euclidean$dist, check.attributes = FALSE, tol = 1e-7 ) expect_equal( transformed_after_reload$embedding, transformed_after_reload2$embedding ) if (file.exists(mod_fname2)) { unlink(mod_fname2) } }) # 131: 
can't use a relative path for saving test_that("save-load relative path", { set.seed(1337) model <- umap(iris10, n_neighbors = 4, n_epochs = 2, init = "spca", metric = "euclidean", verbose = FALSE, n_threads = 0, ret_model = TRUE ) # remember to go back to the original working directory old_wd <- getwd() on.exit(setwd(old_wd), add = TRUE) # move to a temp directory for this test test_dir <- tempfile("test_relative_path_") dir.create(test_dir) setwd(test_dir) # Test 1: relative path no sub-folders rel_path <- "model.uwot" model <- save_uwot(model, file = rel_path) expect_true(file.exists(rel_path)) modelload <- load_uwot(file = rel_path) set.seed(1337) resload_trans <- umap_transform(iris10, modelload) expect_ok_matrix(resload_trans) # Test 2: Relative path within a sub-folder dir.create("my_folder") rel_sub_path <- file.path("my_folder", "model.uwot") model <- save_uwot(model, file = rel_sub_path) expect_true(file.exists(rel_sub_path)) modelload <- load_uwot(file = rel_sub_path) set.seed(1337) resload_trans <- umap_transform(iris10, modelload) expect_ok_matrix(resload_trans) }) uwot/tests/testthat/test_epochs.R0000644000176200001440000000117314276620504016725 0ustar liggesuserslibrary(uwot) context("Epochs") V <- fuzzy_simplicial_set(nn = nn) n_epochs <- 500 V@x[V@x < max(V@x) / n_epochs] <- 0 V <- Matrix::drop0(V) expect_equal(make_epochs_per_sample(V@x, n_epochs), c( 1.0, 1.0, 1.0, 6.1763447, 1.44948659, 2.16709542, 2.35197392, 1.0, 1.44948659, 1.0, 1.0, 1.73965964, 2.00000768, 2.16709542, 1.0, 1.54738133, 1.0, 1.37424666, 1.0, 1.0, 3.56216605, 1.09552314, 1.0, 1.0, 1.0, 1.54738133, 3.56216605, 2.82683975, 1.0, 1.09552314, 2.82683975, 3.21967221, 2.35197392, 1.73965964, 1.0, 6.1763447, 1.0, 2.00000768, 1.37424666, 3.21967221 ), tol = 1e-5 ) uwot/tests/testthat/test_knn_aff.R0000644000176200001440000000323414730166740017050 0ustar liggesuserslibrary(uwot) context("knn affinity") expected_sparse <- matrix(0, nrow = 10, ncol = 10) for (i in seq_len(nrow(nn$idx))) { for (j in seq_len(ncol(nn$idx))) { expected_sparse[i, nn$idx[i, j]] <- 2 } } expected_sparse <- Matrix::drop0(expected_sparse) res <- nng_to_sparse(nn$idx, val = 2) expect_equal(res, expected_sparse) v <- 1 expected_sparse_mv <- matrix(0, nrow = 10, ncol = 10) for (i in seq_len(nrow(nn$idx))) { nnr <- sort(nn$idx[i, ]) for (j in seq_len(ncol(nn$idx))) { expected_sparse_mv[i, nnr[j]] <- v v <- v + 1 } } expect_equal(nng_to_sparse(nn$idx, matrix(1:40, nrow = 10, byrow = TRUE)), Matrix::drop0(expected_sparse_mv), check.attributes = FALSE ) res <- perplexity_similarities(iris10, 4, kernel = "knn", nn = nn)$matrix expected_sym_nn_graph <- matrix(0, nrow = 10, ncol = 10) o3 <- 1 / 3 o6 <- 1 / 6 expected_sym_nn_graph[1, c(5, 6, 8, 10)] <- c(o3, o6, o3, o6) expected_sym_nn_graph[2, c(3, 4, 9, 10)] <- c(o3, o6, o6, o3) expected_sym_nn_graph[3, c(2, 4, 7, 9, 10)] <- c(o3, o3, o3, o6, o6) expected_sym_nn_graph[4, c(2, 3, 7, 9, 10)] <- c(o6, o3, o6, o3, o3) expected_sym_nn_graph[5, c(1, 6, 7, 8)] <- c(o3, o6, o6, o3) expected_sym_nn_graph[6, c(1, 5, 8)] <- c(o6, o6, o6) expected_sym_nn_graph[7, c(3, 4, 5, 8)] <- c(o3, o6, o6, o6) expected_sym_nn_graph[8, c(1, 5, 6, 7, 10)] <- c(o3, o3, o6, o6, o6) expected_sym_nn_graph[9, c(2, 3, 4)] <- c(o6, o6, o3) expected_sym_nn_graph[10, c(1, 2, 3, 4, 8)] <- c(o6, o3, o6, o3, o6) expect_equal(sum(res), 10) expect_true(Matrix::isSymmetric(res)) expect_equal(as.matrix(res), expected_sym_nn_graph, check.attributes = FALSE, tol = 1e-7 ) 
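# The o3/o6 pattern above comes from symmetrization: with the knn kernel and
# k = 4 (self included), each point spreads weight 1/3 over its 3 neighbours,
# and averaging P with t(P) leaves 1/3 for mutual pairs and 1/6 for one-way
# pairs. A standalone illustration (P and Psym are local to this sketch):
P <- matrix(0, nrow = 3, ncol = 3)
P[1, 2] <- 1
P[2, 1] <- 1 # 1 and 2 are mutual neighbours
P[2, 3] <- 1 # 2 -> 3 is one-way
Psym <- (P + t(P)) / 2
stopifnot(Psym[1, 2] == 1, Psym[2, 3] == 0.5, isSymmetric(Psym))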
uwot/tests/testthat/helper_data.R0000644000176200001440000000737514730166740016671 0ustar liggesusers# Small -ve distances are possible dist2 <- function(X) { D2 <- rowSums(X * X) D2 + sweep(X %*% t(X) * -2, 2, t(D2), `+`) } # Squared Euclidean distances, ensuring no small -ve distances can occur safe_dist2 <- function(X) { D2 <- dist2(X) D2[D2 < 0] <- 0 D2 } # convert dataframe to distance matrix x2d <- function(X) { sqrt(safe_dist2(x2m(X))) } # Covert a vector into a 2D matrix for generating Y output c2y <- function(...) { matrix(unlist(list(...)), ncol = 2) } iris10 <- NULL iris10_Y <- NULL diris10 <- NULL dmiris10 <- NULL dmiris10z <- NULL ycat <- NULL ycat2 <- NULL ynum <- NULL ynum2 <- NULL nn <- NULL ui10 <- NULL unn4 <- NULL self_unn4 <- NULL create_data <- function() { iris10 <<- x2m(iris[1:10, ]) iris10_Y <<- pca_init(iris10, ndim = 2) diris10 <<- stats::dist(iris10) # Sparse iris10 dist dmiris10 <<- as.matrix(diris10) dmiris10zl <- dmiris10 dmiris10zl[dmiris10zl > 0.71] <- 0 dmiris10z <<- as(Matrix::drop0(dmiris10zl), "generalMatrix") # some Y data ycat <<- as.factor(c(levels(iris$Species)[rep(1:3, each = 3)], NA)) ycat2 <<- as.factor(c(NA, levels(iris$Species)[rep(1:3, times = 3)])) ynum <<- (1:10) / 10 ynum2 <<- seq(from = 10, to = -10, length.out = 10) / 100 nnl <- find_nn(iris10, k = 4, method = "fnn", metric = "euclidean", n_threads = 0, verbose = FALSE ) row.names(nnl$idx) <- row.names(iris10) row.names(nnl$dist) <- row.names(iris10) nn <<- nnl # ten iris entries where the 4 nearest neighbors are distinct uiris <- unique(iris) uirism <- as.matrix(uiris[, -5]) ui10 <<- uirism[6:15, ] unn4 <<- list( idx = matrix(c( 6, 10, 3, 7, 7, 3, 5, 8, 7, 5, 2, 8, 9, 8, 2, 5, 8, 3, 7, 2, 1, 3, 10, 7, 3, 2, 5, 8, 5, 4, 7, 3, 4, 8, 2, 5, 6, 1, 3, 7 ), nrow = 10, byrow = TRUE), dist = matrix(c( 0.3464102, 0.6782330, 0.7000000, 0.8124038, 0.3000000, 0.4242641, 0.4795832, 0.4898979, 0.2236068, 0.3316625, 0.4242641, 0.4690416, 0.3464102, 0.4242641, 0.5477226, 0.5567764, 0.1732051, 0.3316625, 0.3464102, 0.4795832, 0.3464102, 0.5000000, 0.5830952, 0.6782330, 0.2236068, 0.3000000, 0.3464102, 0.4582576, 0.1732051, 0.4242641, 0.4582576, 0.4690416, 0.3464102, 0.5830952, 0.6164414, 0.7280110, 0.5830952, 0.6782330, 1.0440307, 1.2328828 ), nrow = 10, byrow = TRUE) ) self_unn4 <<- list( idx = matrix(c( 1, 6, 10, 3, 2, 7, 3, 5, 3, 7, 5, 2, 4, 9, 8, 2, 5, 8, 3, 7, 6, 1, 3, 10, 7, 3, 2, 5, 8, 5, 4, 7, 9, 4, 8, 2, 10, 6, 1, 3 ), nrow = 10, byrow = TRUE), dist = matrix(c( 0, 0.3464102, 0.6782330, 0.7000000, 0, 0.3000000, 0.4242641, 0.4795832, 0, 0.2236068, 0.3316625, 0.4242641, 0, 0.3464102, 0.4242641, 0.5477226, 0, 0.1732051, 0.3316625, 0.3464102, 0, 0.3464102, 0.5000000, 0.5830952, 0, 0.2236068, 0.3000000, 0.3464102, 0, 0.1732051, 0.4242641, 0.4582576, 0, 0.3464102, 0.5830952, 0.6164414, 0, 0.5830952, 0.6782330, 1.0440307 ), nrow = 10, byrow = TRUE) ) } # Just test that res is a matrix with valid numbers expect_ok_matrix <- function(res, nr = 10, nc = 2) { expect_is(res, "matrix") expect_equal(nrow(res), nr) expect_equal(ncol(res), nc) expect_false(any(is.infinite(res))) } expect_is_nn <- function(res, nr = 10, k = 4) { expect_is(res, "list") expect_is_nn_matrix(res$dist, nr, k) expect_is_nn_matrix(res$idx, nr, k) } expect_is_nn_matrix <- function(res, nr = 10, k = 4) { expect_is(res, "matrix") expect_equal(nrow(res), nr) expect_equal(ncol(res), k) } create_data() uwot/tests/testthat/test_perplexity.R0000644000176200001440000002563314730166740017662 0ustar liggesuserslibrary(uwot) context("perplexity") # 
Full neighbor values based on comparison with smallvis results iris10_nn10 <- dist_nn(stats::dist(iris10), k = 10) P_symm <- matrix(c( 0.000000e+00, 0.0022956859, 0.0022079944, 0.0004763074, 4.338953e-02, 1.822079e-02, 0.002913239, 0.0413498285, 5.184416e-05, 0.004134502, 2.295686e-03, 0.0000000000, 0.0188919615, 0.0129934442, 1.089032e-03, 5.689921e-04, 0.002131646, 0.0048261793, 6.996252e-03, 0.050676976, 2.207994e-03, 0.0188919615, 0.0000000000, 0.0444964580, 2.464225e-03, 5.935835e-04, 0.040353636, 0.0027720360, 1.111298e-02, 0.013673490, 4.763074e-04, 0.0129934442, 0.0444964580, 0.0000000000, 5.466455e-04, 2.771325e-04, 0.018028275, 0.0005761904, 3.389471e-02, 0.014363302, 4.338953e-02, 0.0010890318, 0.0024642250, 0.0005466455, 0.000000e+00, 1.831834e-02, 0.006393040, 0.0329052015, 5.372241e-05, 0.002356628, 1.822079e-02, 0.0005689921, 0.0005935835, 0.0002771325, 1.831834e-02, 0.000000e+00, 0.001326343, 0.0110122168, 1.065771e-05, 0.001168212, 2.913239e-03, 0.0021316462, 0.0403536359, 0.0180282748, 6.393040e-03, 1.326343e-03, 0.000000000, 0.0059083283, 4.862680e-03, 0.002656313, 4.134983e-02, 0.0048261793, 0.0027720360, 0.0005761904, 3.290520e-02, 1.101222e-02, 0.005908328, 0.0000000000, 2.982247e-04, 0.012212476, 5.184416e-05, 0.0069962518, 0.0111129834, 0.0338947056, 5.372241e-05, 1.065771e-05, 0.004862680, 0.0002982247, 0.000000e+00, 0.004150755, 4.134502e-03, 0.0506769759, 0.0136734904, 0.0143633019, 2.356628e-03, 1.168212e-03, 0.002656313, 0.0122124758, 4.150755e-03, 0.000000000 ) * 10, nrow = 10, byrow = TRUE) res <- perplexity_similarities( perplexity = 4, verbose = FALSE, nn = find_nn(iris10, k = 10, method = "fnn", metric = "euclidean", n_threads = 0, verbose = FALSE ) )$matrix expect_true(Matrix::isSymmetric(res)) expect_equal(as.matrix(res), P_symm, tol = 1e-5, check.attributes = FALSE) Psymm9 <- matrix( c( 0, 0.1111, 0.1112, 0.1110, 0.1116, 0.1113, 0.1112, 0.1115, 0.1106, 0.1112, 0.1111, 0, 0.1113, 0.1113, 0.1110, 0.1107, 0.1112, 0.1112, 0.1112, 0.1114, 0.1112, 0.1113, 0, 0.1113, 0.1112, 0.1106, 0.1114, 0.1112, 0.1112, 0.1113, 0.1110, 0.1113, 0.1113, 0, 0.1110, 0.1105, 0.1114, 0.1111, 0.1113, 0.1114, 0.1116, 0.1110, 0.1112, 0.1110, 0, 0.1113, 0.1113, 0.1115, 0.1106, 0.1111, 0.1113, 0.1107, 0.1106, 0.1105, 0.1113, 0, 0.1105, 0.1111, 0.1103, 0.1105, 0.1112, 0.1112, 0.1114, 0.1114, 0.1113, 0.1105, 0, 0.1113, 0.1112, 0.1112, 0.1115, 0.1112, 0.1112, 0.1111, 0.1115, 0.1111, 0.1113, 0, 0.1108, 0.1114, 0.1106, 0.1112, 0.1112, 0.1113, 0.1106, 0.1103, 0.1112, 0.1108, 0, 0.1111, 0.1112, 0.1114, 0.1113, 0.1114, 0.1111, 0.1105, 0.1112, 0.1114, 0.1111, 0 ), byrow = TRUE, nrow = 10 ) res <- perplexity_similarities( perplexity = 9, verbose = FALSE, nn = find_nn(iris10, k = 10, method = "fnn", metric = "euclidean", n_threads = 0, verbose = FALSE ) )$matrix expect_true(Matrix::isSymmetric(res)) expect_equal(as.matrix(res), Psymm9, tol = 1e-4, check.attributes = FALSE) P_symm_6nn <- matrix(c( 0, 0, 0.004227396, 0, 0.038581602, 0.016370215, 0.003972948, 0.037491042, 0, 0.007253571, 0, 0, 0.020541010, 0.01457322, 0, 0, 0, 0.008117719, 0.008608916, 0.043891243, 0.004227396, 0.020541010, 0, 0.04314614, 0.004242199, 0, 0.036275982, 0.004791681, 0.010952319, 0.015666352, 0, 0.014573224, 0.043146139, 0, 0, 0, 0.018725165, 0, 0.032811238, 0.015644628, 0.038581602, 0, 0.004242199, 0, 0, 0.016370215, 0.010365583, 0.031963895, 0, 0.003730662, 0.016370215, 0, 0, 0, 0.016370215, 0, 0.002795087, 0.011902114, 0, 0.002562369, 0.003972948, 0, 0.036275982, 0.01872517, 0.010365583, 0.002795087, 0, 0.006321792, 
0.004717900, 0.003609179, 0.037491042, 0.008117719, 0.004791681, 0, 0.031963895, 0.011902114, 0.006321792, 0, 0, 0.015406444, 0, 0.008608916, 0.010952319, 0.03281124, 0, 0, 0.004717900, 0, 0, 0.004370167, 0.007253571, 0.043891243, 0.015666352, 0.01564463, 0.003730662, 0.002562369, 0.003609179, 0.015406444, 0.004370167, 0 ) * 10, nrow = 10, byrow = TRUE) res <- perplexity_similarities( perplexity = 4, verbose = FALSE, nn = find_nn(iris10, k = 6, method = "fnn", metric = "euclidean", n_threads = 0, verbose = FALSE ) )$matrix expect_true(Matrix::isSymmetric(res)) expect_equal(as.matrix(res), P_symm_6nn, tol = 1e-5, check.attributes = FALSE) # x2aff(stats::dist(iris10), perplexity = 4) P_row <- matrix(c( 0.000000e+00, 0.03254778, 0.04322171, 0.009522236, 4.179712e-01, 1.389888e-02, 0.03932256, 0.3802648571, 1.633620e-04, 0.06308741, 1.336594e-02, 0.00000000, 0.21654628, 0.163906282, 4.387114e-03, 4.819686e-08, 0.02029701, 0.0618376045, 2.029701e-02, 0.49936271, 9.381792e-04, 0.16129295, 0.00000000, 0.400023323, 9.381792e-04, 7.502536e-16, 0.29552576, 0.0143118438, 7.811162e-03, 0.11915861, 3.912338e-06, 0.09596260, 0.48990584, 0.000000000, 3.912338e-06, 1.913538e-19, 0.09596260, 0.0009992458, 1.842071e-01, 0.13295484, 4.498193e-01, 0.01739352, 0.04834632, 0.010928997, 0.000000e+00, 1.584988e-02, 0.07694327, 0.3403719404, 2.009270e-04, 0.04014584, 3.505169e-01, 0.01137979, 0.01187167, 0.005542650, 3.505169e-01, 0.000000e+00, 0.02652673, 0.2200679860, 2.131340e-04, 0.02336422, 1.894222e-02, 0.02233591, 0.51154696, 0.264602894, 5.091753e-02, 1.331050e-07, 0.00000000, 0.0834805843, 1.155348e-02, 0.03662029, 4.467317e-01, 0.03468598, 0.04112888, 0.010524562, 3.177321e-01, 1.763500e-04, 0.03468598, 0.0000000000, 1.925167e-05, 0.11431520, 8.735213e-04, 0.11962802, 0.21444851, 0.493687059, 8.735213e-04, 2.023140e-08, 0.08570012, 0.0059452421, 0.000000e+00, 0.07884399, 1.960264e-02, 0.51417681, 0.15431120, 0.154311203, 6.986716e-03, 2.081749e-08, 0.01650597, 0.1299343209, 4.171117e-03, 0.00000000 ), nrow = 10, byrow = TRUE) # taken from smallvis expected_sigmas <- c( 0.3252233, 0.2679755, 0.1817380, 0.1751287, 0.3280264, 0.4861266, 0.2463306, 0.2422687, 0.3463065, 0.2411619 ) iris10nn10d <- as.vector(t(iris10_nn10$dist)) resp <- calc_row_probabilities_parallel(iris10nn10d, n_vertices = nrow(iris10_nn10$dist), perplexity = 4, n_threads = 0, ret_sigma = TRUE ) res <- resp$matrix res <- nng_to_sparse(iris10_nn10$idx, as.vector(t(res)), self_nbr = TRUE, max_nbr_id = nrow(iris10_nn10$idx) ) expect_equal(as.matrix(res), P_row, tol = 1e-5, check.attributes = FALSE) expect_equal(resp$sigma, expected_sigmas, tol = 1e-5) res <- calc_row_probabilities_parallel(iris10nn10d, n_vertices = nrow(iris10_nn10$dist), perplexity = 4, n_threads = 1 )$matrix res <- nng_to_sparse(iris10_nn10$idx, as.vector(t(res)), self_nbr = TRUE, max_nbr_id = nrow(iris10_nn10$idx) ) expect_equal(as.matrix(res), P_row, tol = 1e-5, check.attributes = FALSE) iris_dup <- duplicated(x2m(iris)) uiris <- iris[!iris_dup, ] # LargeVis-style iris normalization normiris <- scale(x2m(uiris), center = TRUE, scale = FALSE) normiris <- normiris / max(abs(normiris)) # niris10_nn149 <- dist_nn(stats::dist(normiris), k = 149) # expect_equal(1 / res$sigma ^ 2, Prow_niris_p150_k50_betas, tol = 1e-5) # Taken from the LargeVis C++ implementation Prow_iris_p150_k50_rowSums <- c( 1.064902, 1.01981, 1.022902, 1.00269, 1.058712, 0.959587, 1.020604, 1.072308, 0.918501, 1.035426, 1.010711, 1.055485, 1.00664, 0.874596, 0.840662, 0.782034, 0.960034, 1.065464, 0.91116, 
1.029154, 1.016113, 1.041956, 0.94594, 1.038197, 1.010267, 1.021842, 1.064241, 1.060124, 1.058187, 1.030837, 1.03162, 1.022887, 0.938471, 0.876479, 1.042369, 1.031878, 0.992018, 1.047312, 0.93035, 1.069906, 1.057901, 0.766783, 0.954321, 1.030951, 0.977892, 1.013204, 1.025217, 1.012253, 1.028178, 1.065557, 0.84282, 1.103245, 0.981218, 0.927567, 1.190567, 1.135817, 1.11851, 0.745579, 1.074413, 0.892972, 0.717667, 1.111475, 0.84308, 1.285817, 0.884184, 0.949937, 1.078251, 1.009963, 0.930185, 0.96844, 1.107436, 1.031162, 1.161345, 1.165258, 1.076716, 1.029794, 1.008188, 1.173165, 1.26895, 0.857554, 0.919873, 0.887521, 1.015009, 1.200669, 0.958936, 0.979149, 1.070616, 0.946976, 1.019518, 0.987791, 1.007625, 1.257148, 1.038777, 0.754076, 1.078727, 1.049221, 1.098134, 1.148382, 0.74274, 1.084166, 0.819342, 1.072723, 0.958094, 1.101275, 1.036348, 0.744063, 0.689913, 0.811003, 0.896867, 0.781658, 1.162044, 1.199109, 1.110908, 0.923403, 0.831603, 1.055974, 1.167772, 0.662894, 0.653821, 0.899437, 1.003726, 0.952751, 0.705241, 1.281862, 1.039594, 0.882058, 1.306628, 1.290206, 1.09357, 0.864005, 0.824926, 0.663579, 1.063836, 1.242015, 0.843297, 0.769286, 0.907494, 1.14366, 1.252945, 1.073047, 1.022525, 0.951965, 0.977462, 0.941184, 1.050544, 1.128182, 1.230836, 0.925821, 1.158545 ) Prow_niris_p150_k50_betas <- c( 5.885742, 5.736816, 5.197266, 5.471191, 5.71875, 5.699707, 5.451172, 6.242188, 4.727051, 5.95459, 5.53418, 6.278809, 5.412598, 3.991699, 4.12207, 4.150879, 4.842285, 6.005859, 5.578369, 5.635254, 6.838379, 5.978516, 4.450684, 7.612305, 7.321289, 6.594238, 6.863281, 6.144043, 6.030762, 6.04248, 6.217285, 6.470703, 4.745117, 4.282471, 6.104004, 5.433594, 5.414551, 5.560547, 4.568359, 6.307129, 5.703125, 4.453369, 4.664551, 7.005859, 6.844238, 5.671875, 5.775879, 5.260254, 5.626465, 5.994629, 13.903809, 19.361816, 17.199951, 10.595215, 21.495117, 19.960938, 22.048828, 6.962402, 17.828125, 9.40918, 6.223511, 18.996094, 11.31665, 26.000977, 9.661377, 15.227539, 18.856934, 12.867676, 20.546875, 9.992432, 22.505859, 16.000977, 23.277344, 24.697266, 18.488281, 16.779785, 17.730957, 23.207031, 25.374023, 8.538818, 8.724121, 8.112671, 12.005859, 23.245605, 16.113281, 19.423828, 17.885254, 19.319336, 13.402344, 11.15625, 15.037109, 24.227539, 12.86377, 6.838623, 14.530762, 14.649414, 15.500977, 20.317383, 7.972168, 14.302246, 8.755371, 19.519531, 9.589355, 18.270508, 11.995361, 4.407959, 11.406738, 6.748779, 14.204102, 6.195068, 22.625977, 21.842773, 14.548828, 17.580078, 15.902832, 15.451172, 19.606445, 3.676147, 3.610718, 18.935059, 10.456055, 18.219238, 4.096863, 26.099609, 12.159668, 8.970703, 27.116211, 26.008301, 15.51416, 11.36377, 7.287109, 4.036987, 14.57373, 25.604492, 17.268066, 5.501221, 11.135986, 19.822266, 25.111816, 14.611328, 11.272705, 15.021484, 9.565918, 9.592529, 15.895508, 22.691895, 22.258789, 13.867188, 22.583008 ) res <- perplexity_similarities( perplexity = 50, n_threads = 0, verbose = FALSE, ret_sigma = TRUE, nn = find_nn(normiris, k = 149, method = "fnn", metric = "euclidean", n_threads = 0, verbose = FALSE ) ) expect_equal(Matrix::rowSums(res$matrix), Prow_iris_p150_k50_rowSums, tol = 1e-6) expect_equal(1 / res$sigma^2, Prow_niris_p150_k50_betas, tol = 1e-6) res <- perplexity_similarities( perplexity = 50, n_threads = 1, verbose = FALSE, nn = find_nn(normiris, k = 149, method = "fnn", metric = "euclidean", n_threads = 1, verbose = FALSE ) )$matrix expect_equal(Matrix::rowSums(res), Prow_iris_p150_k50_rowSums, tol = 1e-6) 
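# The betas checked above are Gaussian precisions, beta = 1 / sigma^2, in
# p(j|i) proportional to exp(-beta * d_ij^2), with beta calibrated per row
# so that the row perplexity 2^H(P_i) hits the target. A standalone check of
# that definition (d2 here is a made-up vector of squared distances):
d2 <- c(1, 2, 3)
beta <- 0.5
p <- exp(-beta * d2)
p <- p / sum(p)
perp <- 2^(-sum(p * log2(p)))
stopifnot(perp > 1, perp <= length(p))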
uwot/tests/testthat/test_output.R0000644000176200001440000010707614733074465017024 0ustar liggesuserslibrary(uwot) library(RSpectra) context("API output") set.seed(1337) # No way to compare with the Python implementation due to differences in # random number implementations as well as floating point comparison # and various architecture differences. So we'll just check that the output # is ok res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "normlaplacian", verbose = FALSE, n_threads = 0 ) expect_ok_matrix(res) # Results are repeatable with n_threads = 0 (or 1) and same seed set.seed(1337) res2 <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "normlaplacian", verbose = FALSE, n_threads = 0 ) expect_equal(res2, res) # Distance matrix input res <- umap(stats::dist(iris10), n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "laplacian", verbose = FALSE, n_threads = 0 ) expect_ok_matrix(res) # t-UMAP and cosine metric res <- tumap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, metric = "cosine", init = "spectral", verbose = FALSE, n_threads = 0 ) expect_ok_matrix(res) # UMAP and cosine metric n_threads = 1 issue #5 res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, metric = "cosine", init = "spectral", verbose = FALSE, n_threads = 1 ) expect_ok_matrix(res) # metric = Manhattan res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, metric = "manhattan", init = "rand", verbose = FALSE, n_threads = 0 ) expect_ok_matrix(res) res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, metric = "manhattan", init = "spca", verbose = FALSE, n_threads = 1 ) expect_ok_matrix(res) # init with matrix iris10_pca <- prcomp(iris10, retx = TRUE, center = TRUE, scale. = FALSE )$x[, 1:2] res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = iris10_pca, verbose = FALSE, n_threads = 0 ) expect_ok_matrix(res) # Ensure that internal C++ code doesn't modify user-supplied initialization expect_equal(iris10_pca, prcomp(iris10, retx = TRUE, center = TRUE, scale. 
= FALSE )$x[, 1:2]) # return nn # reset seed here so we can compare output with next test result set.seed(1337) res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "spca", verbose = FALSE, n_threads = 0, ret_nn = TRUE ) expect_is(res, "list") expect_ok_matrix(res$embedding) expect_is(res$nn, "list") expect_is(res$nn$euclidean, "list") expect_ok_matrix(res$nn$euclidean$idx, nc = 4) expect_ok_matrix(res$nn$euclidean$dist, nc = 4) # Use pre-calculated nn: should be the same as previous result set.seed(1337) res_nn <- umap(iris10, nn_method = res$nn[[1]], n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "spca", verbose = FALSE, n_threads = 0 ) expect_ok_matrix(res_nn) expect_equal(res_nn, res$embedding) # X = NULL is ok if passing nn data and rand init set.seed(1337) res_nnxn <- umap( X = NULL, nn_method = nn, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 0 ) # Passing nn list directly is also ok set.seed(1337) res_nnl <- umap(iris10, nn_method = res$nn, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 0, ret_nn = TRUE ) expect_ok_matrix(res_nnl$embedding) expect_equal(res_nnl$nn[[1]], res$nn[[1]]) expect_equal(names(res_nnl$nn), "precomputed") expect_equal(res_nnxn, res_nnl$embedding) # Passing nn list directly and return a model set.seed(1337) res_nnl <- umap(iris10, nn_method = res$nn, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 0, ret_nn = TRUE, ret_model = TRUE ) expect_ok_matrix(res_nnl$embedding) expect_equal(res_nnl$nn[[1]], res$nn[[1]]) expect_equal(names(res_nnl$nn), "precomputed") expect_equal(res_nnxn, res_nnl$embedding) expect_equal(res_nnl$num_precomputed_nns, 1) # Passing nn list directly and return a model and set X to NULL set.seed(1337) res_nnl <- umap( X = NULL, nn_method = res$nn, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 0, ret_nn = TRUE, ret_model = TRUE ) expect_ok_matrix(res_nnl$embedding) expect_equal(res_nnl$nn[[1]], res$nn[[1]]) expect_equal(names(res_nnl$nn), "precomputed") expect_equal(res_nnxn, res_nnl$embedding) expect_equal(res_nnl$num_precomputed_nns, 1) # Use multiple nn data res_nn2 <- umap(iris10, nn_method = list(nn, nn), n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "spca", verbose = FALSE, n_threads = 0, ret_nn = TRUE ) expect_ok_matrix(res_nn2$embedding) expect_equal(names(res_nn2$nn), c("precomputed", "precomputed")) # lvish and force use of annoy res <- lvish(iris10, perplexity = 4, n_epochs = 2, learning_rate = 0.5, nn_method = "annoy", init = "lvrand", verbose = FALSE, n_threads = 1, ret_extra = c("sigma") ) expect_ok_matrix(res$embedding) expect_equal(res$sigma, sqrt(c(0.3039, 0.2063, 0.09489, 0.08811, 0.3091, 0.6789, 0.1743, 0.1686, 0.3445, 0.1671)), tol = 1e-4) # lvish with knn res <- lvish(iris10, kernel = "knn", perplexity = 4, n_epochs = 2, learning_rate = 0.5, init = "lvrand", verbose = FALSE, n_threads = 1 ) expect_ok_matrix(res) # return a model res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 1, ret_model = TRUE ) expect_is(res, "list") expect_ok_matrix(res$embedding) # #95: export min_dist and spread in returned model expect_equal(res$min_dist, 0.001) expect_equal(res$spread, 1) resab <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, a = 1, b = 0.9, init = "rand", 
verbose = FALSE, n_threads = 1, ret_model = TRUE ) expect_equal(resab$a, 1) expect_equal(resab$b, 0.9) # min_dist and spread in returned model are NULL if a and b are set expect_null(resab$min_dist) expect_null(resab$spread) res_test <- umap_transform(iris10, res, n_threads = 1, verbose = FALSE) expect_ok_matrix(res_test) # test we can use 0 epochs res_test0 <- umap_transform(iris10, res, n_epochs = 0, n_threads = 1, verbose = FALSE) expect_ok_matrix(res_test) expect_equal(dim(res_test0), c(10, 2)) # return nn and a model set.seed(42) res <- tumap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "rand", verbose = FALSE, n_threads = 1, ret_model = TRUE, ret_nn = TRUE ) expect_is(res, "list") expect_ok_matrix(res$embedding) expect_is(res$nn, "list") expect_is_nn(res$nn[[1]], k = 4) expect_equal(names(res$nn), "euclidean") res_test <- umap_transform(iris10, res, n_threads = 0, verbose = FALSE) expect_ok_matrix(res_test) # test sparse nn matrix exactly the same as knn graph with explicit 0s for # self neighbors sparse_nbr_matrix0 <- Matrix::sparseMatrix( i = c( 0, 4, 7, 9, 1, 2, 3, 9, 1, 2, 3, 6, 2, 3, 8, 9, 0, 4, 6, 7, 0, 4, 5, 7, 2, 3, 6, 7, 0, 4, 7, 9, 1, 2, 3, 8, 1, 2, 3, 9 ), p = c( 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40 ), x = c( 0, 0.141421228647, 0.173204988241, 0.469041585922, 0, 0.300000220537, 0.331662625074, 0.173205047846, 0.300000220537, 0, 0.244949027896, 0.264575153589, 0.244949027896, 0, 0.2999997437, 0.316227942705, 0.141421228647, 0, 0.458257555962, 0.223606646061, 0.616441547871, 0.616441547871, 0, 0.700000047684, 0.264575153589, 0.331662654877, 0, 0.424264162779, 0.173204988241, 0.223606646061, 0, 0.331662625074, 0.509901940823, 0.435889661312, 0.2999997437, 0, 0.173205047846, 0.31622800231, 0.316227942705, 0 ), index1 = FALSE ) set.seed(42) res_spnn0 <- tumap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "rand", verbose = FALSE, n_threads = 1, nn_method = sparse_nbr_matrix0, ret_nn = TRUE ) expect_is(res, "list") expect_ok_matrix(res_spnn0$embedding) # should get same results as with internal nn calculation expect_equal(res_spnn0$embedding, res$embedding) sparse_nbr_matrix0_with_names <- sparse_nbr_matrix0 row.names(sparse_nbr_matrix0_with_names) <- row.names(iris10) colnames(sparse_nbr_matrix0_with_names) <- row.names(iris10) expect_equal(res_spnn0$nn$euclidean, sparse_nbr_matrix0_with_names) # sparse neighbor matrix without explicit zeros sparse_nbr_matrix <- Matrix::sparseMatrix( i = c( 4, 7, 9, 2, 3, 9, 1, 3, 6, 2, 8, 9, 0, 6, 7, 0, 4, 7, 2, 3, 7, 0, 4, 9, 1, 2, 3, 1, 2, 3 ), p = c( 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30 ), x = c( 0.141421228647, 0.173204988241, 0.469041585922, 0.300000220537, 0.331662625074, 0.173205047846, 0.300000220537, 0.244949027896, 0.264575153589, 0.244949027896, 0.2999997437, 0.316227942705, 0.141421228647, 0.458257555962, 0.223606646061, 0.616441547871, 0.616441547871, 0.700000047684, 0.264575153589, 0.331662654877, 0.424264162779, 0.173204988241, 0.223606646061, 0.331662625074, 0.509901940823, 0.435889661312, 0.2999997437, 0.173205047846, 0.31622800231, 0.316227942705 ), index1 = FALSE ) set.seed(42) res_spnn <- tumap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "rand", verbose = FALSE, n_threads = 1, nn_method = sparse_nbr_matrix ) expect_ok_matrix(res_spnn) # should get same results as with internal nn calculation expect_equal(res_spnn, res$embedding) # null X is ok with sparse nearest neighbors set.seed(42) res_spnn_nullX <- tumap( X = NULL, n_neighbors = 4, n_epochs = 2, 
learning_rate = 0.5, init = "rand", verbose = FALSE, n_threads = 1, nn_method = sparse_nbr_matrix0_with_names ) expect_ok_matrix(res_spnn_nullX) # output picks up row names from input distance matrix expect_equal(res_spnn_nullX, res$embedding) # https://github.com/jlmelville/uwot/issues/6 res <- umap(iris10, n_components = 1, n_neighbors = 4, n_epochs = 2, n_threads = 1, verbose = FALSE ) expect_ok_matrix(res, nc = 1) # enforce irlba for spectral initialization even if RSpectra is present res <- umap(iris10, n_components = 1, n_neighbors = 4, n_epochs = 2, n_threads = 1, verbose = FALSE, init = "irlba_spectral" ) expect_ok_matrix(res, nc = 1) # Supervised set.seed(1337) res_y <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "spca", verbose = FALSE, n_threads = 0, y = 1 / (1:10)^2, target_n_neighbors = 2 ) expect_ok_matrix(res_y) # Repeat using equivalent NN info for y y_nn <- list( idx = matrix(c( 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 9 ), ncol = 2, byrow = TRUE), dist = matrix(c( 0, 0.750000000, 0, 0.138888896, 0, 0.048611112, 0, 0.022500001, 0, 0.012222221, 0, 0.007369615, 0, 0.004783163, 0, 0.003279321, 0, 0.002345679, 0, 0.002345679 ), ncol = 2, byrow = TRUE) ) set.seed(1337) res_ynn <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "spca", verbose = FALSE, n_threads = 0, y = y_nn ) expect_ok_matrix(res_ynn) # Should be the same result expect_equal(res_ynn, res_y) bin10 <- structure(c( 0L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L ), .Dim = c(10L, 4L)) res <- umap(bin10, n_neighbors = 4, metric = "hamming", verbose = FALSE, n_threads = 1 ) expect_ok_matrix(res) # Multiple metrics set.seed(1337) res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spca", verbose = FALSE, n_threads = 0, metric = list(euclidean = c(1, 2), euclidean = c(3, 4)), ret_model = TRUE ) res_trans <- umap_transform(iris10, model = res, verbose = FALSE, n_threads = 0, n_epochs = 2 ) expect_ok_matrix(res_trans) # PCA dimensionality reduction res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spca", verbose = FALSE, n_threads = 0, pca = 2, ret_model = TRUE ) expect_ok_matrix(res$embedding) expect_is(res$pca_models, "list") expect_equal(length(res$pca_models), 1) expect_ok_matrix(res$pca_models[["1"]]$rotation, nr = 4, nc = 2) expect_equal(res$pca_models[["1"]]$center, c(4.86, 3.31, 1.45, 0.22), check.attributes = FALSE ) # no centering res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spca", verbose = FALSE, n_threads = 0, pca = 2, pca_center = FALSE, ret_model = TRUE ) expect_ok_matrix(res$embedding) expect_is(res$pca_models, "list") expect_equal(length(res$pca_models), 1) expect_ok_matrix(res$pca_models[["1"]]$rotation, nr = 4, nc = 2) expect_null(res$pca_models[["1"]]$center) res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, metric = list("euclidean" = 1:2, "euclidean" = 3:4), init = "spca", verbose = FALSE, n_threads = 0, pca = 2 ) expect_ok_matrix(res) # Mixed metrics, PCA and transform set.seed(1337) ib10 <- cbind(iris10, bin10, bin10) res <- umap(ib10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spca", verbose = FALSE, n_threads = 0, metric = list( euclidean = c(1, 2), hamming = 5:12, euclidean = c(3, 4) ), pca = 2, ret_model = TRUE ) expect_ok_matrix(res$embedding) 
expect_is(res$pca_models, "list") expect_equal(length(res$pca_models), 2) expect_equal(names(res$pca_models), c("1", "3")) expect_ok_matrix(res$pca_models[["1"]]$rotation, nr = 2, nc = 2) expect_equal(res$pca_models[["1"]]$center, c(4.86, 3.31), check.attributes = FALSE ) expect_ok_matrix(res$pca_models[["3"]]$rotation, nr = 2, nc = 2) expect_equal(res$pca_models[["3"]]$center, c(1.45, 0.22), check.attributes = FALSE ) res_trans <- umap_transform(ib10, model = res, verbose = FALSE, n_threads = 0, n_epochs = 2 ) expect_ok_matrix(res_trans) # Override pca command in third block set.seed(1337) res <- umap(ib10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spca", verbose = FALSE, n_threads = 0, metric = list( euclidean = c(1, 2), hamming = 5:8, euclidean = list(c(3, 4), pca = NULL) ), pca = 2, ret_model = TRUE ) expect_ok_matrix(res$embedding) expect_is(res$pca_models, "list") expect_equal(length(res$pca_models), 1) expect_equal(names(res$pca_models), "1") expect_ok_matrix(res$pca_models[["1"]]$rotation, nr = 2, nc = 2) expect_equal(res$pca_models[["1"]]$center, c(4.86, 3.31), check.attributes = FALSE ) res_trans <- umap_transform(ib10, model = res, verbose = FALSE, n_threads = 0, n_epochs = 2 ) expect_ok_matrix(res_trans) # Turn off PCA centering for binary data set.seed(1337) res <- umap(bin10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spca", verbose = FALSE, n_threads = 0, metric = "manhattan", pca = 2, pca_center = FALSE, ret_model = TRUE ) expect_ok_matrix(res$embedding) expect_is(res$pca_models, "list") expect_equal(length(res$pca_models), 1) expect_equal(names(res$pca_models), "1") expect_ok_matrix(res$pca_models[["1"]]$rotation, nr = 4, nc = 2) expect_null(res$pca_models[["1"]]$center) res_trans <- umap_transform(bin10, model = res, verbose = FALSE, n_threads = 0, n_epochs = 2 ) expect_ok_matrix(res_trans) # shrunk spectral initialization res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "pca", verbose = FALSE, n_threads = 0, init_sdev = 2 ) expect_ok_matrix(res) res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "laplacian", verbose = FALSE, n_threads = 0, init_sdev = 0.1 ) expect_ok_matrix(res) res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spectral", verbose = FALSE, n_threads = 0, init_sdev = 5 ) expect_ok_matrix(res) # test that init_sdev actually applies to the input matrix # store old sd res_sd <- apply(res, 2, sd) res2 <- umap(iris10, n_neighbors = 4, n_epochs = 0, learning_rate = 0.5, init = res, verbose = FALSE, n_threads = 0, init_sdev = 5 ) expect_ok_matrix(res2) expect_equal(apply(res2, 2, sd), rep(5, ncol(res2))) # make sure input is unchanged expect_equal(apply(res, 2, sd), res_sd) # umap transform when test datset size > train dataset size set.seed(1337) res <- umap(iris10[1:4, ], n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, ret_model = TRUE ) expect_is(res, "list") expect_ok_matrix(res$embedding, nr = 4) res_test <- umap_transform(iris10[5:10, ], res, verbose = FALSE, n_epochs = 10) expect_ok_matrix(res_test, nr = 6) # 31: ensure we store the ndim for single-metric models expect_equal(res$metric$euclidean$ndim, 4) # taus88 prng res <- umap(iris10, pcg_rand = FALSE, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spectral", verbose = FALSE, n_threads = 0, init_sdev = 5 ) expect_ok_matrix(res) # https://github.com/jlmelville/uwot/issues/39 res <- umap(iris10, n_neighbors = 4, 
n_threads = 0.5) expect_ok_matrix(res) res <- umap(iris10, n_neighbors = 4, n_threads = 1.5) expect_ok_matrix(res) res <- umap(iris10, n_neighbors = 4, n_sgd_threads = 0.5) expect_ok_matrix(res) res <- umap(iris10, n_neighbors = 4, n_sgd_threads = 1.5) expect_ok_matrix(res) # https://github.com/jlmelville/uwot/issues/47 # return fuzzy graph res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "spca", verbose = FALSE, n_threads = 0, ret_extra = c("fgraph") ) expect_is(res, "list") expect_ok_matrix(res$embedding) expect_is(res$fgraph, "Matrix") res <- tumap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spca", verbose = FALSE, n_threads = 0, ret_extra = c("fgraph") ) expect_is(res, "list") expect_ok_matrix(res$embedding) expect_is(res$fgraph, "Matrix") # param is ret_P and returned value is P in lvish res <- lvish(iris10, perplexity = 4, n_epochs = 2, learning_rate = 0.5, init = "spca", verbose = FALSE, n_threads = 0, ret_extra = c("P") ) expect_is(res, "list") expect_ok_matrix(res$embedding) expect_is(res$P, "Matrix") # 22 Pearson correlation set.seed(42) res_cor <- tumap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, metric = "correlation", init = "spectral", verbose = FALSE, n_threads = 0, ret_model = TRUE ) expect_ok_matrix(res_cor$embedding) # Ensure cosine results are different from correlation set.seed(42) res_cos <- tumap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, metric = "cosine", init = "spectral", verbose = FALSE, n_threads = 0, ret_model = TRUE ) expect_gt(sum((res_cor$embedding - res_cos$embedding)^2), 1e-3) # Ensure correlation transform results differ from cosine set.seed(42) res_trans_cor <- umap_transform(x2m(iris[11:20, ]), res_cor, n_threads = 0, verbose = FALSE) expect_ok_matrix(res_trans_cor) # Switch metric and results should differ res_cor$nn_index$metric <- "cosine" set.seed(42) res_trans_cor2 <- umap_transform(x2m(iris[11:20, ]), res_cor, n_threads = 0, verbose = FALSE) expect_ok_matrix(res_trans_cor2) expect_gt(sum((res_trans_cor - res_trans_cor2)^2), 1e-3) # 81: Preserve row names set.seed(42) xnames <- data.frame(matrix(rnorm(10 * 4), nrow = 10), row.names = letters[1:10]) xumap <- umap( xnames, n_neighbors = 4, verbose = FALSE, n_threads = 0, ret_model = TRUE, ret_nn = TRUE ) expect_equal(row.names(xumap$embedding), row.names(xnames)) expect_equal(row.names(xumap$nn$euclidean$idx), row.names(xnames)) expect_equal(row.names(xumap$nn$euclidean$dist), row.names(xnames)) first_coords <- c() test_callback <- function(epochs, n_epochs, coords) { first_coords <<- c(first_coords, coords[1, 1]) } set.seed(42) ibatch <- tumap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spca", verbose = FALSE, batch = TRUE, n_threads = 0, n_sgd_threads = 0, ret_model = TRUE, epoch_callback = test_callback ) expect_equal(length(first_coords), 2) set.seed(42) ibatch2 <- tumap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spca", verbose = FALSE, batch = TRUE, n_threads = 0, n_sgd_threads = 2, ret_model = TRUE ) expect_equal(ibatch$embedding, ibatch2$embedding) itest <- x2m(iris[11:20, ]) first_coords <- c() fixed_first_coords <- c() test_transform_callback <- function(epochs, n_epochs, coords, fixed_coords) { first_coords <<- c(first_coords, coords[1, 1]) fixed_first_coords <<- c(fixed_first_coords, fixed_coords[1, 1]) } set.seed(42) ibatchtest <- umap_transform(itest, ibatch, epoch_callback = test_transform_callback, n_epochs = 5) 
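# The callback contract exercised above: the callback is invoked once per
# epoch with (epoch, n_epochs, coords), and for umap_transform additionally
# with the fixed training coordinates. A sketch of a callback that logs the
# mean per-epoch movement of the embedding (illustrative only; assumes coords
# is the full coordinate matrix at the end of each epoch):
make_movement_logger <- function() {
  prev <- NULL
  function(epoch, n_epochs, coords, ...) {
    if (!is.null(prev)) {
      message(
        "epoch ", epoch, "/", n_epochs, " mean shift: ",
        signif(mean(abs(coords - prev)), 3)
      )
    }
    prev <<- coords
  }
}
# e.g. tumap(iris10, n_epochs = 10, epoch_callback = make_movement_logger())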
expect_equal(length(first_coords), 5)
expect_equal(length(fixed_first_coords), 5)
# coords don't actually change on the first epoch
expect_equal(length(unique(first_coords)), 4)
# if coords are fixed they should be the same at each epoch
expect_equal(length(unique(fixed_first_coords)), 1)

set.seed(42)
ibatchtest2 <- umap_transform(itest, ibatch, n_sgd_threads = 2, n_epochs = 5)
expect_equal(ibatchtest, ibatchtest2)

oargs_umap <- tumap(iris10,
  n_neighbors = 4, n_epochs = 0, learning_rate = 0.5, init = "spca",
  verbose = FALSE, batch = TRUE, n_threads = 0, n_sgd_threads = 0,
  ret_model = TRUE,
  opt_args = list(alpha = 0.4, beta1 = 0.1, beta2 = 0.2, eps = 1e-3)
)
expect_equal(length(oargs_umap$opt_args), 5)
expect_equal(oargs_umap$opt_args$method, "adam")
expect_equal(oargs_umap$opt_args$alpha, 0.4)
expect_equal(oargs_umap$opt_args$beta1, 0.1)
expect_equal(oargs_umap$opt_args$beta2, 0.2)
expect_equal(oargs_umap$opt_args$eps, 1e-3)

oargs_umap <- tumap(iris10,
  n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spca",
  verbose = FALSE, batch = TRUE, n_threads = 0, n_sgd_threads = 0,
  ret_model = TRUE, opt_args = list(method = "sgd", alpha = 0.4)
)
expect_equal(length(oargs_umap$opt_args), 2)
expect_equal(oargs_umap$opt_args$method, "sgd")
expect_equal(oargs_umap$opt_args$alpha, 0.4)

# Return sigma and rho
res <- umap(iris10,
  n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001,
  init = "spca", verbose = FALSE, n_threads = 0, ret_extra = c("sigma")
)
expect_is(res, "list")
expect_ok_matrix(res$embedding)
expected_sigma <- c(
  0.1799, 0.2049, 0.04938, 0.0906, 0.2494, 0.003906, 0.1537, 0.1355,
  0.2454, 0.2063
)
sigma <- res$sigma
expect_equal(sigma, expected_sigma, tolerance = 1e-4)
expected_rho <- c(
  0.1414, 0.1732, 0.2449, 0.2449, 0.1414, 0.6164, 0.2646, 0.1732, 0.3,
  0.1732
)
rho <- res$rho
expect_equal(rho, expected_rho, tolerance = 1e-4)

res <- umap(iris10,
  n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001,
  init = "normlaplacian", verbose = FALSE, n_threads = 0, dens_scale = 1
)
expect_ok_matrix(res)

res <- umap(iris10,
  n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001,
  init = "normlaplacian", verbose = FALSE, n_threads = 0, dens_scale = 1,
  ret_extra = c("sigma", "localr")
)
expect_is(res, "list")
expect_ok_matrix(res$embedding)
sigma <- res$sigma
expect_equal(sigma, expected_sigma, tolerance = 1e-4)
rho <- res$rho
expect_equal(rho, expected_rho, tolerance = 1e-4)
expected_localr <- c(
  0.3214, 0.3781, 0.2943, 0.3356, 0.3908, 0.6203, 0.4182, 0.3087, 0.5454,
  0.3795
)
localr <- res$localr
expect_equal(localr, expected_localr, tolerance = 1e-4)

res <- umap(iris10,
  n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001,
  init = "normlaplacian", verbose = FALSE, n_threads = 0, dens_scale = 1,
  ret_model = TRUE
)
expect_is(res, "list")
expect_ok_matrix(res$embedding)
expected_ai <- c(
  8.072, 2.957, 13.89, 6.181, 2.41, 0.1389, 1.585, 10.34, 0.3076, 2.888
)
ai <- res$ai
expect_equal(ai, expected_ai, tolerance = 1e-4)
expect_equal(res$dens_scale, 1.0)
expect_equal(res$method, "leopold")

res <- umap(iris10,
  n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001,
  init = "normlaplacian", verbose = FALSE, n_threads = 0, dens_scale = 0.5,
  ret_model = TRUE
)
expected_ai05 <- c(
  3.348, 2.027, 4.392, 2.93, 1.83, 0.4392, 1.484, 3.79, 0.6536, 2.003
)
expect_equal(res$ai, expected_ai05, tolerance = 1e-3)
expect_equal(res$dens_scale, 0.5)
ret_trans <- umap_transform(iris10, res)
expect_ok_matrix(ret_trans)

# 97: should be able to create a model
without pre-computed nns and allow # umap_transform to work with pre-computed nns train_nn <- annoy_nn( X = iris10, k = 4, metric = "euclidean", n_threads = 0, ret_index = TRUE ) set.seed(42) umap_train_x_null <- umap( X = NULL, nn_method = train_nn, ret_model = TRUE, n_neighbors = 4 ) set.seed(42) umap_train_x <- umap(X = iris10, ret_model = TRUE, n_neighbors = 4) # make the test set a different size to the training set iris9test <- x2m(iris[11:19, ]) query_ref_nn <- annoy_search( X = iris9test, k = 4, ann = train_nn$index, n_threads = 0 ) row.names(query_ref_nn$dist) <- row.names(iris9test) # Success set.seed(42) umap_test_1 <- umap_transform( X = NULL, model = umap_train_x_null, nn_method = query_ref_nn ) # This was throwing an error because umap_train_x doesn't have pre-computed # neighbors (and there is no reason to insist that it should have just because # the test data uses them) set.seed(42) umap_test_2 <- umap_transform( X = NULL, model = umap_train_x, nn_method = query_ref_nn ) expect_equal(umap_test_1, umap_test_2) res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "laplacian", verbose = FALSE, n_threads = 0, init_sdev = 0.1 ) expect_ok_matrix(res) # 99 init_sdev = "range" range scales input data columns 0-10 res <- umap(iris10, n_neighbors = 4, init_sdev = "range", n_epochs = 0) expect_equal(apply(res, 2, range), matrix(c(0, 10, 0, 10), ncol = 2)) # init_sdev = "range" should rescale with user-supplied input too res <- umap( iris10, n_neighbors = 4, init = res, init_sdev = "range", n_epochs = 0 ) expect_equal(apply(res, 2, range), matrix(c(0, 10, 0, 10), ncol = 2)) # 101 intersect and union test_that("intersect and union", { # expected values are confirmed via the python implementation iris10_12 <- as(Matrix::drop0(matrix( c( 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 3.1662715e-09, 0.0000000e+00, 0.0000000e+00, 5.2903235e-01, 4.7097012e-01, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 4.1861886e-01, 1.0000000e+00, 0.0000000e+00, 5.2903235e-01, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 5.8137214e-01, 7.9134369e-01, 0.0000000e+00, 4.7097012e-01, 1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 8.1357503e-01, 0.0000000e+00, 1.0000000e+00, 4.1732001e-01, 1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 2.3949760e-01, 9.2372406e-01, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.5851905e-08, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 8.1357503e-01, 2.3949760e-01, 0.0000000e+00, 0.0000000e+00, 3.5861197e-01, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 9.2372406e-01, 1.5851905e-08, 3.5861197e-01, 0.0000000e+00, 0.0000000e+00, 3.1848407e-01, 0.0000000e+00, 4.1861886e-01, 5.8137214e-01, 1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 3.1662715e-09, 1.0000000e+00, 7.9134369e-01, 4.1732001e-01, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 3.1848407e-01, 0.0000000e+00, 0.0000000e+00 ), byrow = TRUE, nrow = 10 )), "generalMatrix") iris10_34 <- as(Matrix::drop0(matrix( c( 0.000000e+00, 1.000000e+00, 1.000000e+00, 9.995531e-01, 1.000000e+00, 0.000000e+00, 1.000000e+00, 9.995531e-01, 1.000000e+00, 6.160182e-10, # 2 1.000000e+00, 
0.000000e+00, 1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, # 3 1.000000e+00, 1.000000e+00, 0.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, # 4 9.995531e-01, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, 1.000000e+00, # 5 1.000000e+00, 1.000000e+00, 1.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, # 6 0.000000e+00, 0.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, 0.000000e+00, 3.771700e-08, 1.000000e+00, 0.000000e+00, 0.000000e+00, # 7 1.000000e+00, 1.000000e+00, 0.000000e+00, 0.000000e+00, 1.000000e+00, 3.771700e-08, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, # 8 9.995531e-01, 0.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 1.000000e+00, # 9 1.000000e+00, 1.000000e+00, 0.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, # 10 6.160182e-10, 0.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, 0.000000e+00 ), byrow = TRUE, nrow = 10 )), "generalMatrix") expected_intersect <- matrix( c( 0.0, 0.55551356, 0.6280843, 0.5780957, 1.0, 0.9556667, 0.65937775, 1.0, 0.8031627, 0.7241592, 0.55551356, 0.0, 1.0, 0.6247057, 0.62943524, 0.0, 0.7119533, 0.0, 1.0, 0.78192693, 0.6280843, 1.0, 0.0, 0.701477, 0.68993694, 0.0, 0.7589823, 0.0, 0.85413647, 0.8139497, 0.5780957, 0.6247057, 0.701477, 0.0, 0.0, 0.96441525, 0.7222059, 0.7034965, 0.84200585, 1.0, 1.0, 0.62943524, 0.68993694, 0.0, 0.0, 0.96303976, 0.99999994, 0.69032043, 0.8358984, 0.0, 0.9556667, 0.0, 0.0, 0.96441525, 0.96303976, 0.0, 0.345057, 1.0, 0.0, 0.0, 0.65937775, 0.7119533, 0.7589823, 0.7222059, 0.99999994, 0.345057, 0.0, 0.7411224, 0.0, 0.0, 1.0, 0.0, 0.0, 0.7034965, 0.69032043, 1.0, 0.7411224, 0.0, 0.0, 0.999708, 0.8031627, 1.0, 0.85413647, 0.84200585, 0.8358984, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7241592, 0.78192693, 0.8139497, 1.0, 0.0, 0.0, 0.0, 0.999708, 0.0, 0.0 ), byrow = TRUE, nrow = 10 ) expect_equal(as.matrix(simplicial_set_intersect(iris10_12, iris10_34)), expected_intersect, tol = 1e-7, check.attributes = FALSE ) expected_union <- matrix( c( 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0002958188, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.90659714, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0002958188, 1.0, 0.90659714, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0 ), byrow = TRUE, nrow = 10 ) expect_equal(as.matrix(simplicial_set_union(iris10_12, iris10_34)), expected_union, tol = 1e-7, check.attributes = FALSE ) expected_intersect_weighted <- matrix( c( 0.0, 0.40025446, 0.39215788, 0.36378226, 1.0, 0.99999994, 0.4569211, 1.0, 0.6892773, 0.51736516, 0.40025446, 0.0, 1.0, 0.8227322, 0.4237253, 0.0, 0.4646226, 0.0, 1.0, 0.91525006, 0.39215788, 1.0, 0.0, 0.8513419, 0.41594562, 0.0, 0.89491767, 0.0, 0.9377875, 0.906834, 0.36378226, 0.8227322, 0.8513419, 0.0, 0.0, 0.7309169, 0.8804487, 0.4460863, 0.94368106, 1.0, 1.0, 0.4237253, 0.41594562, 0.0, 0.0, 1.0, 1.0, 0.88101995, 0.70143735, 0.0, 
0.99999994, 0.0, 0.0, 0.7309169, 1.0, 0.0, 0.6468519, 0.7858214, 0.0, 0.0,
      0.4569211, 0.4646226, 0.89491767, 0.8804487, 1.0, 0.6468519, 0.0, 0.8816428, 0.0, 0.0,
      1.0, 0.0, 0.0, 0.4460863, 0.88101995, 0.7858214, 0.8816428, 0.0, 0.0, 0.99908066,
      0.6892773, 1.0, 0.9377875, 0.94368106, 0.70143735, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.51736516, 0.91525006, 0.906834, 1.0, 0.0, 0.0, 0.0, 0.99908066, 0.0, 0.0
    ),
    byrow = TRUE, nrow = 10
  )
  expect_equal(
    as.matrix(simplicial_set_intersect(iris10_12, iris10_34, weight = 0.25)),
    expected_intersect_weighted,
    tol = 1e-7, check.attributes = FALSE
  )
})

test_that("can set seed internally", {
  set.seed(42)
  res <- umap(
    iris10,
    n_neighbors = 4, n_epochs = 10, learning_rate = 0.5, n_sgd_threads = 1
  )
  expect_ok_matrix(res)

  # default doesn't reset the seed
  res2 <- umap(
    iris10,
    n_neighbors = 4, n_epochs = 10, learning_rate = 0.5, n_sgd_threads = 1
  )
  diff12 <- res - res2
  expect_gt(sqrt(sum(diff12 * diff12) / length(diff12)), 0.01)

  # setting the seed internally is the same as calling set.seed
  res3 <- umap(
    iris10,
    n_neighbors = 4, n_epochs = 10, learning_rate = 0.5, n_sgd_threads = 1,
    seed = 42
  )
  expect_equal(res, res3)

  # creating a model stores the seed but also forces annoy for nearest
  # neighbors, which changes the RNG state more than when FNN can be used
  # internally (115: although this is probably due more to irlba than
  # RSpectra)
  res_model <- umap(
    iris10,
    n_neighbors = 4, n_epochs = 10, learning_rate = 0.5, n_sgd_threads = 1,
    seed = 42, ret_model = TRUE
  )
  expect_equal(res_model$seed, 42)
  diff1m <- res - res_model$embedding
  expect_gt(sqrt(sum(diff1m * diff1m) / length(diff1m)), 1e-6)

  # explicitly set annoy nn and things are reproducible again
  res4 <- umap(
    iris10,
    n_neighbors = 4, n_epochs = 10, learning_rate = 0.5, n_sgd_threads = 1,
    seed = 42, nn_method = "annoy"
  )
  expect_equal(res_model$embedding, res4)
})

test_that("can provide nn_args", {
  res <- umap(
    iris10,
    n_neighbors = 4, n_epochs = 10, learning_rate = 0.5, n_trees = 5,
    nn_args = list(n_trees = 10), ret_model = TRUE
  )
  expect_ok_matrix(res$embedding)
  expect_equal(res$nn_args$n_trees, 10)
})

test_that("deterministic negative sampling is reproducible", {
  res_seed42 <- umap(
    iris10,
    seed = 42, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5,
    init = iris10_pca, verbose = FALSE, n_threads = 0,
    rng_type = "deterministic"
  )
  res_seed1337 <- umap(
    iris10,
    seed = 1337, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5,
    init = iris10_pca, verbose = FALSE, n_threads = 0,
    rng_type = "deterministic"
  )
  expect_ok_matrix(res_seed42)
  expect_equal(res_seed42, res_seed1337)

  res_seed42_t2 <- umap(
    iris10,
    seed = 42, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5,
    init = iris10_pca, verbose = FALSE, n_threads = 2,
    rng_type = "deterministic"
  )
  expect_equal(res_seed42, res_seed42_t2)
})

# uwot/tests/testthat/test_errors.R
library(uwot)
library(RSpectra)

context("Input validation")

expect_error(umap(X = list(X = "bad", Y = "wrong", nn = "what")), "data format")
expect_error(umap(iris10, n_neighbors = 1, n_threads = 0), "n_neighbors")
expect_error(umap(iris10, n_neighbors = 15, n_threads = 0), "n_neighbors")
expect_error(
  umap(iris10, set_op_mix_ratio = 10, n_threads = 0),
  "set_op_mix_ratio"
)
expect_error(
  umap(iris10, set_op_mix_ratio = -10, n_threads = 0),
  "set_op_mix_ratio"
)
expect_error(
  umap(iris10, local_connectivity = 0.5, n_threads = 0),
  "local_connectivity"
)
expect_error(umap(diris10, ret_model = TRUE, n_threads = 0), "models")
expect_error(umap(dmiris10z, ret_model = TRUE, n_threads = 0), "models")
expect_error(umap(dmiris10z[, 1:9], n_threads = 0), "distance")
expect_error(umap(iris[, "Species", drop = FALSE], n_threads = 0), "numeric")
expect_error(umap(iris10, n_threads = 0, nn_method = list()), "precalculated")
expect_error(
  umap(iris10, n_threads = 0, nn_method = list(
    idx = matrix(1:4, nrow = 2),
    dist = matrix(1:4, nrow = 2)
  )),
  "rows"
)
expect_error(
  umap(iris10, n_threads = 0, nn_method = list(idx = matrix(1:40, nrow = 10))),
  "dist"
)
expect_error(
  umap(iris10, n_threads = 0, nn_method = list(
    idx = matrix(1:40, nrow = 10),
    dist = matrix(1:4, nrow = 2)
  )),
  "dimensions"
)
expect_error(
  umap(iris10,
    n_threads = 0, n_neighbors = 4, nn_method = "fnn", metric = "cosine"
  ),
  "FNN"
)
expect_error(
  umap(iris10,
    n_threads = 0, n_neighbors = 4, nn_method = "fnn", ret_model = TRUE
  ),
  "FNN"
)
expect_error(lvish(iris10, n_threads = 0, perplexity = 50), "perplexity")
expect_error(tumap(iris10, n_components = 0), "n_components")
expect_error(umap(iris10, pca = 1), "'pca' must be >=")
expect_error(umap(iris10, pca_method = "bad-pca-package"))
expect_error(
  umap(iris10, n_threads = 0, n_neighbors = 4, y = c(1:9, NA)),
  "numeric y"
)
expect_error(
  umap(
    X = NULL, n_threads = 0, n_neighbors = 4, nn_method = nn, init = "spca"
  ),
  "spca"
)

# add an extra column to nn
nn5 <- nn
nn5$idx <- cbind(nn5$idx, rep(100, nrow(nn5$idx)))
nn5$dist <- cbind(nn5$dist, rep(100.0, nrow(nn5$dist)))
expect_error(
  umap(X = NULL, n_threads = 0, nn_method = list(nn, nn5)),
  "Invalid neighbor"
)
expect_error(umap(iris10, n_threads = 0, pca = 0), "positive integer")
expect_error(umap(iris10, n_threads = -1), "n_threads")
expect_error(umap(iris10, n_sgd_threads = -1), "n_sgd_threads")

model <- umap(iris10, n_neighbors = 2, ret_model = TRUE, n_epochs = 2)
expect_error(umap_transform(iris10[, 1:2], model), "Incorrect dimensions")

# #42: check init is a matrix or a string; complain otherwise
expect_error(
  umap(iris10, n_neighbors = 4, init = as.matrix(iris[, 1:3])),
  "(10, 2)"
)
expect_error(umap(iris10, n_neighbors = 4, init = iris), "matrix or string")

# Don't use data with NA in it
test_that("Detect data with NA in", {
  diris10na <- diris10
  diris10na[1] <- NA
  expect_error(umap(diris10na), "missing", ignore.case = TRUE)

  dmiris10zna <- dmiris10z
  dmiris10zna[2, 1] <- NA
  expect_error(umap(dmiris10zna, n_neighbors = 4), "missing",
    ignore.case = TRUE
  )

  iris10na <- iris10
  iris10na[1, 1] <- NA
  expect_error(umap(iris10na, n_neighbors = 4), "missing", ignore.case = TRUE)
})

set.seed(42)
nnsp10 <- Matrix::drop0(matrix(runif(100), nrow = 10)^2, 0.5)
expect_error(
  umap(iris10, n_neighbors = 4, nn_method = nnsp10[, -10]),
  "same number"
)
expect_error(
  umap(iris10, n_neighbors = 4, nn_method = nnsp10[-10, -10]),
  "unexpected number of rows"
)

# give observation 5 zero neighbors
nnsp10_nbr0 <- nnsp10
nnsp10_nbr0[, 5] <- 0
nnsp10_nbr0 <- Matrix::drop0(nnsp10_nbr0)
expect_error(
  umap(X = NULL, n_neighbors = 4, nn_method = nnsp10_nbr0),
  "at least one neighbor"
)

# 76: umap_transform does not validate input sufficiently
model <- umap(iris[1:10, ], n_neighbors = 4, n_epochs = 0, ret_model = TRUE)
expect_error(trans <- umap_transform(iris[0, ], model = model), "Not enough rows")

# bad min_dist/spread
expect_error(umap(iris, spread = 1, min_dist = 20), "a, b")

# n_components too high
expect_warning(
  umap(
    iris10,
    n_components = 50, ret_model = TRUE, init = "rand", n_neighbors = 4,
    n_epochs = 0
  ),
  "n_components >"
)
suppressWarnings(expect_error(
  umap(iris[1:100, ], n_components = 10),
  "Initial data
contains NA" )) # user-supplied intialization should not contain NA transform_init <- model$embedding[1:5, ] transform_init[1, 1] <- NA expect_error(trans <- umap_transform(iris[51:55, ], model = model, init = transform_init), "contains NA") # model embedding coords should also not contain NA old11 <- model$embedding[1, 1] model$embedding[1, 1] <- NA expect_error(trans <- umap_transform(iris[51:55, ], model = model), "contains NA") model$embedding[1, 1] <- old11 # 110: warn if standard deviation of initial input could create small gradients expect_warning( umap(iris10, init_sdev = 100.0, n_neighbors = 4), "embedding standard deviation" ) uwot/tests/testthat/test_smooth_knn_dists.R0000644000176200001440000002723614730166740021043 0ustar liggesuserslibrary(uwot) context("Smooth kNN distances") flatmat <- function(x, nr) { as.vector(t(matrix(x, nrow = nr))) } ### C++ tests nn_8 <- find_nn(iris10, k = 8) nbrs8 <- ncol(nn_8$dist) target8 <- log2(nbrs8) res <- smooth_knn_distances_parallel( as.vector(t(nn_8$dist)), nn_ptr = nbrs8, skip_first = TRUE, ret_sigma = TRUE, target = target8 ) expect_equal(flatmat(res$matrix, nbrs8), c( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.883551016667945, 0.563402698221087, 0.789087277089996, 0.57607769298836, 0.750303047844192, 1, 0.656969510423194, 0.745156972400171, 0.586089930619138, 0.481555035145846, 0.279104282300107, 0.488196761636568, 0.514561802750023, 0.489639480054329, 0.330384991495572, 0.626571100714632, 0.367875074735739, 0.39660773625396, 0.438113497020007, 0.481555035145844, 0.238036308009313, 0.321078737378799, 0.423034729464725, 0.419490620521321, 0.275816663602388, 0.120285582819758, 0.297337437716562, 0.247710312149843, 0.377578194428495, 0.445038652274379, 0.229198240647999, 0.217928223724008, 0.13265884755527, 0.419490620521318, 0.257869637066222, 0.110625826611838, 0.260166429776407, 0.231017974667955, 0.364373813389398, 0.220580064268054, 0.212929653970733, 0.217928223724007, 0.0998021268957899, 0.0776802085446247, 0.195560609120723, 0.072176661510608, 0.215176296482231, 0.231017974667954, 0.147146427255277, 0.209014049190051, 0.157184945393181, 0.191460580118967, 0.0408496922133704, 0.0176222685661076, 0.190057981521641, 0.0703455098666948, 0.202477876903057, 0.148483915739209, 0.086695317543654, 0.162252224543109 )) expect_equal(res$sigma, c( 0.2567215, 0.22098923, 0.08285332, 0.09981823, 0.28608322, 0.17873764, 0.15968704, 0.17134094, 0.25434113, 0.19572449 )) expected_rho <- c( 0.14142136, 0.17320508, 0.24494897, 0.24494897, 0.14142136, 0.6164414, 0.26457513, 0.17320508, 0.3, 0.17320508 ) expect_equal(res$rho, expected_rho) nn_4 <- find_nn(iris10, k = 4) nn4dist <- as.vector(t(nn_4$dist)) nbrs4 <- ncol(nn_4$dist) target4 <- log2(nbrs4) res <- smooth_knn_distances_parallel( nn4dist, nn_ptr = nbrs4, skip_first = TRUE, ret_sigma = TRUE, target = target4 ) expect_equal(flatmat(res$matrix, nbrs4), c( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.838084924053271, 0.538562488894191, 0.672032147261722, 0.54465912156976, 0.719264468544344, 1, 0.646253111343435, 0.689408428587288, 0.574825084073865, 0.499998079054375, 0.161908061792063, 0.461447148366392, 0.327968484367062, 0.455344908480281, 0.280728069552432, 5.12868558931539e-10, 0.35375192447642, 0.310590623941469, 0.425174782792857, 0.499998079054373 )) # nn4 expected_sigma4 <- c( 0.17993927, 0.20488739, 0.0493803, 0.09060478, 0.24940491, 0.00390625, 0.15367126, 0.13551712, 0.24542618, 0.20633698 ) expect_equal(res$sigma, expected_sigma4) expected_rho4 <- 
c( 0.14142136, 0.17320508, 0.24494897, 0.24494897, 0.14142136, 0.6164414, 0.26457513, 0.17320508, 0.3, 0.17320508 ) expect_equal(res$rho, expected_rho4) # explicitly provide pointers into distances res <- smooth_knn_distances_parallel( nn4dist, nn_ptr = seq(from = 0, to = 40, by = 4), skip_first = TRUE, ret_sigma = TRUE, target = target4 ) expect_equal(flatmat(res$matrix, nbrs4), c( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.838084924053271, 0.538562488894191, 0.672032147261722, 0.54465912156976, 0.719264468544344, 1, 0.646253111343435, 0.689408428587288, 0.574825084073865, 0.499998079054375, 0.161908061792063, 0.461447148366392, 0.327968484367062, 0.455344908480281, 0.280728069552432, 5.12868558931539e-10, 0.35375192447642, 0.310590623941469, 0.425174782792857, 0.499998079054373 )) expect_equal(res$sigma, expected_sigma4) expect_equal(res$rho, expected_rho4) ### Various fuzzy set matrices are defined in helper_fuzzy_sets.R # unsymmetrized fuzzy set res <- nng_to_sparse(t(nn_4$idx), res$matrix, self_nbr = TRUE, by_row = FALSE) expect_equal(res, V_asymm, tol = 1e-4) # Fuzzy Set Union expect_equal(fuzzy_set_union(res), V_union, tol = 1e-4) # mix intersection with union expect_equal(fuzzy_set_union(res, set_op_mix_ratio = 0.5), V_mix, tol = 1e-4 ) # intersection expect_equal(fuzzy_set_union(res, set_op_mix_ratio = 0), V_intersect, tol = 1e-4 ) res_cpp_conn1 <- smooth_knn_distances_parallel( nn4dist, nn_ptr = nbrs4, skip_first = TRUE, target = target4, n_iter = 64, local_connectivity = 1.0, tol = 1e-5, min_k_dist_scale = 1e-3, n_threads = 0 ) expect_equal( nng_to_sparse(nn_4$idx, flatmat(res_cpp_conn1$matrix, nbrs4), self_nbr = TRUE ), V_asymm, tol = 1e-4 ) res_cpp_conn1.5 <- smooth_knn_distances_parallel( nn4dist, nn_ptr = nbrs4, skip_first = TRUE, target = target4, n_iter = 64, local_connectivity = 1.5, tol = 1e-5, min_k_dist_scale = 1e-3, n_threads = 0 ) expect_equal( nng_to_sparse(t(nn_4$idx), res_cpp_conn1.5$matrix, self_nbr = TRUE, by_row = FALSE ), V_asymm_local, tol = 1e-4 ) res_cpp_conn1 <- smooth_knn_distances_parallel( nn4dist, nn_ptr = nbrs4, skip_first = TRUE, target = target4, n_iter = 64, local_connectivity = 1.0, tol = 1e-5, min_k_dist_scale = 1e-3, n_threads = 1, grain_size = 1 ) expect_equal( nng_to_sparse(t(nn_4$idx), res_cpp_conn1$matrix, self_nbr = TRUE, by_row = FALSE ), V_asymm, tol = 1e-4 ) res_cpp_conn1.5 <- smooth_knn_distances_parallel( nn4dist, nn_ptr = nbrs4, skip_first = TRUE, target = target4, n_iter = 64, local_connectivity = 1.5, tol = 1e-5, min_k_dist_scale = 1e-3, n_threads = 1, grain_size = 1 ) expect_equal( nng_to_sparse(t(nn_4$idx), res_cpp_conn1.5$matrix, self_nbr = TRUE, by_row = FALSE ), V_asymm_local, tol = 1e-4 ) # Test cross-distances V_asymm_local_cross <- V_asymm_local diag(V_asymm_local_cross) <- 1 V_asymm_local_cross <- cbind( V_asymm_local_cross, matrix(0, nrow = 10, ncol = 2) ) res_cpp_conn1.5_cross <- smooth_knn_distances_parallel( nn4dist, nn_ptr = nbrs4, skip_first = TRUE, target = target4, n_iter = 64, local_connectivity = 1.5, tol = 1e-5, min_k_dist_scale = 1e-3, n_threads = 0 ) expect_equal( nng_to_sparse( t(nn_4$idx), res_cpp_conn1.5_cross$matrix, by_row = FALSE, self_nbr = FALSE, max_nbr_id = 12 ), V_asymm_local_cross, tol = 1e-4 ) res_cpp_conn1.5_cross <- smooth_knn_distances_parallel( nn4dist, nn_ptr = nbrs4, skip_first = TRUE, target = target4, n_iter = 64, local_connectivity = 1.5, tol = 1e-5, min_k_dist_scale = 1e-3, n_threads = 1 ) expect_equal( nng_to_sparse( t(nn_4$idx), res_cpp_conn1.5_cross$matrix, by_row = 
FALSE, self_nbr = FALSE, max_nbr_id = 12 ), V_asymm_local_cross, tol = 1e-4 ) # smooth_knn_matrix expected_sknn4m <- Matrix::drop0(matrix( c( 0, 0, 0, 0, 1.0000000, 0, 0, 8.380849e-01, 0, 0.1619081, 0, 0, 0.5385625, 0.4614471, 0, 0, 0, 0, 0, 1.0000000, 0, 0.3279685, 0, 1.0000000, 0, 0, 0.6720321, 0, 0, 0, 0, 0, 1.0000000, 0, 0, 0, 0, 0, 0.5446591, 0.4553449, 1, 0, 0, 0, 0, 0, 0.2807281, 7.192645e-01, 0, 0, 1, 0, 0, 0, 1.0000000, 0, 0, 5.128686e-10, 0, 0, 0, 0, 1.0000000, 0.6462531, 0, 0, 0, 3.537519e-01, 0, 0, 1, 0, 0, 0, 0.6894084, 0, 0, 0, 0, 0.3105906, 0, 0.4251748, 0.5748251, 1.0000000, 0, 0, 0, 0, 0, 0, 0, 1.0000000, 0.4999981, 0.4999981, 0, 0, 0, 0, 0, 0 ), nrow = 10, byrow = TRUE )) sknn4m <- smooth_knn_matrix(nn_4)$matrix expect_equal(sknn4m@x, expected_sknn4m@x, tol = 1e-7) expect_equal(sknn4m@i, expected_sknn4m@i) nn4sp <- Matrix::drop0(matrix( c( 0, 0, 0, 0, 0.1414214, 0.6164414, 0, 0.1732051, 0, 0, 0, 0, 0.3000000, 0, 0, 0, 0, 0, 0.5099020, 0.1732051, 0, 0.3000000, 0, 0.2449490, 0, 0, 0.2645751, 0, 0.4358899, 0.3162278, 0, 0.3316625, 0.2449490, 0, 0, 0, 0.3316625, 0, 0.3000000, 0.3162278, 0.1414214, 0, 0, 0, 0, 0.6164414, 0, 0.2236068, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2645751, 0, 0.4582576, 0, 0, 0, 0, 0, 0.1732051, 0, 0, 0, 0.2236068, 0.7000000, 0.4242641, 0, 0, 0, 0, 0, 0, 0.3000000, 0, 0, 0, 0, 0, 0, 0.4690416, 0.1732051, 0, 0.3162278, 0, 0, 0, 0.3316625, 0, 0 ), nrow = 10, byrow = TRUE )) sknn4msp <- smooth_knn_matrix(nn4sp)$matrix expect_equal(sknn4msp@x, expected_sknn4m@x, tol = 1e-6) expect_equal(sknn4msp@i, expected_sknn4m@i) nn3sp <- Matrix::drop0(matrix(c( 0, 0, 0, 0, 0.1414212, 0.6164416, 0, 0.1732050, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.1732050, 0, 0.3000002, 0, 0.2449490, 0, 0, 0.2645751, 0, 0.4358897, 0, 0, 0, 0.2449490, 0, 0, 0, 0.3316627, 0, 0.2999998, 0.3162279, 0.1414212, 0, 0, 0, 0, 0.6164416, 0, 0.2236066, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2645751, 0, 0, 0, 0, 0, 0, 0, 0.1732050, 0, 0, 0, 0.2236066, 0, 0, 0, 0, 0, 0, 0, 0, 0.2999998, 0, 0, 0, 0, 0, 0, 0, 0.1732050, 0, 0, 0, 0, 0, 0, 0, 0 ), nrow = 10, byrow = TRUE)) expected_sknn3m <- Matrix::drop0(matrix(c( 0, 0, 0, 0, 1.0000000, 0, 0, 0.5849702, 0, 0, 0, 0, 0.5849609, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1.0000000, 0, 0, 0.5849651, 0, 0, 0, 0, 0, 1.0000000, 0, 0, 0, 0, 0, 0.5849684, 0, 1, 0, 0, 0, 0, 0, 0, 0.5849684, 0, 0, 1, 0, 0, 0, 1.0000000, 0, 0, 0, 0, 0, 0, 0, 1.0000000, 0.5849615, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0.5849545, 0, 0, 0, 0, 0, 0, 0, 0.5849692, 1.0000000, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0.5849544, 0, 0, 0, 0, 0, 0 ), nrow = 10, byrow = TRUE)) sknn3msp <- smooth_knn_matrix(nn3sp)$matrix expect_equal(sknn3msp@x, expected_sknn3m@x, tol = 1e-6) expect_equal(sknn3msp@i, expected_sknn3m@i) nn34sp <- nn4sp nn34sp[, c(2, 4, 6)] <- nn3sp[, c(2, 4, 6)] expected_sknn34m <- Matrix::drop0(matrix(c( 0, 0, 0, 0, 1.0000000, 0, 0, 0.8380850, 0, 0.1619081, 0, 0, 0.5849609, 0, 0, 0, 0, 0, 0, 1.0000000, 0, 0.3279687, 0, 1.0000000, 0, 0, 0.6720329, 0, 0, 0, 0, 0, 1.0000000, 0, 0, 0, 0, 0, 0.584968, 0, 1, 0, 0, 0, 0, 0, 0.2807281, 0.7192646, 0, 0, 1, 0, 0, 0, 1.0000000, 0, 0, 0, 0, 0, 0, 0, 1.0000000, 0.6462529, 0, 0, 0, 0.3537518, 0, 0, 1, 0, 0, 0, 0.6894085, 0, 0, 0, 0, 0.3105906, 0, 0.4251747, 0.5748251, 1.0000000, 0, 0, 0, 0, 0, 0, 0, 1.0000000, 0.4999980, 0.4999980, 0, 0, 0, 0, 0, 0 ), nrow = 10, byrow = TRUE)) sknn34msp <- smooth_knn_matrix(nn34sp, ret_sigma = TRUE) expect_equal(sknn34msp$matrix@x, expected_sknn34m@x, tol = 1e-6) expect_equal(sknn34msp$matrix@i, expected_sknn34m@i) 
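# A minimal sketch (pure R; the search bounds, iteration cap, and the
# local_connectivity = 1 simplification are assumptions, not the tested C++
# code) of the smooth kNN calibration these expected matrices encode: find
# sigma so that the total membership sum_j exp(-max(d_j - rho, 0) / sigma)
# over a point's k neighbor distances hits log2(k), with rho the distance to
# the nearest non-self neighbor.
smooth_knn_sigma <- function(d, target = log2(length(d)), n_iter = 64,
                             tol = 1e-5) {
  rho <- min(d[d > 0])
  lo <- 0
  hi <- Inf
  sigma <- 1
  for (i in seq_len(n_iter)) {
    val <- sum(exp(-pmax(d - rho, 0) / sigma))
    if (abs(val - target) < tol) {
      break
    }
    if (val > target) {
      # memberships sum too high -> tighten the kernel
      hi <- sigma
      sigma <- (lo + sigma) / 2
    } else {
      lo <- sigma
      sigma <- if (is.finite(hi)) (lo + hi) / 2 else sigma * 2
    }
  }
  list(sigma = sigma, rho = rho)
}
# e.g. smooth_knn_sigma(c(0, 0.14, 0.17, 0.47)) for one row of a k = 4
# neighbor distance matrix that includes the self-distance of 0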
expect_equal(sknn34msp$n_failures, 1) expect_equal(sknn34msp$sigma, c(0.1799393, 0.2364655, 0.0493803, 0.1026688, 0.2494049, 1.0000000, 0.1536713, 0.1355171, 0.2454262, 0.2063370), tol = 1e-7 ) expect_equal(sknn34msp$rho, expected_rho, tol = 1e-6) uwot/tests/testthat/test_pca.R0000644000176200001440000000370114730166740016210 0ustar liggesuserslibrary(uwot) context("PCA") iris10prcomp <- prcomp(iris10, retx = TRUE, center = TRUE, scale. = FALSE) test_that("PCA initialization", { iris10_pca_scores <- pca_init(iris10, ndim = 2) suppressWarnings(iris10_irlba_scores <- irlba_scores(iris10, ncol = 2)) expect_equal(abs(iris10prcomp$x[, 1:2]), abs(iris10_pca_scores), check.attributes = FALSE ) expect_equal(abs(iris10prcomp$x[, 1:2]), abs(iris10_irlba_scores), check.attributes = FALSE ) suppressWarnings(iris10_svdr_scores <- irlba_svdr_scores(iris10, ncol = 2)) expect_equal(abs(iris10prcomp$x[, 1:2]), abs(iris10_svdr_scores), check.attributes = FALSE ) }) test_that("1 component initialization works", { expect_ok_matrix(pca_init(iris10, ndim = 1), nc = 1) }) test_that("PCA returns model data", { iris10_pca_scores <- pca_init(iris10, ndim = 2, ret_extra = TRUE) expect_equal(abs(iris10prcomp$x[, 1:2]), abs(iris10_pca_scores$scores), check.attributes = FALSE ) expect_equal(abs(iris10prcomp$rotation[, 1:2]), abs(iris10_pca_scores$rotation), check.attributes = FALSE ) expect_equal(abs(iris10prcomp$center), abs(iris10_pca_scores$center), check.attributes = FALSE ) suppressWarnings(iris10_irlba_scores <- irlba_scores(iris10, ncol = 2, ret_extra = TRUE )) expect_equal(abs(iris10prcomp$x[, 1:2]), abs(iris10_irlba_scores$scores), check.attributes = FALSE ) expect_equal(abs(iris10prcomp$rotation[, 1:2]), abs(iris10_irlba_scores$rotation), check.attributes = FALSE ) expect_equal(abs(iris10prcomp$center), abs(iris10_irlba_scores$center), check.attributes = FALSE ) }) test_that("logical pca ok", { set.seed(1337) random_logical <- matrix(rnorm(1000), nrow = 100) > 0.5 random_int <- random_logical * 1 expect_equal(abs(irlba_scores(random_logical, ncol = 2)), abs(irlba_scores(random_int, ncol = 2))) }) uwot/tests/testthat/test_scale.R0000644000176200001440000000345314730166740016540 0ustar liggesuserslibrary(uwot) context("Scaling") iris10_sd <- apply(iris10, 2, sd) iris10_mean <- apply(iris10, 2, mean) iris10_none <- scale_input(iris10, scale_type = FALSE) expect_equal(apply(iris10_none, 2, sd), iris10_sd) expect_equal(apply(iris10_none, 2, mean), iris10_mean) iris10_scale <- scale_input(iris10, scale_type = TRUE) expect_equal(apply(iris10_scale, 2, sd), rep(1, 4), check.attributes = FALSE) expect_equal(apply(iris10_scale, 2, mean), rep(0, 4), check.attributes = FALSE) # "scale" and "z" and TRUE are synonyms expect_equal(scale_input(iris10, scale_type = "scale"), iris10_scale) expect_equal(scale_input(iris10, scale_type = "Z"), iris10_scale) iris10_maxabs <- scale_input(iris10, scale_type = "maxabs") expect_equal(apply(iris10_maxabs, 2, mean), rep(0, 4), check.attributes = FALSE) expect_equal(max(abs(iris10_maxabs)), 1) iris10_range <- scale_input(iris10, scale_type = "range") expect_equal(max(iris10_range), 1) expect_equal(min(iris10_range), 0) iris10_colrange <- scale_input(iris10, scale_type = "colrange") expect_equal(apply(iris10_colrange, 2, max), rep(1, 4), check.attributes = FALSE) expect_equal(apply(iris10_colrange, 2, min), rep(0, 4), check.attributes = FALSE) test_that("scaling applied outside umap should not appear in the model", { iris10s <- scale(iris10) iris10s_umap0 <- umap( iris10s, n_neighbors = 
4, n_epochs = 0, init = "rand", ret_model = TRUE, scale = FALSE ) expect_null(iris10s_umap0$scale_info) iris10s_umap0s <- umap( iris10s, n_neighbors = 4, n_epochs = 0, init = "rand", ret_model = TRUE, scale = TRUE ) expect_equal( names(iris10s_umap0s$scale_info), c("scaled:center", "scaled:scale", "scaled:nzvcols") ) }) uwot/tests/testthat/test_rand_init.R0000644000176200001440000000064213631262762017415 0ustar liggesuserslibrary(uwot) context("Random initialization") n_vertices <- 10 res_rand <- rand_init(n_vertices, ndim = 2, verbose = FALSE) expect_ok_matrix(rand_init(n_vertices, ndim = 2, verbose = FALSE)) expect_ok_matrix(rand_init_lv(n_vertices, ndim = 2, verbose = FALSE)) expect_ok_matrix(rand_init(n_vertices, ndim = 1, verbose = FALSE), nc = 1) expect_ok_matrix(rand_init_lv(n_vertices, ndim = 1, verbose = FALSE), nc = 1) uwot/tests/testthat/test_transform.R0000644000176200001440000003254214730166740017465 0ustar liggesuserslibrary(uwot) library(RSpectra) context("Transform") diagonal1s <- as(Matrix::drop0(diag(1, nrow(V_asymm), ncol(V_asymm))), "generalMatrix") graph <- V_asymm + diagonal1s dV <- as.matrix(graph) vdV <- as.vector(t(dV)) dgraph <- matrix(vdV[vdV > 0], byrow = TRUE, nrow = 10) dgraph <- apply(dgraph, 1, function(x) { sort(x, decreasing = TRUE) }) graph <- Matrix::t(graph) nnt <- nn_graph_t(nn) train_embedding <- t(matrix(1:20, nrow = 10)) av <- t(matrix(c( 6.00, 16.00, 4.75, 14.75, 4.00, 14.00, 6.50, 16.50, 5.25, 15.25, 5.00, 15.00, 5.50, 15.50, 6.00, 16.00, 4.50, 14.50, 4.75, 14.75 ), nrow = 10, byrow = TRUE)) embedding <- init_new_embedding(train_embedding, as.vector(nnt$idx), ncol(nnt$idx), graph = NULL, weighted = FALSE, n_threads = 0, verbose = FALSE ) expect_equal(embedding, av, check.attributes = FALSE) wav <- t(matrix(c( 4.774600, 14.77460, 5.153800, 15.15380, 4.120000, 14.12000, 5.485100, 15.48510, 4.573100, 14.57310, 4.000000, 14.00000, 5.138362, 15.13836, 5.184333, 15.18433, 5.191600, 15.19160, 5.166667, 15.16667 ), nrow = 10, byrow = TRUE)) embedding <- init_new_embedding(train_embedding, as.vector(nnt$idx), ncol(nnt$idx), graph = dgraph, weighted = TRUE, n_threads = 0, verbose = FALSE ) expect_equal(embedding, wav, check.attributes = FALSE, tol = 1e-5) # Check threaded code embedding <- init_new_embedding(train_embedding, as.vector(nnt$idx), ncol(nnt$idx), graph = NULL, weighted = FALSE, n_threads = 1, verbose = FALSE ) expect_equal(embedding, av, check.attributes = FALSE) embedding <- init_new_embedding(train_embedding, as.vector(nnt$idx), ncol(nnt$idx), graph = dgraph, weighted = TRUE, n_threads = 1, verbose = FALSE ) expect_equal(embedding, wav, check.attributes = FALSE, tol = 1e-5) iris10_range <- scale_input(iris10, scale_type = "range", ret_model = TRUE) iris10_rtrans <- apply_scaling(iris10, attr_to_scale_info(iris10_range)) expect_equal(iris10_range, iris10_rtrans, check.attributes = FALSE) iris10_maxabs <- scale_input(iris10, scale_type = "maxabs", ret_model = TRUE) iris10_matrans <- apply_scaling(iris10, attr_to_scale_info(iris10_maxabs)) expect_equal(iris10_maxabs, iris10_matrans, check.attributes = FALSE) iris10_scale <- scale_input(iris10, scale_type = "scale", ret_model = TRUE) iris10_strans <- apply_scaling(iris10, attr_to_scale_info(iris10_scale)) expect_equal(iris10_scale, iris10_strans, check.attributes = FALSE) iris10_zv_col <- iris10 iris10_zv_col[, 3] <- 10 iris10zvc_scale <- scale_input(iris10_zv_col, scale_type = "scale", ret_model = TRUE ) # scale the original iris10 here on purpose to check that full-variance column # is 
correctly removed iris10_zvstrans <- apply_scaling(iris10, attr_to_scale_info(iris10zvc_scale)) expect_equal(iris10zvc_scale, iris10_zvstrans, check.attributes = FALSE) iris10_none <- scale_input(iris10, scale_type = FALSE, ret_model = TRUE) expect_null(attr_to_scale_info(iris10_none)) iris10_colrange <- scale_input(iris10, scale_type = "colrange", ret_model = TRUE) iris10_crtrans <- apply_scaling(iris10, attr_to_scale_info(iris10_colrange)) expect_equal(iris10_colrange, iris10_crtrans, check.attributes = FALSE) # test pca transform works iris10pca <- pca_init(iris10, ndim = 2, ret_extra = TRUE) iris10pcat <- apply_pca(iris10, iris10pca) expect_equal(iris10pca$scores, iris10pcat, check.attributes = FALSE) # #64 (and some #81) test_that("can use pre-calculated neighbors in transform", { set.seed(1337) X_train <- as.matrix(iris[c(1:10, 51:60), -5]) X_test <- as.matrix(iris[101:110, -5]) iris_train_nn <- annoy_nn( X = X_train, k = 4, metric = "euclidean", n_threads = 0, ret_index = TRUE ) # (81) test row names are found if it's just the dist matrix of the NN graph row.names(iris_train_nn$dist) <- row.names(X_train) iris_umap_train <- umap( X = NULL, nn_method = iris_train_nn, ret_model = TRUE, n_neighbors = 4 ) expect_equal(row.names(iris_umap_train$embedding), row.names(X_train)) query_ref_nn <- annoy_search( X = X_test, k = 4, ann = iris_train_nn$index, n_threads = 0 ) # (81) test row names are found if it's just the index matrix of the NN graph row.names(query_ref_nn$dist) <- row.names(X_test) iris_umap_test <- umap_transform( X = NULL, model = iris_umap_train, nn_method = query_ref_nn, ret_extra = c("nn") ) expect_ok_matrix(iris_umap_test$embedding) expect_equal(row.names(iris_umap_test$embedding), row.names(X_test)) expect_equal(iris_umap_test$nn$precomputed$idx, query_ref_nn$idx) expect_equal(iris_umap_test$nn$precomputed$dist, query_ref_nn$dist) # also test that we can provide our own input and it's unchanged with 0 epochs nr <- nrow(query_ref_nn$idx) nc <- ncol(iris_umap_train$embedding) test_init <- matrix(rnorm(nr * nc), nrow = nr, ncol = nc) # set init row name and then set the NN dist names back to NULL to test # we can get row names from init matrix if needed row.names(test_init) <- row.names(X_test) row.names(query_ref_nn$dist) <- NULL iris_umap_test_rand0 <- umap_transform( X = NULL, model = iris_umap_train, nn_method = query_ref_nn, init = test_init, n_epochs = 0 ) expect_equal(iris_umap_test_rand0, test_init) }) test_that("equivalent results with nn graph or sparse distance matrix", { set.seed(42) iris_even <- iris[seq(2, 75, 2), ] iris_odd <- iris[seq(1, 25, 2), ] iris_even_nn <- uwot:::annoy_nn( X = uwot:::x2m(iris_even), k = 10, metric = "euclidean", ret_index = TRUE ) row.names(iris_even_nn$idx) <- row.names(iris_even) row.names(iris_even_nn$dist) <- row.names(iris_even) iris_odd_nn <- annoy_search( X = uwot:::x2m(iris_odd), k = 10, ann = iris_even_nn$index ) row.names(iris_odd_nn$idx) <- row.names(iris_odd) row.names(iris_odd_nn$dist) <- row.names(iris_odd) iris_even_nn$index <- NULL iris_even_umap <- umap( X = NULL, nn_method = iris_even_nn, ret_model = TRUE ) set.seed(42) iris_odd_transform_nn_graph <- umap_transform(X = NULL, iris_even_umap, nn_method = iris_odd_nn) expect_ok_matrix(iris_odd_transform_nn_graph, nrow(iris_odd), 2) expect_equal(row.names(iris_odd_transform_nn_graph), row.names(iris_odd)) iris_odd_nn_sp <- t(uwot:::nng_to_sparse(iris_odd_nn$idx, as.vector(iris_odd_nn$dist), self_nbr = FALSE, max_nbr_id = nrow(iris_even) )) row.names(iris_odd_nn_sp) <- 
row.names(iris_even_umap$embedding) colnames(iris_odd_nn_sp) <- row.names(iris_odd) set.seed(42) iris_odd_transform_sp <- umap_transform( X = NULL, iris_even_umap, nn_method = iris_odd_nn_sp, ret_extra = c("nn") ) expect_ok_matrix(iris_odd_transform_sp$embedding, nrow(iris_odd), 2) expect_equal(row.names(iris_odd_transform_sp$embedding), row.names(iris_odd)) expect_equal(iris_odd_transform_sp$embedding, iris_odd_transform_nn_graph) expect_equal(iris_odd_transform_sp$nn$precomputed, iris_odd_nn_sp) }) test_that("n_components can be > n_neighbors (#102)", { train <- iris[1:20, ] test <- iris[101:110, ] set.seed(42) train_umap <- umap( train, n_components = 4, ret_model = TRUE, y = train$Petal.Length, init = "rand", n_neighbors = 3 ) set.seed(42) test_umap <- umap_transform(test, train_umap) expect_equal(dim(test_umap), c(10, 4)) }) test_that("return transform fgraph (#104)", { train <- iris[1:20, ] test <- iris[101:110, ] set.seed(42) train_umap <- umap( train, ret_model = TRUE, n_neighbors = 3 ) set.seed(42) test_umap <- umap_transform(test, train_umap, ret_extra = c("fgraph", "localr", "sigma", "nn") ) expect_is(test_umap, "list") expect_ok_matrix(test_umap$embedding) expect_equal(dim(test_umap$embedding), c(10, 2)) expect_is(test_umap$fgraph, "Matrix") expect_equal(dim(test_umap$fgraph), c(10, 20)) expect_is(test_umap$localr, "numeric") expect_is(test_umap$sigma, "numeric") expect_is(test_umap$rho, "numeric") expect_equal(length(test_umap$localr), 10) expect_equal(length(test_umap$sigma), 10) expect_equal(length(test_umap$rho), 10) expect_equal(dim(test_umap$nn$euclidean$idx), c(10, 3)) expect_equal(dim(test_umap$nn$euclidean$dist), c(10, 3)) }) # regression tests the bug reported in #103 where ai and aj were transposed and # also the data being transformed is larger than the original data # this at best leads to a wide ring structure being formed for some of the # transformed data (but may also lead to NaN or a seg fault). 
This test checks # that the transformed data doesn't cover a large range, which would be # diagnostic of the ring forming: with the error present, the range of # coordinates is around c(-40, 40) vs c(-6, 6) otherwise test_that("leopold transform (#103)", { iris_species_12 <- iris[1:100, ] iris_species_3 <- iris[101:150, ] set.seed(42) iris_s3_leopold <- umap(iris_species_3, dens_scale = 1, ret_model = TRUE) set.seed(42) iris_s12_transform <- umap_transform(iris_species_12, iris_s3_leopold) transform_range <- range(iris_s12_transform) # as long as the coordinates aren't in the c(-40, 40) range then we are # probably ok expect_gt(transform_range[1], -10.0) expect_lt(transform_range[2], 10.0) }) test_that("can transform with binary edge weights", { iris_species_12 <- iris[1:100, ] iris_species_3 <- iris[101:150, ] set.seed(42) iris_s3 <- umap(iris_species_3, binary_edge_weights = TRUE, ret_model = TRUE) expect_true(iris_s3$binary_edge_weights) iris_s12_transform <- umap_transform(iris_species_12, iris_s3, ret_extra = c("fgraph") ) expect_true(all(iris_s12_transform$fgraph@x == 1)) }) test_that("transform can set or inherit model seed", { iris_species_12 <- iris[1:100, ] iris_species_3 <- iris[101:150, ] # transform inherits seed from model by default iris_model <- umap( iris_species_12, seed = 42, ret_model = TRUE, n_sgd_threads = 1 ) iris_transform1 <- umap_transform(iris_species_3, iris_model, n_sgd_threads = 1) iris_transform2 <- umap_transform(iris_species_3, iris_model, n_sgd_threads = 1) expect_equal(iris_transform1, iris_transform2) iris_transform3 <- umap_transform(iris_species_3, iris_model, seed = 42, n_sgd_threads = 1 ) expect_equal(iris_transform1, iris_transform3) # external seed setting should be same as internal set.seed(42) iris_model2 <- umap(iris_species_12, ret_model = TRUE, n_sgd_threads = 1 ) set.seed(42) iris_transform4 <- umap_transform(iris_species_3, iris_model2, n_sgd_threads = 1) expect_equal(iris_transform1, iris_transform4) # setting seed explicitly overrides model seed, gives different results iris_transform5 <- umap_transform(iris_species_3, iris_model, seed = 123, n_sgd_threads = 1 ) diff15 <- iris_transform1 - iris_transform5 expect_gt(sqrt(sum(diff15 * diff15) / length(diff15)), 0.01) # force model seed setting off iris_transform6 <- umap_transform(iris_species_3, iris_model, seed = FALSE, n_sgd_threads = 1 ) diff16 <- iris_transform1 - iris_transform6 expect_gt(sqrt(sum(diff16 * diff16) / length(diff16)), 0.01) iris_transform7 <- umap_transform(iris_species_3, iris_model, seed = FALSE, n_sgd_threads = 1 ) diff17 <- iris_transform1 - iris_transform7 expect_gt(sqrt(sum(diff17 * diff17) / length(diff17)), 0.01) # and transforms are different from each other diff67 <- iris_transform6 - iris_transform7 expect_gt(sqrt(sum(diff67 * diff67) / length(diff67)), 0.01) }) #118 fgraph must be transposed even if n_epochs = 0 test_that("graph dim is consistent when n_epochs = 0", { iris_species_12 <- iris[1:100, ] iris_species_3 <- iris[101:150, ] iris_model <- umap( iris_species_12, ret_model = TRUE, n_epochs = 0, batch = TRUE ) iris_transform_10 <- umap_transform(iris_species_3, iris_model, n_epochs = 10, ret_extra = "fgraph" ) iris_transform_0 <- umap_transform(iris_species_3, iris_model, n_epochs = 0, ret_extra = "fgraph" ) expect_equal( dim(iris_transform_10$fgraph), dim(iris_transform_0$fgraph) ) expect_equal(dim(iris_transform_10$fgraph), c(50, 100)) #118/129 and also without batch iris_model_no_batch <- umap( iris_species_12, ret_model = TRUE, n_epochs = 0, batch = 
FALSE ) iris_transform_10_no_batch <- umap_transform(iris_species_3, iris_model_no_batch, n_epochs = 10, ret_extra = "fgraph" ) iris_transform_0_no_batch <- umap_transform(iris_species_3, iris_model_no_batch, n_epochs = 0, ret_extra = "fgraph" ) expect_equal( dim(iris_transform_10_no_batch$fgraph), dim(iris_transform_0_no_batch$fgraph) ) expect_equal(dim(iris_transform_10_no_batch$fgraph), c(50, 100)) }) uwot/tests/testthat/test_optonly.R0000644000176200001440000000463614730166740017161 0ustar liggesuserslibrary(uwot) context("optimization only") # this mainly exists to hedge against rhub timing out during tests and to still # exercise the main umap code even with RSpectra and other dependencies # temporarily removed due to excessive compilation times. Filter on # this in testthat.R: # # test_check("uwot", filter = "optonly") # init_coords <- matrix(c( -0.286003508982688, 0.205935933716443, 0.212672369696097, 0.318264664390379, -0.290855854751177, -0.84524521577413, 0.10829983500751, -0.163970086771776, 0.611654891362094, 0.12924697210725, 0.0469197151280184, 0.226976224751362, -0.0547582725501509, -0.0373386048885834, -0.0879982948022033, -0.0168061906596455, -0.269189585603006, 0.0524409053485183, -0.0619916567707883, 0.201745760046477 ), ncol = 2) nn <- list( euclidean = list( idx = matrix( c( 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 5L, 10L, 4L, 3L, 1L, 5L, 3L, 1L, 4L, 2L, 8L, 3L, 7L, 9L, 8L, 1L, 4L, 5L, 3L, 3L, 10L, 4L, 2L, 10L, 7L, 8L, 8L, 10L, 2L, 4L ), ncol = 4 ), dist = matrix( c( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.141421356237309, 0.173205080756888, 0.244948974278318, 0.244948974278318, 0.141421356237309, 0.616441400296898, 0.264575131106459, 0.173205080756888, 0.3, 0.173205080756888, 0.173205080756888, 0.3, 0.264575131106459, 0.3, 0.223606797749979, 0.616441400296898, 0.33166247903554, 0.223606797749979, 0.435889894354067, 0.316227766016838, 0.469041575982343, 0.331662479035541, 0.3, 0.316227766016839, 0.458257569495584, 0.7, 0.424264068711929, 0.33166247903554, 0.509901951359279, 0.316227766016839 ), ncol = 4 ) ) ) res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = init_coords, verbose = FALSE, n_threads = 0, nn_method = nn ) expect_ok_matrix(res) first_coords <- c() test_callback <- function(epochs, n_epochs, coords) { first_coords <<- c(first_coords, coords[1, 1]) } set.seed(42) res <- umap(iris10, n_neighbors = 4, n_epochs = 10, learning_rate = 0.5, min_dist = 0.001, init = init_coords, verbose = FALSE, nn_method = nn, batch = TRUE, epoch_callback = test_callback, approx_pow = TRUE, n_threads = 2 ) expect_ok_matrix(res) expect_equal(length(first_coords), 10) uwot/tests/testthat/test_similarity_graph.R0000644000176200001440000000626514730166740021024 0ustar liggesuserslibrary(uwot) library(RSpectra) context("similarity graph") # #96: more convenient way to just get the high dimensional similarity graph # Hard way first (by using umap function) # allow for no initialization if n_epochs = 0 # Allowable but returns nothing of value expect_warning( res <- umap(iris10, n_neighbors = 4, init = NULL, n_epochs = 0), "will be returned" ) expect_null(res) # More sensibly, return high-dimensional data res <- umap(iris10, n_neighbors = 4, init = NULL, n_epochs = 0, ret_extra = c("fgraph") ) expect_is(res, "list") expect_null(res$embedding) expect_is(res$fgraph, "sparseMatrix") # also a bad idea but maybe ok to extract something from the return value # manually expect_warning( res_with_model <- umap(iris10, n_neighbors = 4, init = NULL, n_epochs 
= 0, ret_model = TRUE
  ),
  "will not be valid for transforming"
)
expect_is(res_with_model, "list")
expect_null(res_with_model$embedding)
# but you cannot use umap_transform with the model
expect_error(
  umap_transform(iris10, res_with_model),
  "(?i)invalid embedding coordinates"
)

# Simpler way (by using similarity_graph function)
set.seed(42)
sim_graph <- similarity_graph(iris10, n_neighbors = 4)
expect_equal(res$fgraph, sim_graph)

# can return extra data
set.seed(42)
sim_graph_extra <- similarity_graph(iris10,
  n_neighbors = 4,
  ret_extra = c("sigma", "nn")
)
expect_is(sim_graph_extra, "list")
expect_equal(sim_graph, sim_graph_extra$similarity_graph)
expect_equal(length(sim_graph_extra$sigma), 10)
expect_equal(length(sim_graph_extra$rho), 10)
expect_is(sim_graph_extra$nn$euclidean, "list")

# can use pre-computed nn instead of data
sim_graph_nn <- similarity_graph(
  nn_method = sim_graph_extra$nn,
  ret_extra = c("sigma")
)
expect_equal(sim_graph, sim_graph_nn$similarity_graph)
expect_equal(sim_graph_extra$sigma, sim_graph_nn$sigma)

# can use largevis for t-SNE-like graph
sim_graph_largevis <- similarity_graph(iris10,
  method = "largevis",
  perplexity = 5,
  ret_extra = c("sigma")
)
expect_is(sim_graph_largevis, "list")
expect_is(sim_graph_largevis$similarity_graph, "sparseMatrix")
expect_equal(dim(sim_graph_largevis$similarity_graph), c(10, 10))
expect_equal(length(sim_graph_largevis$sigma), 10)

# specific use case of bbknnR
fss <- fuzzy_simplicial_set(
  nn = sim_graph_extra$nn$euclidean,
  set_op_mix_ratio = 0.5,
  local_connectivity = 2
)
sim_graph_bbknnR <- similarity_graph(
  nn_method = sim_graph_extra$nn$euclidean,
  set_op_mix_ratio = 0.5,
  local_connectivity = 2
)
expect_equal(sim_graph_bbknnR, fss)

# supervised
sim_graphy <- similarity_graph(iris10, n_neighbors = 4, y = ynum)
expect_is(sim_graphy, "sparseMatrix")

# binary edge weights
sim_graphb <- similarity_graph(iris10, n_neighbors = 4, binary_edge_weights = TRUE)
expect_true(all(sim_graphb@x == 1))

test_that("optimize graph layout", {
  iris30 <- iris[c(1:10, 51:60, 101:110), ]
  iris30_sim_graph <- similarity_graph(iris30, n_neighbors = 10)
  set.seed(42)
  iris30_opt <- optimize_graph_layout(iris30_sim_graph, X = iris30)
  set.seed(42)
  iris30_umap <- umap(iris30, n_neighbors = 10)
  expect_equal(iris30_opt, iris30_umap)
})
uwot/tests/testthat/test_mixed_distances.R0000644000176200001440000000610114577210515020604 0ustar liggesuserslibrary(uwot)
context("mixed distance calculations")

set.seed(1337)
res <- umap(iris10,
  n_neighbors = 4, n_epochs = 2, init = "spca",
  verbose = FALSE, n_threads = 0
)
expect_ok_matrix(res)

set.seed(1337)
resmli <- umap(iris10,
  n_neighbors = 4, n_epochs = 2, init = "spca",
  metric = list("euclidean" = 1:4),
  verbose = FALSE, n_threads = 0
)
expect_equal(resmli, res)

set.seed(1337)
resmls <- umap(iris10,
  n_neighbors = 4, n_epochs = 2, init = "spca",
  metric = list("euclidean" = c(
    "Sepal.Length", "Sepal.Width",
    "Petal.Length", "Petal.Width"
  )),
  verbose = FALSE, n_threads = 0
)
expect_equal(resmls, res)

set.seed(1337)
jiris10 <- jitter(iris10)
metric2 <- list(
  "euclidean" = c(1, 2),
  "euclidean" = c("Petal.Length", "Petal.Width")
)
reseuc2 <- umap(jiris10,
  n_neighbors = 4, n_epochs = 2, init = "spca",
  metric = metric2,
  verbose = FALSE, n_threads = 0,
  ret_nn = TRUE, ret_model = TRUE
)
expect_ok_matrix(reseuc2$embedding)
expect_equal(reseuc2$metric, metric2)
expect_is(reseuc2$nn, "list")
expect_equal(names(reseuc2$nn), c("euclidean", "euclidean"))
expect_is_nn(reseuc2$nn[[1]], 10, 4)
expect_is_nn(reseuc2$nn[[2]], 10, 4)
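# Extra illustrative checks (not from the original suite): each entry in a
# multi-block metric list gets its own nearest neighbor search over just its
# columns, so the returned model carries one nn result per block -- here two
# "euclidean" blocks, each over 10 observations with 4 neighbors. These
# assume the list(idx, dist) nn graph format documented for nn_method.
expect_equal(length(reseuc2$nn), 2)
expect_equal(dim(reseuc2$nn[[1]]$idx), c(10, 4))
expect_equal(dim(reseuc2$nn[[2]]$dist), c(10, 4))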
expect_ok_matrix(umap_transform(jiris10, reseuc2)) i10factor <- factor(c(rep("foo", 3), rep("bar", 3), rep("baz", 4))) res_y2 <- umap(iris10[, -1], y = cbind(i10factor, iris$Sepal.Length[1:10]), n_neighbors = 4, n_epochs = 2, init = "spca", verbose = FALSE, n_threads = 0 ) expect_ok_matrix(res_y2) nafactor <- as.factor(c(levels(iris$Species)[ c(rep(1, 3), rep(2, 3), rep(3, 3)) ], NA)) iris10c <- cbind(data.frame(iris10), nafactor) rescat <- umap(iris10c, metric = list("euclidean" = 1:4, "categorical" = "nafactor"), n_neighbors = 4, n_epochs = 2, init = "spca", verbose = FALSE, n_threads = 0 ) expect_ok_matrix(rescat) irismixed <- data.frame(iris10, ynum, ynum2, ycat, ycat2) resmixed <- umap(irismixed, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 1, metric = list( "euclidean" = 1:4, "euclidean" = 5, "cosine" = 6, "categorical" = c("ycat", "ycat2") ) ) expect_ok_matrix(resmixed) irismixed <- data.frame(iris10, ynum, ynum2, ycat, ycat2) resmixed <- umap(irismixed, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 1, metric = list( "euclidean" = 1:4, "euclidean" = 5, "cosine" = 6, "categorical" = c("ycat", "ycat2") ), ret_model = TRUE, ret_nn = TRUE ) expect_ok_matrix(resmixed$embedding) expect_ok_matrix(umap_transform(irismixed, resmixed, n_threads = 1)) expect_equal(names(resmixed$nn), c("euclidean", "euclidean", "cosine")) # #20: allow matrix column for categorical int_column <- c(1, 2, 3, 4, 4, 4, 2, 1, 2, 1) irisic <- cbind(iris10, int_column) resic <- umap(irisic, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spca", verbose = FALSE, n_threads = 0, metric = list("euclidean" = 1:4, "categorical" = 5) ) expect_ok_matrix(resic) uwot/tests/testthat/helper_fuzzy_sets.R0000644000176200001440000001171014730166740020171 0ustar liggesusers# fuzzy set data for iris10 with 4 neighbors # numbers have been compared with python fuzzy_simplicial_set # Asymmetric fuzzy set data V_asymm <- Matrix::sparseMatrix( i = c( 5, 6, 8, 3, 9, 10, 2, 4, 7, 9, 10, 2, 3, 7, 9, 10, 1, 6, 8, 3, 5, 1, 5, 6, 7, 4, 1, 2, 4, 8 ), j = c( 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 7, 7, 8, 8, 8, 8, 9, 10, 10, 10, 10 ), x = c( 1, 1, 1, 0.328, 0.4252, 1, 0.5386, 1, 1, 0.5748, 0.5, 0.4614, 1, 0.6463, 1, 0.5, 1, 1, 0.6894, 0.672, 0.2807, 0.8381, 0.7193, 5.129e-10, 0.3538, 0.5447, 0.1619, 1, 0.4553, 0.3106 ) ) # Fuzzy Set Union V_union <- Matrix::sparseMatrix( i = c( 5, 6, 8, 10, 3, 4, 9, 10, 2, 4, 7, 9, 10, 2, 3, 7, 9, 10, 1, 6, 7, 8, 1, 5, 8, 3, 4, 5, 8, 1, 5, 6, 7, 10, 2, 3, 4, 1, 2, 3, 4, 8 ), j = c( 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 10 ), x = c( 1, 1, 1, 0.1619, 0.6899, 0.4614, 0.4252, 1, 0.6899, 1, 1, 0.5748, 0.5, 0.4614, 1, 0.6463, 1, 0.7277, 1, 1, 0.2807, 0.9128, 1, 1, 5.129e-10, 1, 0.6463, 0.2807, 0.3538, 1, 0.9128, 5.129e-10, 0.3538, 0.3106, 0.4252, 0.5748, 1, 0.1619, 1, 0.5, 0.7277, 0.3106 ) ) # mix intersection with union V_mix <- Matrix::sparseMatrix( i = c( 5, 6, 8, 10, 3, 4, 9, 10, 2, 4, 7, 9, 10, 2, 3, 7, 9, 10, 1, 6, 7, 8, 1, 5, 8, 3, 4, 5, 8, 1, 5, 6, 7, 10, 2, 3, 4, 1, 2, 3, 4, 8 ), j = c( 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 10 ), x = c( 1, 0.5, 0.919, 0.08095, 0.4333, 0.2307, 0.2126, 1, 0.4333, 1, 0.836, 0.2874, 0.25, 0.2307, 1, 0.3231, 0.7723, 0.4777, 1, 0.5, 0.1404, 
0.7043, 0.5, 0.5, 2.564e-10, 0.836, 0.3231, 0.1404, 0.1769, 0.919, 0.7043, 2.564e-10, 0.1769, 0.1553, 0.2126, 0.2874, 0.7723, 0.08095, 1, 0.25, 0.4777, 0.1553 ) ) # intersection V_intersect <- Matrix::sparseMatrix( i = c(5, 8, 3, 10, 2, 4, 7, 3, 9, 10, 1, 8, 3, 1, 5, 4, 2, 4), j = c(1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 7, 8, 8, 9, 10, 10), x = c( 1, 0.8381, 0.1766, 1, 0.1766, 1, 0.672, 1, 0.5447, 0.2277, 1, 0.4959, 0.672, 0.8381, 0.4959, 0.5447, 1, 0.2277 ) ) # asymm with local connectivity = 1.5 V_asymm_local <- Matrix::sparseMatrix( i = c( 5, 6, 8, 3, 9, 10, 2, 4, 7, 9, 10, 2, 3, 7, 9, 10, 1, 6, 8, 3, 5, 1, 5, 6, 7, 4, 1, 2, 4, 8 ), j = c( 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 7, 7, 8, 8, 8, 8, 9, 10, 10, 10, 10 ), x = c( 1, 1, 1, 0.2559, 0.3748, 1, 0.5698, 1, 1, 0.6252, 0.5, 0.4302, 1, 0.7157, 1, 0.5, 1, 1, 0.7622, 0.7441, 0.2084, 0.8925, 0.7916, 5.129e-10, 0.2843, 0.5797, 0.1075, 1, 0.4203, 0.2378 ) ) V_union_local <- Matrix::sparseMatrix( i = c( 5, 6, 8, 10, 3, 4, 9, 10, 2, 4, 7, 9, 10, 2, 3, 7, 9, 10, 1, 6, 7, 8, 1, 5, 8, 3, 4, 5, 8, 1, 5, 6, 7, 10, 2, 3, 4, 1, 2, 3, 4, 8 ), j = c( 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 10 ), x = c( 1, 1, 1, 0.1075, 0.6799, 0.4302, 0.3748, 1, 0.6799, 1, 1, 0.6252, 0.5, 0.4302, 1, 0.7157, 1, 0.7102, 1, 1, 0.2084, 0.9504, 1, 1, 5.129e-10, 1, 0.7157, 0.2084, 0.2843, 1, 0.9504, 5.129e-10, 0.2843, 0.2378, 0.3748, 0.6252, 1, 0.1075, 1, 0.5, 0.7102, 0.2378 ) ) # NB have to modify UMAP source code to allow bandwidth to be specified # umap.umap_.fuzzy_simplicial_set(irisxy[0][5:15], 4, random_state=42, metric="euclidean", bandwidth=0.5, set_op_mix_ratio=1)[0] V_union_bandwidth <- Matrix::sparseMatrix( i = c( 2, 5, 9, 2, 3, 4, 6, 8, 0, 1, 4, 5, 6, 9, 1, 7, 8, 1, 2, 6, 7, 0, 2, 9, 1, 2, 4, 7, 3, 4, 6, 8, 1, 3, 7, 0, 2, 5 ), j = c( 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9, 9, 9 ), x = c( 1.486e-10, 1, 5.145e-06, 1.237e-07, 4.152e-23, 1.04e-10, 1, 3.123e-08, 1.486e-10, 1.237e-07, 9.861e-07, 2.897e-09, 1, 2.381e-26, 4.152e-23, 1.074e-07, 1, 1.04e-10, 9.861e-07, 2.353e-10, 1, 1, 2.897e-09, 1, 1, 1, 2.353e-10, 1.194e-08, 1.074e-07, 1, 1.194e-08, 2.639e-07, 3.123e-08, 1, 2.639e-07, 5.145e-06, 2.381e-26, 1 ), index1 = FALSE ) # umap.umap_.fuzzy_simplicial_set(irisxy[0][5:15], 4, random_state=42, metric="euclidean", local_connectivity=1.5, bandwidth=0.5, set_op_mix_ratio=0)[0] V_intersect_local_bandwidth <- Matrix::sparseMatrix( i = c( 5, 9, 2, 6, 1, 4, 6, 7, 8, 2, 6, 7, 0, 9, 1, 2, 4, 3, 4, 3, 0, 5 ), j = c( 0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 9, 9 ), x = c( 1, 3.079e-15, 6.166e-24, 3.211e-09, 6.166e-24, 1.53e-15, 1, 2.323e-16, 1, 1.53e-15, 5.479e-30, 1, 1, 1.673e-18, 3.211e-09, 1, 5.479e-30, 2.323e-16, 1, 1, 3.079e-15, 1.673e-18 ), index1 = FALSE ) uwot/tests/testthat/test_supervised.R0000644000176200001440000001065414730166740017643 0ustar liggesuserscontext("Supervised") # categorical y res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 1, y = ycat ) expect_ok_matrix(res) # numeric y res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 1, y = ynum ) expect_ok_matrix(res) # mixed categorical and numeric res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", 
verbose = FALSE, n_threads = 1, y = data.frame(ycat, ynum) ) expect_ok_matrix(res) # multiple categorical y res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 1, y = data.frame(ycat, ycat2) ) expect_ok_matrix(res) # multiple numeric y res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 1, y = data.frame(ynum, ynum2) ) expect_ok_matrix(res) # multiple numeric and categorical res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 1, y = data.frame(ynum, ynum2, ycat, ycat2) ) expect_ok_matrix(res) # multiple numeric with different metrics and categorical set.seed(1337) res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 1, target_metric = list("euclidean" = 1, "cosine" = 2), target_weight = 0.5, y = data.frame(ynum, ynum2, ycat, ycat2) ) expect_ok_matrix(res) sm <- Matrix::drop0(matrix(c( -0.9183907, -1.4071020, 0.70400164, 0.4990913, -0.1631884, -0.03232201, 0.2156861, 0.4341653, 0.92592670 ), byrow = TRUE, nrow = 3)) # make matrix positive and symmetric like typical UMAP fuzzy graph sms <- (Matrix::t(sm) + sm)^2 expected <- matrix(c( 1, 1, 0.43551409, 1, 0.24170431, 0.23371835, 0.43551409, 0.23371835, 1 ), byrow = TRUE, nrow = 3) # checked against python version expect_equal(as.matrix(reset_local_connectivity(sms)), expected, tol = 1e-7, check.attributes = FALSE ) # tested on a modified python version with the effect n_neighbors changed expected_reset_local_metric <- matrix(c( 1, 1, 0.5972302, 1, 0.44010492, 0.43783589, 0.5972302, 0.43783589, 1 ), byrow = TRUE, nrow = 3) expect_equal( as.matrix( reset_local_connectivity(sms, reset_local_metric = TRUE, num_local_metric_neighbors = 3) ), expected_reset_local_metric, tol = 1e-7, check.attributes = FALSE ) expect_equal( as.matrix( reset_local_connectivity(sms, reset_local_metric = TRUE, num_local_metric_neighbors = 3, n_threads = 2) ), expected_reset_local_metric, tol = 1e-7, check.attributes = FALSE ) sparr <- new("dgCMatrix", i = c(0L, 2L, 0L, 1L, 2L, 0L, 1L), p = c(0L, 2L, 5L, 7L), Dim = c(3L, 3L), Dimnames = list(NULL, NULL), x = c( 0.918390745913514, 0.215686070576616, 1.40710203887692, 0.163188411813119, 0.434165332563817, 0.704001636268765, 0.0323220081795518 ), factors = list() ) sparr2 <- new("dgCMatrix", i = c(0L, 1L, 2L, 1L, 2L, 0L, 1L), p = c(0L, 3L, 5L, 7L), Dim = c(3L, 3L), Dimnames = list(NULL, NULL), x = c( 1.68463092, 2.91620546, 0.26469792, 1.08820257, 0.96444675, 1.46399222, 2.72643589 ), factors = list() ) # Numbers taken from Python implementation int09 <- general_simplicial_set_intersection(sparr, sparr2, 0.9) res09 <- matrix( c( 1.66877087146, 0.137467853888, 1.40799953091, 1.84399206494, 0.889673751622, 1.86201852389, 0.223218799442, 0.879058365893, 0.000000 ), nrow = 3, byrow = TRUE ) expect_equal(as.matrix(int09), res09, check.attributes = FALSE, tol = 1e-6) int01 <- general_simplicial_set_intersection(sparr, sparr2, 0.1) res01 <- matrix( c( 0.97318335824, 1.12392924757, 0.734457833761, 0.0182018202924, 0.164728272878, 0.0361324854953, 0.186072986202, 0.432422466467, 0.000000 ), nrow = 3, byrow = TRUE ) expect_equal(as.matrix(int01), res01, check.attributes = FALSE, tol = 1e-6) sp34 <- Matrix::drop0(matrix(nrow = 3, byrow = TRUE, c( 0, 0.7403984, 0, 0.6574427, 0, 0, 0.9472488, 0, 0, 0.3039677, 
0.2868714, 0 ))) expect_equal(colMaxs(sp34), c(0, 0.7403984, 0.9472488, 0.6574427)) uwot/tests/testthat.R0000644000176200001440000000007014151727010014370 0ustar liggesuserslibrary(testthat) library(uwot) test_check("uwot") uwot/MD50000644000176200001440000001152314757010752011572 0ustar liggesuserscbb461e87e8e661e767a6b30080b27d8 *DESCRIPTION 79750f29f16894c37d02cf88befb74c1 *NAMESPACE 921bcfdf18213458042367217501c658 *NEWS.md f33284f95fc731405545f966d12befd7 *R/RcppExports.R 15a2fa771ba77dd610db90bb7a5d2046 *R/affinity.R b1486dbe304023c715057f3fddf5b132 *R/bigstatsr_init.R 0c37a6c9bb59ceb19f000b00c76d1bd4 *R/init.R 9c345b75096909979b7e3c4224a0e185 *R/neighbors.R 703c21d6c4d64ab3db43177288f7e375 *R/nn_hnsw.R b0ac2ad778b792a7748dbda182bef869 *R/nn_nndescent.R 985b71a797a7361f6548468e8733446e *R/rspectra_init.R 95bf23b1ab296e32625b995d87559ef5 *R/supervised.R c5aebbf1335625d997e780b2d202adaf *R/transform.R cc9770b0bdbf219e8c0402766e13ff49 *R/umap2.R 322eb0ceadd9bcff09df063da3370355 *R/util.R b3f31a8cbeca094e4440ee3d6f753586 *R/uwot.R 1dda19015f0aa969acee990c150e3ac9 *build/partial.rdb 1d5d7f832bdfeea809014c69e2a5fa81 *build/vignette.rds e45cd4c83e64fd46ceb9d6fd7406c981 *inst/doc/uwot.R 2d2e2e5f167538b58728154d0be691a5 *inst/doc/uwot.Rmd ee54e11c96a0378eddc3f7d142d4a8ec *inst/doc/uwot.html 339aef829029a0cb11f5700f9c03fc26 *inst/include/RcppPerpendicular.h 93a073d84957f140072fc866ec94f528 *inst/include/uwot/connected_components.h d23152faf24dfd44b1e8f09d686312f2 *inst/include/uwot/coords.h a7fa4dcdc1d78eba6c59eb5aae3c2084 *inst/include/uwot/epoch.h 229c807657a00aa4e83c277dbd4d274d *inst/include/uwot/gradient.h 9432927d2fa0d5a320239260ac4ba724 *inst/include/uwot/optimize.h e99ac5d7d544cc67b9d03dcec36f5619 *inst/include/uwot/perplexity.h 9357a3dcfb819928a204d6f5b0e31300 *inst/include/uwot/sampler.h c0d9730df4fc49aba4b46ca253c4d55b *inst/include/uwot/smooth_knn.h 6210dab463400d48e2e8e33889717a85 *inst/include/uwot/supervised.h 34abc73c7083073f75a2aa139505c9e3 *inst/include/uwot/tauprng.h 4a2d3af056e8c6908a12acaaf42f5a5f *inst/include/uwot/transform.h f546426d50fdf9f2df735c8a2414b22f *inst/include/uwot/update.h c32085b1cc137075587fe5bc820a362f *man/figures/mnist-r.png ddf7b457a1f83fcafa201c8c66a2083c *man/load_uwot.Rd fde260e1e642a0e4b190902b4208c51c *man/lvish.Rd b09441fcda53250ad263ab5a6d176c78 *man/optimize_graph_layout.Rd 1d4632b8cb23651627cbd3645cdc85fd *man/save_uwot.Rd 04e1776b3f4b080d012b2228c442e977 *man/similarity_graph.Rd cab6a6b5d1682a8ff8c50052d3f014b0 *man/simplicial_set_intersect.Rd 9b82f69bf58710ab3b8d1ec63416e557 *man/simplicial_set_union.Rd 35da7fc3a9f5ba515649374956617bc4 *man/tumap.Rd 64595cf906fefec881ecc00b0c70994a *man/umap.Rd 88756e33cd445a7da699b384608d0d8c *man/umap2.Rd cdc9330d8457646c4152756028f51020 *man/umap_transform.Rd d56654f34a88c4a5a49e3ba5cc748359 *man/unload_uwot.Rd 7768c3651f82a95496d4f65ee856ba4e *src/Makevars fa9f9e35f33c1572ea79177f53119e10 *src/RcppExports.cpp 8b804bf880076d43811bb6374d8ccaea *src/connected_components.cpp 4fd23a7e07e3cbe61c23d3aa982dae11 *src/nn_parallel.cpp 9c34f2022b946d9449fb182d22474994 *src/nn_parallel.h 910d3403f766531a656ae6e0b974aacb *src/perplexity.cpp f0aa578f1beaba6bb9a0c2e733c861dd *src/r_uwot.cpp 11d9a37925b4d7a26c762d501364a8e5 *src/rng.h 6cc250899c718da3b3f6e4847399b711 *src/rparallel.h bb708eb4c2d72950a5d0044d3fdf62dd *src/rprogress.h f3c153e4f203e59f59b5c9b4f8dad992 *src/smooth_knn.cpp d6597b1c2732030dc11e24a9ebddb997 *src/supervised.cpp 006cde5c7ed3f8a7a741ce89241fef15 *src/thread.cpp 5d3e624eeac762b94f41e03a8c180244 
*src/transform.cpp d5b47dc43e1e70a8e4159fc6f8e12ae0 *tests/testthat.R 7ae1a61d3c952b969fbff38335cb9fad *tests/testthat/helper_data.R e21b907bf3227d9faebf36514a60b0d7 *tests/testthat/helper_fuzzy_sets.R f402a4131e20c4be58bcf488330fa2b5 *tests/testthat/test_curve.R 4ecbd44ecf8cca9b6ff86200140d9384 *tests/testthat/test_epochs.R 910d9ea993fa431e94cae7cda5426ac3 *tests/testthat/test_errors.R 82c05ece9957193b30681f814201f581 *tests/testthat/test_fuzzy_simplicial_set.R 9014e3ae723d9c8a61ad7b4dda9a3f53 *tests/testthat/test_knn_aff.R 887e3f7cb9d5f4b331ad641887536649 *tests/testthat/test_mixed_distances.R 2b3e2f2c13aaf1cb9dd885328d36dd64 *tests/testthat/test_neighbors.R f7214d4f2848c4a2e092f72be3e83300 *tests/testthat/test_normlaplacian.R c8898e30a72525e409733ad8d2729072 *tests/testthat/test_optonly.R 99e63dd8c5281a7311996d4c710c8e75 *tests/testthat/test_output.R 1cadfcdc332f090bd1f8b64a516c9f08 *tests/testthat/test_pca.R 46841f21adb4b92f7e6ee4adb43dec24 *tests/testthat/test_perplexity.R e4cd686cf4caa3a594b44e99ab99cb4a *tests/testthat/test_rand_init.R 9c1387034761eb9d1459e79c815a06d0 *tests/testthat/test_saveload.R 0346cdf602760f83201477a48b2205ad *tests/testthat/test_scale.R 8dda8db7aa346817753d64ebf37d6eb8 *tests/testthat/test_similarity_graph.R 95756f7bc5d844914bafb28167f3cc8f *tests/testthat/test_smooth_knn_dists.R 529ad486ad41b7d67ca33d4740d77b0b *tests/testthat/test_spectral.R 0a8156c4592a1764721867bb272b3181 *tests/testthat/test_supervised.R f8904b4b41cd567f86f127d3ec523024 *tests/testthat/test_transform.R 1b1e72b91b75e10ee106066ff585b11a *vignettes/mnist-py.png c32085b1cc137075587fe5bc820a362f *vignettes/mnist-r.png 2d2e2e5f167538b58728154d0be691a5 *vignettes/uwot.Rmd uwot/R/0000755000176200001440000000000014757004313011455 5ustar liggesusersuwot/R/RcppExports.R0000644000176200001440000000636514757000744014107 0ustar liggesusers# Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 connected_components_undirected <- function(N, indices1, indptr1, indices2, indptr2) { .Call(`_uwot_connected_components_undirected`, N, indices1, indptr1, indices2, indptr2) } annoy_search_parallel_cpp <- function(index_name, mat, n_neighbors, search_k, metric, n_threads = 0L, grain_size = 1L) { .Call(`_uwot_annoy_search_parallel_cpp`, index_name, mat, n_neighbors, search_k, metric, n_threads, grain_size) } calc_row_probabilities_parallel <- function(nn_dist, n_vertices, perplexity, n_iter = 200L, tol = 1e-5, ret_sigma = FALSE, n_threads = 0L, grain_size = 1L) { .Call(`_uwot_calc_row_probabilities_parallel`, nn_dist, n_vertices, perplexity, n_iter, tol, ret_sigma, n_threads, grain_size) } optimize_layout_r <- function(head_embedding, tail_embedding, positive_head, positive_tail, positive_ptr, n_epochs, n_head_vertices, n_tail_vertices, epochs_per_sample, method, method_args, initial_alpha, opt_args, epoch_callback, negative_sample_rate, rng_type = "tausworthe", batch = FALSE, n_threads = 0L, grain_size = 1L, move_other = TRUE, verbose = FALSE) { .Call(`_uwot_optimize_layout_r`, head_embedding, tail_embedding, positive_head, positive_tail, positive_ptr, n_epochs, n_head_vertices, n_tail_vertices, epochs_per_sample, method, method_args, initial_alpha, opt_args, epoch_callback, negative_sample_rate, rng_type, batch, n_threads, grain_size, move_other, verbose) } smooth_knn_distances_parallel <- function(nn_dist, nn_ptr, skip_first, target, n_iter = 64L, local_connectivity = 1.0, tol = 1e-5, min_k_dist_scale = 1e-3, ret_sigma = FALSE, n_threads = 
0L, grain_size = 1L) { .Call(`_uwot_smooth_knn_distances_parallel`, nn_dist, nn_ptr, skip_first, target, n_iter, local_connectivity, tol, min_k_dist_scale, ret_sigma, n_threads, grain_size) } reset_local_metrics_parallel <- function(indptr, probabilities, n_iter = 32L, tol = 1e-5, num_local_metric_neighbors = 15.0, n_threads = 0L) { .Call(`_uwot_reset_local_metrics_parallel`, indptr, probabilities, n_iter, tol, num_local_metric_neighbors, n_threads) } fast_intersection_cpp <- function(rows, cols, values, target, unknown_dist = 1.0, far_dist = 5.0) { .Call(`_uwot_fast_intersection_cpp`, rows, cols, values, target, unknown_dist, far_dist) } general_sset_intersection_cpp <- function(indptr1, indices1, data1, indptr2, indices2, data2, result_row, result_col, result_val, mix_weight = 0.5) { .Call(`_uwot_general_sset_intersection_cpp`, indptr1, indices1, data1, indptr2, indices2, data2, result_row, result_col, result_val, mix_weight) } general_sset_union_cpp <- function(indptr1, indices1, data1, indptr2, indices2, data2, result_row, result_col, result_val) { .Call(`_uwot_general_sset_union_cpp`, indptr1, indices1, data1, indptr2, indices2, data2, result_row, result_col, result_val) } hardware_concurrency <- function() { .Call(`_uwot_hardware_concurrency`) } init_transform_parallel <- function(train_embedding, nn_index, n_test_vertices, nn_weights, n_threads = 0L, grain_size = 1L) { .Call(`_uwot_init_transform_parallel`, train_embedding, nn_index, n_test_vertices, nn_weights, n_threads, grain_size) } uwot/R/nn_nndescent.R0000644000176200001440000000676114730166740014272 0ustar liggesusersnndescent_nn <- function(X, k = 10, metric = "euclidean", nn_args = list(), n_threads = NULL, ret_index = FALSE, verbose = FALSE) { if (is.null(n_threads)) { n_threads <- default_num_threads() } if (!ret_index) { nn_knn_args <- get_nndescent_knn_args(nn_args) nn_knn_args <- lmerge( nn_knn_args, list( data = X, k = k, metric = metric, n_threads = n_threads, verbose = verbose ) ) return(do.call(rnndescent::rnnd_knn, nn_knn_args)) } ann <- nndescent_build( X, k, metric, nn_args = nn_args, n_threads = n_threads, verbose = verbose ) res <- list( idx = ann$ann$graph$idx, dist = ann$ann$graph$dist, index = ann ) res$index$ann$ann$graph <- NULL res } nndescent_build <- function(X, k, metric, nn_args = list(), n_threads = NULL, verbose = FALSE) { nn_build_args <- get_nndescent_build_args(nn_args) nn_build_args <- lmerge( nn_build_args, list( data = X, k = k, metric = metric, n_threads = n_threads, verbose = verbose ) ) index <- do.call(rnndescent::rnnd_build, nn_build_args) list( ann = index, type = "nndescentv1", metric = metric, ndim = ncol(X) ) } nndescent_search <- function(X, k, ann, nn_args = list(), n_threads = NULL, verbose = FALSE) { nn_query_args <- get_nndescent_query_args(nn_args) nn_query_args <- lmerge( nn_query_args, list( index = ann$ann, query = X, k = k, n_threads = n_threads, verbose = verbose ) ) do.call(rnndescent::rnnd_query, nn_query_args) } get_nndescent_knn_args <- function(nn_args) { nn_knn_args <- list() nnd_knn_names <- c( "use_alt_metric", "init", "n_trees", "leaf_size", "max_tree_depth", "margin", "n_iters", "delta", "max_candidates", "weight_by_degree", "low_memory" ) for (name in nnd_knn_names) { if (name %in% names(nn_args)) { nn_knn_args[[name]] <- nn_args[[name]] } } nn_knn_args } get_nndescent_build_args <- function(nn_args) { # prune_reverse should probably always be TRUE nn_build_args <- list(prune_reverse = TRUE) nnd_build_names <- c( "use_alt_metric", "init", "n_trees", "leaf_size", 
"max_tree_depth", "margin", "n_iters", "delta", "max_candidates", "weight_by_degree", "low_memory", "n_search_trees", "pruning_degree_multiplier", "diversify_prob", "prune_reverse" ) for (name in nnd_build_names) { if (name %in% names(nn_args)) { nn_build_args[[name]] <- nn_args[[name]] } } nn_build_args } get_nndescent_query_args <- function(nn_args) { nn_query_args <- list() nnd_query_names <- c( "epsilon", "max_search_fraction" ) for (name in nnd_query_names) { if (name %in% names(nn_args)) { nn_query_args[[name]] <- nn_args[[name]] } } nn_query_args } uwot/R/rspectra_init.R0000644000176200001440000000322014754234236014451 0ustar liggesusersrspectra_is_installed <- function() { is_installed("RSpectra") } rspectra_eigs_asym <- function(L, ndim) { res <- NULL suppressWarnings(res <- tryCatch( RSpectra::eigs( L, k = ndim + 1, which = "LR", opts = list(tol = 1e-4) ), error = function(c) { NULL } )) res } rspectra_eigs_sym <- function(L, ndim, verbose = FALSE, ...) { k <- ndim + 1 opt <- lmerge(list(tol = 1e-4), list(...)) suppressWarnings(res <- tryCatch( RSpectra::eigs_sym(L, k = k, which = "SM", opts = opt), error = function(c) { NULL } )) if (is.null(res) || !is.list(res) || !"vectors" %in% names(res) || is.null(res$vectors) || tryCatch( is.na(ncol(res$vectors)), error = function(e) { TRUE } ) || ncol(res$vectors) < ndim) { tsmessage("RSpectra calculation failed, retrying with shifted") if ("initvec" %in% names(opt)) { opt$initvec <- 1 / opt$initvec } suppressWarnings(res <- tryCatch( RSpectra::eigs_sym( L, k = k, which = "LM", sigma = 1e-9, opts = opt ), error = function(c) { tsmessage("RSpectra shifted calculation also failed") NULL } )) } res } rspectra_eigs_shift_sym <- function(L, ndim, verbose = FALSE, ...) { k <- ndim + 1 opts <- lmerge(list(tol = 1e-4), list(...)) suppressWarnings(res <- tryCatch( RSpectra::eigs_sym( L, k = k, which = "LM", opts = opts ), error = function(c) { NULL } )) res } uwot/R/uwot.R0000644000176200001440000072654614757000724012624 0ustar liggesusers#' Dimensionality Reduction with UMAP #' #' Carry out dimensionality reduction of a dataset using the Uniform Manifold #' Approximation and Projection (UMAP) method (McInnes et al., 2018). Some of #' the following help text is lifted verbatim from the Python reference #' implementation at \url{https://github.com/lmcinnes/umap}. #' #' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}}, #' \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}. #' Matrix and data frames should contain one observation per row. Data frames #' will have any non-numeric columns removed, although factor columns will be #' used if explicitly included via \code{metric} (see the help for #' \code{metric} for details). A sparse matrix is interpreted as a distance #' matrix, and is assumed to be symmetric, so you can also pass in an #' explicitly upper or lower triangular sparse matrix to save storage. There #' must be at least \code{n_neighbors} non-zero distances for each row. Both #' implicit and explicit zero entries are ignored. Set zero distances you want #' to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}). #' \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is #' passed to \code{nn_method}, and \code{init} is not \code{"spca"} or #' \code{"pca"}. #' @param n_neighbors The size of local neighborhood (in terms of number of #' neighboring sample points) used for manifold approximation. 
Larger values
#' result in more global views of the manifold, while smaller values result in
#' more local data being preserved. In general values should be in the range
#' \code{2} to \code{100}.
#' @param n_components The dimension of the space to embed into. This defaults
#' to \code{2} to provide easy visualization, but can reasonably be set to any
#' integer value in the range \code{2} to \code{100}.
#' @param metric Type of distance metric to use to find nearest neighbors. For
#' \code{nn_method = "annoy"} this can be one of:
#' \itemize{
#' \item \code{"euclidean"} (the default)
#' \item \code{"cosine"}
#' \item \code{"manhattan"}
#' \item \code{"hamming"}
#' \item \code{"correlation"} (a distance based on the Pearson correlation)
#' \item \code{"categorical"} (see below)
#' }
#' For \code{nn_method = "hnsw"} this can be one of:
#' \itemize{
#' \item \code{"euclidean"}
#' \item \code{"cosine"}
#' \item \code{"correlation"}
#' }
#' If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is
#' installed and \code{nn_method = "nndescent"} is specified then many more
#' metrics are available, including:
#' \itemize{
#' \item \code{"braycurtis"}
#' \item \code{"canberra"}
#' \item \code{"chebyshev"}
#' \item \code{"dice"}
#' \item \code{"hamming"}
#' \item \code{"hellinger"}
#' \item \code{"jaccard"}
#' \item \code{"jensenshannon"}
#' \item \code{"kulsinski"}
#' \item \code{"rogerstanimoto"}
#' \item \code{"russellrao"}
#' \item \code{"sokalmichener"}
#' \item \code{"sokalsneath"}
#' \item \code{"spearmanr"}
#' \item \code{"symmetrickl"}
#' \item \code{"tsss"}
#' \item \code{"yule"}
#' }
#' For more details see the package documentation of \code{rnndescent}.
#' For \code{nn_method = "fnn"}, the distance metric is always "euclidean".
#'
#' If \code{X} is a data frame or matrix, then multiple metrics can be
#' specified, by passing a list to this argument, where the name of each item in
#' the list is one of the metric names above. The value of each list item should
#' be a vector giving the names or integer ids of the columns to be included in
#' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}.
#'
#' Each metric calculation results in a separate fuzzy simplicial set; these are
#' intersected together to produce the final set. Metric names can be repeated.
#' Because non-numeric columns are removed from the data frame, it is safer to
#' use column names than integer ids.
#'
#' Factor columns can also be used by specifying the metric name
#' \code{"categorical"}. Factor columns are treated differently from numeric
#' columns and although multiple factor columns can be specified in a vector,
#' each factor column specified is processed individually. If you specify
#' a non-factor column, it will be coerced to a factor.
#'
#' For a given data block, you may override the \code{pca} and \code{pca_center}
#' arguments for that block, by providing a list with one unnamed item
#' containing the column names or ids, and then any of the \code{pca} or
#' \code{pca_center} overrides as named items, e.g. \code{metric =
#' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This
#' exists to allow mixed binary and real-valued data to be included and to have
#' PCA applied to both, but with centering applied only to the real-valued data
#' (it is typical not to apply centering to binary data before PCA is applied).
#' @param n_epochs Number of epochs to use during the optimization of the
#' embedded coordinates.
By default, this value is set to \code{500} for
#' datasets containing 10,000 vertices or fewer, and \code{200} otherwise.
#' If \code{n_epochs = 0}, then coordinates determined by \code{"init"} will
#' be returned.
#' @param scale Scaling to apply to \code{X} if it is a data frame or matrix:
#' \itemize{
#' \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling.
#' \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to
#' zero mean and variance 1.
#' \item{\code{"maxabs"}} Center each column to mean 0, then divide each
#' element by the maximum absolute value over the entire matrix.
#' \item{\code{"range"}} Range scale the entire matrix, so the smallest
#' element is 0 and the largest is 1.
#' \item{\code{"colrange"}} Scale each column in the range (0,1).
#' }
#' For UMAP, the default is \code{"none"}.
#' @param learning_rate Initial learning rate used in optimization of the
#' coordinates.
#' @param init Type of initialization for the coordinates. Options are:
#' \itemize{
#' \item \code{"spectral"} Spectral embedding using the normalized Laplacian
#' of the fuzzy 1-skeleton, with Gaussian noise added.
#' \item \code{"normlaplacian"}. Spectral embedding using the normalized
#' Laplacian of the fuzzy 1-skeleton, without noise.
#' \item \code{"random"}. Coordinates assigned using a uniform random
#' distribution between -10 and 10.
#' \item \code{"lvrandom"}. Coordinates assigned using a Gaussian
#' distribution with standard deviation 1e-4, as used in LargeVis
#' (Tang et al., 2016) and t-SNE.
#' \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap
#' (Belkin and Niyogi, 2002).
#' \item \code{"pca"}. The first two principal components from PCA of
#' \code{X} if \code{X} is a data frame, and from a 2-dimensional classical
#' MDS if \code{X} is of class \code{"dist"}.
#' \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled
#' so the standard deviation is 1e-4, to give a distribution similar to that
#' used in t-SNE. This is an alias for \code{init = "pca", init_sdev =
#' 1e-4}.
#' \item \code{"agspectral"} An "approximate global" modification of
#' \code{"spectral"} which sets all edges in the graph to a value of 1, and
#' then sets a random number of edges (\code{negative_sample_rate} edges per
#' vertex) to 0.1, to approximate the effect of non-local affinities.
#' \item A matrix of initial coordinates.
#' }
#' For spectral initializations (\code{"spectral"}, \code{"normlaplacian"},
#' \code{"laplacian"}, \code{"agspectral"}), if more than one connected
#' component is identified, no spectral initialization is attempted. Instead
#' a PCA-based initialization is attempted. If \code{verbose = TRUE} the
#' number of connected components is logged to the console. The existence of
#' multiple connected components implies that a global view of the data cannot
#' be attained with this initialization. Increasing the value of
#' \code{n_neighbors} may help.
#' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized
#' coordinates (including any user-supplied matrix) to this standard
#' deviation. By default no scaling is carried out, except when \code{init =
#' "spca"}, in which case the value is \code{0.0001}. Scaling the input may
#' help if the unscaled versions result in initial coordinates with large
#' inter-point distances or outliers. This usually results in small gradients
#' during optimization and very little progress being made to the layout.
#' Shrinking the initial embedding by rescaling can help under these
#' circumstances. Scaling the result of \code{init = "pca"} is usually
#' recommended, and \code{init = "spca"} exists as an alias for \code{init =
#' "pca", init_sdev = 1e-4}, but for the spectral initializations the scaled
#' versions usually aren't necessary unless you are using a large value of
#' \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). For
#' compatibility with recent versions of the Python UMAP package, if you are
#' using \code{init = "spectral"}, then you should also set
#' \code{init_sdev = "range"}, which will range scale each of the columns
#' containing the initial data between 0 and 10. This is not set by default to
#' maintain backwards compatibility with previous versions of uwot.
#' @param spread The effective scale of embedded points. In combination with
#' \code{min_dist}, this determines how clustered/clumped the embedded points
#' are.
#' @param min_dist The effective minimum distance between embedded points.
#' Smaller values will result in a more clustered/clumped embedding where
#' nearby points on the manifold are drawn closer together, while larger
#' values will result in a more even dispersal of points. The value should be
#' set relative to the \code{spread} value, which determines the scale at
#' which embedded points will be spread out.
#' @param set_op_mix_ratio Interpolate between (fuzzy) union and intersection as
#' the set operation used to combine local fuzzy simplicial sets to obtain a
#' global fuzzy simplicial set. Both fuzzy set operations use the product
#' t-norm. The value of this parameter should be between \code{0.0} and
#' \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while
#' \code{0.0} will use a pure fuzzy intersection.
#' @param local_connectivity The local connectivity required -- i.e. the number
#' of nearest neighbors that should be assumed to be connected at a local
#' level. The higher this value, the more connected the manifold becomes
#' locally. In practice, this should not be more than the local intrinsic
#' dimension of the manifold.
#' @param bandwidth The effective bandwidth of the kernel if we view the
#' algorithm as similar to Laplacian Eigenmaps. Larger values induce more
#' connectivity and a more global view of the data, smaller values concentrate
#' more locally.
#' @param repulsion_strength Weighting applied to negative samples in low
#' dimensional embedding optimization. Values higher than one will result in
#' greater weight being given to negative samples.
#' @param negative_sample_rate The number of negative edge/1-simplex samples to
#' use per positive edge/1-simplex sample in optimizing the low dimensional
#' embedding.
#' @param a More specific parameters controlling the embedding. If \code{NULL}
#' these values are set automatically as determined by \code{min_dist} and
#' \code{spread}.
#' @param b More specific parameters controlling the embedding. If \code{NULL}
#' these values are set automatically as determined by \code{min_dist} and
#' \code{spread}.
#' @param nn_method Method for finding nearest neighbors. Options are:
#' \itemize{
#' \item \code{"fnn"}. Use exact nearest neighbors via the
#' \href{https://cran.r-project.org/package=FNN}{FNN} package.
#' \item \code{"annoy"} Use approximate nearest neighbors via the
#' \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package.
#' \item \code{"hnsw"} Use approximate nearest neighbors with the #' Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin, #' 2018) via the #' \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package. #' \code{RcppHNSW} is not a dependency of this package: this option is #' only available if you have installed \code{RcppHNSW} yourself. Also, #' HNSW only supports the following arguments for \code{metric} and #' \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and #' \code{"correlation"}. #' \item \code{"nndescent"} Use approximate nearest neighbors with the #' Nearest Neighbor Descent method (Dong et al., 2011) via the #' \href{https://cran.r-project.org/package=rnndescent}{rnndescent} #' package. \code{rnndescent} is not a dependency of this package: this #' option is only available if you have installed \code{rnndescent} #' yourself. #' } #' By default, if \code{X} has less than 4,096 vertices, the exact nearest #' neighbors are found. Otherwise, approximate nearest neighbors are used. #' You may also pass pre-calculated nearest neighbor data to this argument. It #' must be one of two formats, either a list consisting of two elements: #' \itemize{ #' \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix #' containing the integer indexes of the nearest neighbors in \code{X}. Each #' vertex is considered to be its own nearest neighbor, i.e. #' \code{idx[, 1] == 1:n_vertices}. #' \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix #' containing the distances of the nearest neighbors. #' } #' or a sparse distance matrix of type \code{dgCMatrix}, with dimensions #' \code{n_vertices x n_vertices}. Distances should be arranged by column, #' i.e. a non-zero entry in row \code{j} of the \code{i}th column indicates #' that the \code{j}th observation in \code{X} is a nearest neighbor of the #' \code{i}th observation with the distance given by the value of that #' element. #' The \code{n_neighbors} parameter is ignored when using precomputed #' nearest neighbor data. If using the sparse distance matrix input, each #' column can contain a different number of neighbors. #' @param n_trees Number of trees to build when constructing the nearest #' neighbor index. The more trees specified, the larger the index, but the #' better the results. With \code{search_k}, determines the accuracy of the #' Annoy nearest neighbor search. Only used if the \code{nn_method} is #' \code{"annoy"}. Sensible values are between \code{10} to \code{100}. #' @param search_k Number of nodes to search during the neighbor retrieval. The #' larger k, the more the accurate results, but the longer the search takes. #' With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor #' search. Only used if the \code{nn_method} is \code{"annoy"}. #' @param nn_args A list containing additional arguments to pass to the nearest #' neighbor method. For \code{nn_method = "annoy"}, you can specify #' \code{"n_trees"} and \code{"search_k"}, and these will override the #' \code{n_trees} and \code{search_k} parameters. #' For \code{nn_method = "hnsw"}, you may specify the following arguments: #' \itemize{ #' \item \code{M} The maximum number of neighbors to keep for each vertex. #' Reasonable values are \code{2} to \code{100}. Higher values give better #' recall at the cost of more memory. Default value is \code{16}. #' \item \code{ef_construction} A positive integer specifying the size of #' the dynamic list used during index construction. 
A higher value will
#' provide better results at the cost of a longer time to build the index.
#' Default is \code{200}.
#' \item \code{ef} A positive integer specifying the size of the dynamic
#' list used during search. This cannot be smaller than \code{n_neighbors}
#' and cannot be higher than the number of items in the index. Default is
#' \code{10}.
#' }
#' For \code{nn_method = "nndescent"}, you may specify the following
#' arguments:
#' \itemize{
#' \item \code{n_trees} The number of trees to use in a random projection
#' forest to initialize the search. A larger number will give more accurate
#' results at the cost of a longer computation time. The default of
#' \code{NULL} means that the number is chosen based on the number of
#' observations in \code{X}.
#' \item \code{max_candidates} The number of potential neighbors to explore
#' per iteration. By default, this is set to \code{n_neighbors} or \code{60},
#' whichever is smaller. A larger number will give more accurate results at
#' the cost of a longer computation time.
#' \item \code{n_iters} The number of iterations to run the search. A larger
#' number will give more accurate results at the cost of a longer computation
#' time. By default, this will be chosen based on the number of observations
#' in \code{X}. You may also need to modify the convergence criterion
#' \code{delta}.
#' \item \code{delta} The minimum relative change in the neighbor graph
#' allowed before early stopping. Should be a value between 0 and 1. The
#' smaller the value, the smaller the amount of progress allowed between
#' iterations. The default value of \code{0.001} means that at least 0.1% of
#' the neighbor graph must be updated at each iteration.
#' \item \code{init} How to initialize the nearest neighbor descent. By
#' default this is set to \code{"tree"} and uses a random projection forest.
#' If you set this to \code{"rand"}, then a random selection is used. Usually
#' this is less accurate than using RP trees, but for high-dimensional cases,
#' there may be little difference in the quality of the initialization and
#' random initialization will be a lot faster. If you set this to
#' \code{"rand"}, then the \code{n_trees} parameter is ignored.
#' \item \code{pruning_degree_multiplier} The maximum number of edges per node
#' to retain in the search graph, relative to \code{n_neighbors}. A larger
#' value will give more accurate results at the cost of a longer computation
#' time. Default is \code{1.5}. This parameter only affects neighbor search
#' when transforming new data with \code{\link{umap_transform}}.
#' \item \code{epsilon} Controls the degree of the back-tracking when
#' traversing the search graph. Setting this to \code{0.0} will do a greedy
#' search with no back-tracking. A larger value will give more accurate
#' results at the cost of a longer computation time. Default is \code{0.1}.
#' This parameter only affects neighbor search when transforming new data with
#' \code{\link{umap_transform}}.
#' \item \code{max_search_fraction} Specifies the maximum fraction of the
#' search graph to traverse. By default, this is set to \code{1.0}, so the
#' entire graph (i.e. all items in \code{X}) may be visited. You may want to
#' set this to a smaller value if you have a very large dataset (in
#' conjunction with \code{epsilon}) to avoid an inefficient exhaustive search
#' of the data in \code{X}. This parameter only affects neighbor search when
#' transforming new data with \code{\link{umap_transform}}.
#' }
#' @param approx_pow If \code{TRUE}, use an approximation to the power function
#' in the UMAP gradient, from
#' \url{https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/}.
#' Ignored if \code{dens_scale} is non-\code{NULL}.
#' @param y Optional target data for supervised dimension reduction. Can be a
#' vector, matrix or data frame. Use the \code{target_metric} parameter to
#' specify the metrics to use, using the same syntax as \code{metric}. Usually
#' either a single numeric or factor column is used, but more complex formats
#' are possible. The following types are allowed:
#' \itemize{
#' \item Factor columns with the same length as \code{X}. \code{NA} is
#' allowed for any observation with an unknown level, in which case
#' UMAP operates as a form of semi-supervised learning. Each column is
#' treated separately.
#' \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the
#' parameter \code{target_n_neighbors} to set the number of neighbors used
#' with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors,
#' numeric columns are grouped into one block unless \code{target_metric}
#' specifies otherwise. For example, if you wish columns \code{a} and
#' \code{b} to be treated separately, specify
#' \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise,
#' the data will be effectively treated as a matrix with two columns.
#' \item Nearest neighbor data, consisting of a list of two matrices,
#' \code{idx} and \code{dist}. These represent the precalculated nearest
#' neighbor indices and distances, respectively. This
#' is the same format as that expected for precalculated data in
#' \code{nn_method}. This format assumes that the underlying data was a
#' numeric vector. Any user-supplied value of the \code{target_n_neighbors}
#' parameter is ignored in this case, because the number of columns in
#' the matrices is used for the value. Multiple nearest neighbor data using
#' different metrics can be supplied by passing a list of these lists.
#' }
#' Unlike \code{X}, all factor columns included in \code{y} are automatically
#' used.
#' @param target_n_neighbors Number of nearest neighbors to use to construct the
#' target simplicial set. Default value is \code{n_neighbors}. Applies only if
#' \code{y} is non-\code{NULL} and \code{numeric}.
#' @param target_metric The metric used to measure distance for \code{y} if
#' using supervised dimension reduction. Used only if \code{y} is numeric.
#' @param target_weight Weighting factor between data topology and target
#' topology. A value of 0.0 weights entirely on data, a value of 1.0 weights
#' entirely on target. The default of 0.5 balances the weighting equally
#' between data and target. Only applies if \code{y} is non-\code{NULL}.
#' @param pca If set to a positive integer value, reduce data to this number of
#' columns using PCA. Not applied if the distance \code{metric} is
#' \code{"hamming"}, or if the dimensions of the data are not larger than the
#' number specified (i.e. both the number of rows and the number of columns
#' must be larger than the value of this parameter). If you have > 100 columns
#' in a data frame or matrix, reducing the number of columns in this way may
#' substantially increase the performance of the nearest neighbor search at
#' the cost of a potential decrease in accuracy. In many t-SNE applications, a
#' value of 50 is recommended, although there's no guarantee that this is
#' appropriate for all settings.
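#' An illustrative sketch (an assumption about typical usage, not a
#' prescription from elsewhere in this documentation):
#' \preformatted{
#' # hypothetical wide data: 1,000 rows, 200 numeric columns
#' X <- matrix(rnorm(1000 * 200), nrow = 1000)
#' # reduce X to 50 columns via PCA before the nearest neighbor search
#' emb <- umap(X, pca = 50, n_neighbors = 15)
#' }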
#' @param pca_center If \code{TRUE}, center the columns of \code{X} before
#' carrying out PCA. For binary data, it's recommended to set this to
#' \code{FALSE}.
#' @param pca_method Method to carry out any PCA dimensionality reduction when
#' the \code{pca} parameter is specified. Allowed values are:
#' \itemize{
#' \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the
#' \href{https://cran.r-project.org/package=irlba}{irlba} package.
#' \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from
#' the \href{https://cran.r-project.org/package=irlba}{irlba} package.
#' This is likely to give much faster but potentially less accurate results
#' than using \code{"irlba"}. For the purposes of nearest neighbor
#' calculation and coordinate initialization, any loss of accuracy doesn't
#' seem to matter much.
#' \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}}
#' from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr}
#' package. The SVD methods used in \code{bigstatsr} may be faster on
#' systems without access to efficient linear algebra libraries (e.g.
#' Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of
#' uwot: if you choose to use this package for PCA, you \emph{must} install
#' it yourself.
#' \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is
#' likely to be slow for all but the smallest datasets.
#' \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than
#' 50\% of the full set of singular vectors would be calculated, in which
#' case \code{"svd"} is used.
#' }
#' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill,
#' 2014) during optimization. Otherwise, use the faster (but probably less
#' statistically good) Tausworthe "taus88" generator. The default is
#' \code{TRUE}. This parameter has been superseded by \code{rng_type} -- if
#' both are set, \code{rng_type} takes precedence.
#' @param rng_type The type of random number generator to use during
#' optimization. One of:
#' \itemize{
#' \item{\code{"pcg"}}. Use the PCG random number generator (O'Neill, 2014).
#' \item{\code{"tausworthe"}}. Use the Tausworthe "taus88" generator.
#' \item{\code{"deterministic"}}. Use a deterministic number generator. This
#' isn't actually random, but may provide enough variation in the negative
#' sampling to give a good embedding and can provide a noticeable speed-up.
#' }
#' For backwards compatibility, by default this is unset and the choice of
#' \code{pcg_rand} is used (making "pcg" the effective default).
#' @param fast_sgd If \code{TRUE}, then the following combination of parameters
#' is set: \code{pcg_rand = TRUE}, \code{n_sgd_threads = "auto"} and
#' \code{approx_pow = TRUE}. The default is \code{FALSE}. Setting this to
#' \code{TRUE} will speed up the stochastic optimization phase, but will give
#' a potentially less accurate embedding, which will not be exactly
#' reproducible even with a fixed seed. For visualization, \code{fast_sgd =
#' TRUE} will give perfectly good results. For more generic dimensionality
#' reduction, it's safer to leave \code{fast_sgd = FALSE}. If \code{fast_sgd =
#' TRUE}, then user-supplied values of \code{pcg_rand}, \code{n_sgd_threads},
#' and \code{approx_pow} are ignored.
#' @param batch If \code{TRUE}, then embedding coordinates are updated at the
#' end of each epoch rather than during the epoch. In batch mode, results are
#' reproducible with a fixed random seed even with \code{n_sgd_threads > 1},
#' at the cost of a slightly higher memory use. You may also have to modify
#' \code{learning_rate} and increase \code{n_epochs}, so whether this provides
#' a speed increase over the single-threaded optimization is likely to be
#' dataset and hardware-dependent.
#' @param ret_model If \code{TRUE}, then return extra data that can be used to
#' add new data to an existing embedding via \code{\link{umap_transform}}. The
#' embedded coordinates are returned as the list item \code{embedding}. If
#' \code{FALSE}, just return the coordinates. This parameter can be used in
#' conjunction with \code{ret_nn} and \code{ret_extra}. Note that some
#' settings are incompatible with the production of a UMAP model: external
#' neighbor data (passed via a list to \code{nn_method}), and factor columns
#' that were included via the \code{metric} parameter. In the latter case, the
#' model produced is based only on the numeric data. A transformation using
#' new data is possible, but the factor columns in the new data are ignored.
#' Note that setting \code{ret_model = TRUE} forces the use of the approximate
#' nearest neighbors method. Because small datasets would otherwise use exact
#' nearest neighbor calculations, setting \code{ret_model = TRUE} means that
#' different results may be returned for small datasets in terms of both the
#' returned nearest neighbors (if requested) and the final embedded
#' coordinates, compared to \code{ret_model = FALSE}, even if the random
#' number seed is fixed. To avoid this, explicitly set
#' \code{nn_method = "annoy"} in the \code{ret_model = FALSE} case.
#' @param ret_nn If \code{TRUE}, then in addition to the embedding, also return
#' nearest neighbor data that can be used as input to \code{nn_method} to
#' avoid the overhead of repeatedly calculating the nearest neighbors when
#' manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs},
#' \code{init}). See the "Value" section for the names of the list items. If
#' \code{FALSE}, just return the coordinates. Note that the nearest neighbors
#' could be sensitive to data scaling, so be wary of reusing nearest neighbor
#' data if modifying the \code{scale} parameter. This parameter can be used in
#' conjunction with \code{ret_model} and \code{ret_extra}.
#' @param ret_extra A vector indicating what extra data to return. May contain
#' any combination of the following strings:
#' \itemize{
#' \item \code{"model"} Same as setting \code{ret_model = TRUE}.
#' \item \code{"nn"} Same as setting \code{ret_nn = TRUE}.
#' \item \code{"fgraph"} the high dimensional fuzzy graph (i.e. the fuzzy
#' simplicial set of the merged local views of the input data). The graph
#' is returned as a sparse symmetric N x N matrix of class
#' \link[Matrix]{dgCMatrix-class}, where a non-zero entry (i, j) gives the
#' membership strength of the edge connecting vertex i and vertex j. This
#' can be considered analogous to the input probability (or similarity or
#' affinity) used in t-SNE and LargeVis. Note that the graph is further
#' sparsified by removing edges with sufficiently low membership strength
#' that they would not be sampled by the probabilistic edge sampling
#' employed for optimization, and therefore the number of non-zero elements
#' in the matrix is dependent on \code{n_epochs}. If you are only
#' interested in the fuzzy input graph (e.g. for clustering), setting
#' \code{n_epochs = 0} will avoid any further sparsifying.
#' Be aware that setting \code{binary_edge_weights = TRUE} will affect this
#' graph (all non-zero edge weights will be 1).
#' \item \code{"sigma"} the normalization value for each observation in the
#' dataset when constructing the smoothed distances to each of its
#' neighbors. This gives some sense of the local density of each
#' observation in the high dimensional space: higher values of
#' \code{sigma} indicate a higher dispersion or lower density.
#' }
#' @param n_threads Number of threads to use (except during stochastic gradient
#' descent). Default is half the number of concurrent threads supported by the
#' system. For nearest neighbor search, only applies if
#' \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index
#' will be temporarily written to disk in the location determined by
#' \code{\link[base]{tempfile}}.
#' @param n_sgd_threads Number of threads to use during stochastic gradient
#' descent. If set to > 1, then be aware that if \code{batch = FALSE}, results
#' will \emph{not} be reproducible, even if \code{set.seed} is called with a
#' fixed seed before running. Set to \code{"auto"} to use the same value as
#' \code{n_threads}.
#' @param grain_size The minimum amount of work to do on each thread. If this
#' value is set high enough, then fewer than \code{n_threads} or
#' \code{n_sgd_threads} threads will be used for processing, which might give
#' a performance improvement if the overhead of thread management and context
#' switching was outweighing the improvement due to concurrent processing.
#' If left at the default (\code{1}), work will be spread evenly over all the
#' threads specified.
#' @param tmpdir Temporary directory to store nearest neighbor indexes during
#' nearest neighbor search. Default is \code{\link{tempdir}}. The index is
#' only written to disk if \code{n_threads > 1} and
#' \code{nn_method = "annoy"}; otherwise, this parameter is ignored.
#' @param verbose If \code{TRUE}, log details to the console.
#' @param opt_args A list of optimizer parameters, used when
#' \code{batch = TRUE}. The default optimization method used is Adam (Kingma
#' and Ba, 2014).
#' \itemize{
#' \item \code{method} The optimization method to use. Either \code{"adam"}
#' or \code{"sgd"} (stochastic gradient descent). Default: \code{"adam"}.
#' \item \code{beta1} (Adam only). The weighting parameter for the
#' exponential moving average of the first moment estimator. Effectively the
#' momentum parameter. Should be a floating point value between 0 and 1.
#' Higher values can smooth oscillatory updates in poorly-conditioned
#' situations and may allow for a larger \code{learning_rate} to be
#' specified, but too high can cause divergence. Default: \code{0.5}.
#' \item \code{beta2} (Adam only). The weighting parameter for the
#' exponential moving average of the uncentered second moment estimator.
#' Should be a floating point value between 0 and 1. Controls the degree of
#' adaptivity in the step-size. Higher values put more weight on previous
#' time steps. Default: \code{0.9}.
#' \item \code{eps} (Adam only). Intended to be a small value to prevent
#' division by zero, but in practice can also affect convergence due to its
#' interaction with \code{beta2}. Higher values reduce the effect of the
#' step-size adaptivity and bring the behavior closer to stochastic gradient
#' descent with momentum. Typical values are between 1e-8 and 1e-3. Default:
#' \code{1e-7}.
#' \item \code{alpha} The initial learning rate. Default: the value of the #' \code{learning_rate} parameter. #' } #' @param epoch_callback A function which will be invoked at the end of every #' epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where: #' \itemize{ #' \item \code{epoch} The current epoch number (between \code{1} and #' \code{n_epochs}). #' \item \code{n_epochs} Number of epochs to use during the optimization of #' the embedded coordinates. #' \item \code{coords} The embedded coordinates as of the end of the current #' epoch, as a matrix with dimensions (N, \code{n_components}). #' } #' @param binary_edge_weights If \code{TRUE} then edge weights in the input #' graph are treated as binary (0/1) rather than real valued. This affects the #' sampling frequency of neighbors and is the strategy used by the PaCMAP #' method (Wang and co-workers, 2020). Practical (Böhm and co-workers, 2020) #' and theoretical (Damrich and Hamprecht, 2021) work suggests this has little #' effect on UMAP's performance. #' @param dens_scale A value between 0 and 1. If > 0 then the output attempts #' to preserve relative local density around each observation. This uses an #' approximation to the densMAP method (Narayan and co-workers, 2021). The #' larger the value of \code{dens_scale}, the greater the range of output #' densities that will be used to map the input densities. This option is #' ignored if using multiple \code{metric} blocks. #' @param seed Integer seed to use to initialize the random number generator #' state. Combined with \code{n_sgd_threads = 1} or \code{batch = TRUE}, this #' should give consistent output across multiple runs on a given installation. #' Setting this value is equivalent to calling \code{\link[base]{set.seed}}, #' but it may be more convenient in some situations than having to call a #' separate function. The default is to not set a seed. If #' \code{ret_model = TRUE}, the seed will be stored in the output model and #' then used to set the seed inside \code{\link{umap_transform}}. #' @return A matrix of optimized coordinates, or: #' \itemize{ #' \item if \code{ret_model = TRUE} (or \code{ret_extra} contains #' \code{"model"}), returns a list containing extra information that can be #' used to add new data to an existing embedding via #' \code{\link{umap_transform}}. In this case, the coordinates are available #' in the list item \code{embedding}. \bold{NOTE}: The contents of #' the \code{model} list should \emph{not} be considered stable or part of #' the public API, and are purposely left undocumented. #' \item if \code{ret_nn = TRUE} (or \code{ret_extra} contains \code{"nn"}), #' returns the nearest neighbor data as a list called \code{nn}. This #' contains one list for each \code{metric} calculated, itself containing a #' matrix \code{idx} with the integer ids of the neighbors; and a matrix #' \code{dist} with the distances. The \code{nn} list (or a sub-list) can be #' used as input to the \code{nn_method} parameter. #' \item if \code{ret_extra} contains \code{"fgraph"}, returns the high #' dimensional fuzzy graph as a sparse matrix called \code{fgraph}, of type #' \link[Matrix]{dgCMatrix-class}. #' \item if \code{ret_extra} contains \code{"sigma"}, returns a vector of the #' smooth knn distance normalization terms for each observation as #' \code{"sigma"} and a vector \code{"rho"} containing the largest #' distance to the locally connected neighbors of each observation. 
#' \item if \code{ret_extra} contains \code{"localr"}, returns a vector of #' the estimated local radii, the sum of \code{"sigma"} and \code{"rho"}. #' } #' The returned list contains the combined data from any combination of #' specifying \code{ret_model}, \code{ret_nn} and \code{ret_extra}. #' @examples #' #' iris30 <- iris[c(1:10, 51:60, 101:110), ] #' #' # Non-numeric columns are automatically removed so you can pass data frames #' # directly in a lot of cases without pre-processing #' iris_umap <- umap(iris30, n_neighbors = 5, learning_rate = 0.5, init = "random", n_epochs = 20) #' #' # Faster approximation to the gradient and return nearest neighbors #' iris_umap <- umap(iris30, n_neighbors = 5, approx_pow = TRUE, ret_nn = TRUE, n_epochs = 20) #' #' # Can specify min_dist and spread parameters to control separation and size #' # of clusters and reuse nearest neighbors for efficiency #' nn <- iris_umap$nn #' iris_umap <- umap(iris30, n_neighbors = 5, min_dist = 1, spread = 5, nn_method = nn, n_epochs = 20) #' #' # Supervised dimension reduction using the 'Species' factor column #' iris_sumap <- umap(iris30, #' n_neighbors = 5, min_dist = 0.001, y = iris30$Species, #' target_weight = 0.5, n_epochs = 20 #' ) #' #' # Calculate Petal and Sepal neighbors separately (uses intersection of the resulting sets): #' iris_umap <- umap(iris30, metric = list( #' "euclidean" = c("Sepal.Length", "Sepal.Width"), #' "euclidean" = c("Petal.Length", "Petal.Width") #' )) #' #' @references #' Belkin, M., & Niyogi, P. (2002). #' Laplacian eigenmaps and spectral techniques for embedding and clustering. #' In \emph{Advances in neural information processing systems} #' (pp. 585-591). #' \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf} #' #' Böhm, J. N., Berens, P., & Kobak, D. (2020). #' A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum. #' \emph{arXiv preprint} \emph{arXiv:2007.08902}. #' \url{https://arxiv.org/abs/2007.08902} #' #' Damrich, S., & Hamprecht, F. A. (2021). #' On UMAP's true loss function. #' \emph{Advances in Neural Information Processing Systems}, \emph{34}. #' \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html} #' #' Dong, W., Moses, C., & Li, K. (2011, March). #' Efficient k-nearest neighbor graph construction for generic similarity measures. #' In \emph{Proceedings of the 20th international conference on World Wide Web} #' (pp. 577-586). #' ACM. #' \doi{10.1145/1963405.1963487}. #' #' Kingma, D. P., & Ba, J. (2014). #' Adam: A method for stochastic optimization. #' \emph{arXiv preprint} \emph{arXiv}:1412.6980. #' \url{https://arxiv.org/abs/1412.6980} #' #' Malkov, Y. A., & Yashunin, D. A. (2018). #' Efficient and robust approximate nearest neighbor search using hierarchical #' navigable small world graphs. #' \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836. #' #' McInnes, L., Healy, J., & Melville, J. (2018). #' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction #' \emph{arXiv preprint} \emph{arXiv}:1802.03426. #' \url{https://arxiv.org/abs/1802.03426} #' #' Narayan, A., Berger, B., & Cho, H. (2021). #' Assessing single-cell transcriptomic variability through density-preserving data visualization. #' \emph{Nature biotechnology}, \emph{39}(6), 765-774. #' \doi{10.1038/s41587-020-00801-7} #' #' O'Neill, M. E. (2014). 
#' \emph{PCG: A family of simple fast space-efficient statistically good
#' algorithms for random number generation}
#' (Report No. HMC-CS-2014-0905). Harvey Mudd College.
#'
#' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April).
#' Visualizing large-scale and high-dimensional data.
#' In \emph{Proceedings of the 25th International Conference on World Wide Web}
#' (pp. 287-297).
#' International World Wide Web Conferences Steering Committee.
#' \url{https://arxiv.org/abs/1602.00370}
#'
#' Van der Maaten, L., & Hinton, G. (2008).
#' Visualizing data using t-SNE.
#' \emph{Journal of Machine Learning Research}, \emph{9} (2579-2605).
#' \url{https://www.jmlr.org/papers/v9/vandermaaten08a.html}
#'
#' Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021).
#' Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization.
#' \emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73.
#' \url{https://www.jmlr.org/papers/v22/20-1061.html}
#'
#' @export
umap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
                 n_epochs = NULL, learning_rate = 1, scale = FALSE,
                 init = "spectral", init_sdev = NULL, spread = 1,
                 min_dist = 0.01, set_op_mix_ratio = 1.0,
                 local_connectivity = 1.0, bandwidth = 1.0,
                 repulsion_strength = 1.0, negative_sample_rate = 5.0,
                 a = NULL, b = NULL, nn_method = NULL, n_trees = 50,
                 search_k = 2 * n_neighbors * n_trees, approx_pow = FALSE,
                 y = NULL, target_n_neighbors = n_neighbors,
                 target_metric = "euclidean", target_weight = 0.5,
                 pca = NULL, pca_center = TRUE, pcg_rand = TRUE,
                 fast_sgd = FALSE, ret_model = FALSE, ret_nn = FALSE,
                 ret_extra = c(), n_threads = NULL, n_sgd_threads = 0,
                 grain_size = 1, tmpdir = tempdir(),
                 verbose = getOption("verbose", TRUE), batch = FALSE,
                 opt_args = NULL, epoch_callback = NULL, pca_method = NULL,
                 binary_edge_weights = FALSE, dens_scale = NULL, seed = NULL,
                 nn_args = list(), rng_type = NULL) {
  uwot(
    X = X, n_neighbors = n_neighbors, n_components = n_components,
    metric = metric, n_epochs = n_epochs, alpha = learning_rate,
    scale = scale, init = init, init_sdev = init_sdev, spread = spread,
    min_dist = min_dist, set_op_mix_ratio = set_op_mix_ratio,
    local_connectivity = local_connectivity, bandwidth = bandwidth,
    gamma = repulsion_strength, negative_sample_rate = negative_sample_rate,
    a = a, b = b, nn_method = nn_method, n_trees = n_trees,
    search_k = search_k, method = "umap", approx_pow = approx_pow,
    n_threads = n_threads, n_sgd_threads = n_sgd_threads,
    grain_size = grain_size, y = y,
    target_n_neighbors = target_n_neighbors, target_weight = target_weight,
    target_metric = target_metric, pca = pca, pca_center = pca_center,
    pca_method = pca_method, pcg_rand = pcg_rand, fast_sgd = fast_sgd,
    ret_model = ret_model || "model" %in% ret_extra,
    ret_nn = ret_nn || "nn" %in% ret_extra,
    ret_fgraph = "fgraph" %in% ret_extra,
    ret_sigma = "sigma" %in% ret_extra,
    ret_localr = "localr" %in% ret_extra,
    batch = batch, opt_args = opt_args, epoch_callback = epoch_callback,
    binary_edge_weights = binary_edge_weights,
    tmpdir = tmpdir, verbose = verbose, dens_scale = dens_scale,
    seed = seed, nn_args = nn_args, rng_type = rng_type
  )
}

#' Dimensionality Reduction Using t-Distributed UMAP (t-UMAP)
#'
#' A faster (but less flexible) version of the UMAP (McInnes et al, 2018)
#' gradient. For more detail on UMAP, see the \code{\link{umap}} function.
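#'
#' The sketch below illustrates the relationship explained in the following
#' paragraph; treating the two calls as near-equivalent is an assumption, as
#' optimization is stochastic:
#' \preformatted{
#' # t-UMAP is a cheaper code path for the a = b = 1 special case:
#' emb_t <- tumap(iris, n_neighbors = 15)
#' emb_u <- umap(iris, n_neighbors = 15, a = 1, b = 1) # similar, but slower
#' }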
#'
#' By setting the UMAP curve parameters \code{a} and \code{b} to \code{1}, you
#' get back the Cauchy distribution as used in t-SNE (van der Maaten and Hinton,
#' 2008) and LargeVis (Tang et al., 2016). It also results in a substantially
#' simplified gradient expression. This can give a speed improvement of around
#' 50\%.
#'
#' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}},
#' \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}.
#' Matrices and data frames should contain one observation per row. Data frames
#' will have any non-numeric columns removed, although factor columns will be
#' used if explicitly included via \code{metric} (see the help for
#' \code{metric} for details). A sparse matrix is interpreted as a distance
#' matrix, and is assumed to be symmetric, so you can also pass in an
#' explicitly upper or lower triangular sparse matrix to save storage. There
#' must be at least \code{n_neighbors} non-zero distances for each row. Both
#' implicit and explicit zero entries are ignored. Set zero distances you want
#' to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}).
#' \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is
#' passed to \code{nn_method}, and \code{init} is not \code{"spca"} or
#' \code{"pca"}.
#' @param n_neighbors The size of local neighborhood (in terms of number of
#' neighboring sample points) used for manifold approximation. Larger values
#' result in more global views of the manifold, while smaller values result in
#' more local data being preserved. In general values should be in the range
#' \code{2} to \code{100}.
#' @param n_components The dimension of the space to embed into. This defaults
#' to \code{2} to provide easy visualization, but can reasonably be set to any
#' integer value in the range \code{2} to \code{100}.
#' @param metric Type of distance metric to use to find nearest neighbors. For
#' \code{nn_method = "annoy"} this can be one of:
#' \itemize{
#' \item \code{"euclidean"} (the default)
#' \item \code{"cosine"}
#' \item \code{"manhattan"}
#' \item \code{"hamming"}
#' \item \code{"correlation"} (a distance based on the Pearson correlation)
#' \item \code{"categorical"} (see below)
#' }
#' For \code{nn_method = "hnsw"} this can be one of:
#' \itemize{
#' \item \code{"euclidean"}
#' \item \code{"cosine"}
#' \item \code{"correlation"}
#' }
#' If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is
#' installed and \code{nn_method = "nndescent"} is specified then many more
#' metrics are available, including:
#' \itemize{
#' \item \code{"braycurtis"}
#' \item \code{"canberra"}
#' \item \code{"chebyshev"}
#' \item \code{"dice"}
#' \item \code{"hamming"}
#' \item \code{"hellinger"}
#' \item \code{"jaccard"}
#' \item \code{"jensenshannon"}
#' \item \code{"kulsinski"}
#' \item \code{"rogerstanimoto"}
#' \item \code{"russellrao"}
#' \item \code{"sokalmichener"}
#' \item \code{"sokalsneath"}
#' \item \code{"spearmanr"}
#' \item \code{"symmetrickl"}
#' \item \code{"tsss"}
#' \item \code{"yule"}
#' }
#' For more details see the package documentation of \code{rnndescent}.
#' For \code{nn_method = "fnn"}, the distance metric is always "euclidean".
#'
#' If \code{X} is a data frame or matrix, then multiple metrics can be
#' specified, by passing a list to this argument, where the name of each item in
#' the list is one of the metric names above. The value of each list item should
#' be a vector giving the names or integer ids of the columns to be included in
#' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}.
#'
#' Each metric calculation results in a separate fuzzy simplicial set, which are
#' intersected together to produce the final set. Metric names can be repeated.
#' Because non-numeric columns are removed from the data frame, it is safer to
#' use column names than integer ids.
#'
#' Factor columns can also be used by specifying the metric name
#' \code{"categorical"}. Factor columns are treated differently from numeric
#' columns and although multiple factor columns can be specified in a vector,
#' each factor column specified is processed individually. If you specify
#' a non-factor column, it will be coerced to a factor.
#'
#' For a given data block, you may override the \code{pca} and \code{pca_center}
#' arguments for that block, by providing a list with one unnamed item
#' containing the column names or ids, and then any of the \code{pca} or
#' \code{pca_center} overrides as named items, e.g. \code{metric =
#' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))} (a sketch
#' appears under \code{pca} below). This
#' exists to allow mixed binary and real-valued data to be included and to have
#' PCA applied to both, but with centering applied only to the real-valued data
#' (it is typical not to apply centering to binary data before PCA is applied).
#' @param n_epochs Number of epochs to use during the optimization of the
#' embedded coordinates. By default, this value is set to \code{500} for
#' datasets containing 10,000 vertices or fewer, and \code{200} otherwise.
#' If \code{n_epochs = 0}, then coordinates determined by \code{"init"} will
#' be returned.
#' @param learning_rate Initial learning rate used in optimization of the
#' coordinates.
#' @param scale Scaling to apply to \code{X} if it is a data frame or matrix:
#' \itemize{
#' \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling.
#' \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to
#' zero mean and variance 1.
#' \item{\code{"maxabs"}} Center each column to mean 0, then divide each
#' element by the maximum absolute value over the entire matrix.
#' \item{\code{"range"}} Range scale the entire matrix, so the smallest
#' element is 0 and the largest is 1.
#' \item{\code{"colrange"}} Scale each column in the range (0,1).
#' }
#' For t-UMAP, the default is \code{"none"}.
#' @param init Type of initialization for the coordinates. Options are:
#' \itemize{
#' \item \code{"spectral"} Spectral embedding using the normalized Laplacian
#' of the fuzzy 1-skeleton, with Gaussian noise added.
#' \item \code{"normlaplacian"}. Spectral embedding using the normalized
#' Laplacian of the fuzzy 1-skeleton, without noise.
#' \item \code{"random"}. Coordinates assigned using a uniform random
#' distribution between -10 and 10.
#' \item \code{"lvrandom"}. Coordinates assigned using a Gaussian
#' distribution with standard deviation 1e-4, as used in LargeVis
#' (Tang et al., 2016) and t-SNE.
#' \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap
#' (Belkin and Niyogi, 2002).
#' \item \code{"pca"}. The first two principal components from PCA of
#' \code{X} if \code{X} is a data frame, and from a 2-dimensional classical
#' MDS if \code{X} is of class \code{"dist"}.
#' \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled
#' so the standard deviation is 1e-4, to give a distribution similar to that
#' used in t-SNE. This is an alias for \code{init = "pca", init_sdev =
#' 1e-4}.
#' \item \code{"agspectral"} An "approximate global" modification of
#' \code{"spectral"} which sets all edges in the graph to a value of 1, and
#' then sets a random number of edges (\code{negative_sample_rate} edges per
#' vertex) to 0.1, to approximate the effect of non-local affinities.
#' \item A matrix of initial coordinates.
#' }
#' For spectral initializations (\code{"spectral"}, \code{"normlaplacian"},
#' \code{"laplacian"}, \code{"agspectral"}), if more than one connected
#' component is identified, no spectral initialization is attempted. Instead,
#' a PCA-based initialization is attempted. If \code{verbose = TRUE} the
#' number of connected components is logged to the console. The existence of
#' multiple connected components implies that a global view of the data cannot
#' be attained with this initialization. Increasing the value of
#' \code{n_neighbors} may help.
#' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized
#' coordinates (including any user-supplied matrix) to this standard
#' deviation. By default no scaling is carried out, except when \code{init =
#' "spca"}, in which case the value is \code{0.0001}. Scaling the input may
#' help if the unscaled versions result in initial coordinates with large
#' inter-point distances or outliers. This usually results in small gradients
#' during optimization and very little progress being made to the layout.
#' Shrinking the initial embedding by rescaling can help under these
#' circumstances. Scaling the result of \code{init = "pca"} is usually
#' recommended, which is why \code{init = "spca"} exists as an alias for
#' \code{init = "pca", init_sdev = 1e-4}, but for the spectral initializations
#' the scaled versions usually aren't necessary unless you are using a large
#' value of \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). For
#' compatibility with recent versions of the Python UMAP package, if you are
#' using \code{init = "spectral"}, then you should also set
#' \code{init_sdev = "range"}, which will range scale each of the columns
#' containing the initial data between 0-10. This is not set by default to
#' maintain backwards compatibility with previous versions of uwot.
#' @param set_op_mix_ratio Interpolate between (fuzzy) union and intersection as
#' the set operation used to combine local fuzzy simplicial sets to obtain a
#' global fuzzy simplicial set. Both fuzzy set operations use the product
#' t-norm. The value of this parameter should be between \code{0.0} and
#' \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while
#' \code{0.0} will use a pure fuzzy intersection.
#' @param local_connectivity The local connectivity required -- i.e. the number
#' of nearest neighbors that should be assumed to be connected at a local
#' level. The higher this value the more connected the manifold becomes
#' locally. In practice this should be not more than the local intrinsic
#' dimension of the manifold.
#' @param bandwidth The effective bandwidth of the kernel if we view the
#' algorithm as similar to Laplacian Eigenmaps. Larger values induce more
#' connectivity and a more global view of the data, smaller values concentrate
#' more locally.
#' @param repulsion_strength Weighting applied to negative samples in low
#' dimensional embedding optimization. Values higher than one will result in
#' greater weight being given to negative samples.
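#'
#' For example (a sketch; the value \code{2} is illustrative, not a
#' recommendation):
#' \preformatted{
#' # Weight negative samples twice as heavily as the default:
#' emb <- tumap(iris, repulsion_strength = 2)
#' }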
#' @param negative_sample_rate The number of negative edge/1-simplex samples to
#' use per positive edge/1-simplex sample in optimizing the low dimensional
#' embedding.
#' @param nn_method Method for finding nearest neighbors. Options are:
#' \itemize{
#' \item \code{"fnn"}. Use exact nearest neighbors via the
#' \href{https://cran.r-project.org/package=FNN}{FNN} package.
#' \item \code{"annoy"} Use approximate nearest neighbors via the
#' \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package.
#' \item \code{"hnsw"} Use approximate nearest neighbors with the
#' Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin,
#' 2018) via the
#' \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package.
#' \code{RcppHNSW} is not a dependency of this package: this option is
#' only available if you have installed \code{RcppHNSW} yourself. Also,
#' HNSW only supports the following arguments for \code{metric} and
#' \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and
#' \code{"correlation"}.
#' \item \code{"nndescent"} Use approximate nearest neighbors with the
#' Nearest Neighbor Descent method (Dong et al., 2011) via the
#' \href{https://cran.r-project.org/package=rnndescent}{rnndescent}
#' package. \code{rnndescent} is not a dependency of this package: this
#' option is only available if you have installed \code{rnndescent}
#' yourself.
#' }
#' By default, if \code{X} has fewer than 4,096 vertices, the exact nearest
#' neighbors are found. Otherwise, approximate nearest neighbors are used.
#' You may also pass pre-calculated nearest neighbor data to this argument. It
#' must be one of two formats, either a list consisting of two elements:
#' \itemize{
#' \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix
#' containing the integer indexes of the nearest neighbors in \code{X}. Each
#' vertex is considered to be its own nearest neighbor, i.e.
#' \code{idx[, 1] == 1:n_vertices}.
#' \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix
#' containing the distances of the nearest neighbors.
#' }
#' or a sparse distance matrix of type \code{dgCMatrix}, with dimensions
#' \code{n_vertices x n_vertices}. Distances should be arranged by column,
#' i.e. a non-zero entry in row \code{j} of the \code{i}th column indicates
#' that the \code{j}th observation in \code{X} is a nearest neighbor of the
#' \code{i}th observation with the distance given by the value of that
#' element.
#' The \code{n_neighbors} parameter is ignored when using precomputed
#' nearest neighbor data. If using the sparse distance matrix input, each
#' column can contain a different number of neighbors.
#' @param n_trees Number of trees to build when constructing the nearest
#' neighbor index. The more trees specified, the larger the index, but the
#' better the results. With \code{search_k}, determines the accuracy of the
#' Annoy nearest neighbor search. Only used if the \code{nn_method} is
#' \code{"annoy"}. Sensible values are between \code{10} and \code{100}.
#' @param search_k Number of nodes to search during the neighbor retrieval. The
#' larger \code{search_k}, the more accurate the results, but the longer the
#' search takes. With \code{n_trees}, determines the accuracy of the Annoy
#' nearest neighbor search. Only used if the \code{nn_method} is
#' \code{"annoy"}.
#' @param nn_args A list containing additional arguments to pass to the nearest
#' neighbor method (a brief sketch of its use follows these lists).
#' For \code{nn_method = "annoy"}, you can specify
#' \code{"n_trees"} and \code{"search_k"}, and these will override the
#' \code{n_trees} and \code{search_k} parameters.
#' For \code{nn_method = "hnsw"}, you may specify the following arguments:
#' \itemize{
#' \item \code{M} The maximum number of neighbors to keep for each vertex.
#' Reasonable values are \code{2} to \code{100}. Higher values give better
#' recall at the cost of more memory. Default value is \code{16}.
#' \item \code{ef_construction} A positive integer specifying the size of
#' the dynamic list used during index construction. A higher value will
#' provide better results at the cost of a longer time to build the index.
#' Default is \code{200}.
#' \item \code{ef} A positive integer specifying the size of the dynamic
#' list used during search. This cannot be smaller than \code{n_neighbors}
#' and cannot be higher than the number of items in the index. Default is
#' \code{10}.
#' }
#' For \code{nn_method = "nndescent"}, you may specify the following
#' arguments:
#' \itemize{
#' \item \code{n_trees} The number of trees to use in a random projection
#' forest to initialize the search. A larger number will give more accurate
#' results at the cost of a longer computation time. The default of
#' \code{NULL} means that the number is chosen based on the number of
#' observations in \code{X}.
#' \item \code{max_candidates} The number of potential neighbors to explore
#' per iteration. By default, this is set to \code{n_neighbors} or \code{60},
#' whichever is smaller. A larger number will give more accurate results at
#' the cost of a longer computation time.
#' \item \code{n_iters} The number of iterations to run the search. A larger
#' number will give more accurate results at the cost of a longer computation
#' time. By default, this will be chosen based on the number of observations
#' in \code{X}. You may also need to modify the convergence criterion
#' \code{delta}.
#' \item \code{delta} The minimum relative change in the neighbor graph
#' allowed before early stopping. Should be a value between 0 and 1. The
#' smaller the value, the smaller the amount of progress between iterations
#' that is required to continue the search. The default value of \code{0.001}
#' means that at least 0.1\% of the neighbor graph must be updated at each
#' iteration.
#' \item \code{init} How to initialize the nearest neighbor descent. By
#' default this is set to \code{"tree"} and uses a random projection forest.
#' If you set this to \code{"rand"}, then a random selection is used. Usually
#' this is less accurate than using RP trees, but for high-dimensional cases,
#' there may be little difference in the quality of the initialization and
#' random initialization will be a lot faster. If you set this to
#' \code{"rand"}, then the \code{n_trees} parameter is ignored.
#' \item \code{pruning_degree_multiplier} The maximum number of edges per node
#' to retain in the search graph, relative to \code{n_neighbors}. A larger
#' value will give more accurate results at the cost of a longer computation
#' time. Default is \code{1.5}. This parameter only affects neighbor search
#' when transforming new data with \code{\link{umap_transform}}.
#' \item \code{epsilon} Controls the degree of the back-tracking when
#' traversing the search graph. Setting this to \code{0.0} will do a greedy
#' search with no back-tracking. A larger value will give more accurate
#' results at the cost of a longer computation time. Default is \code{0.1}.
#' This parameter only affects neighbor search when transforming new data with
#' \code{\link{umap_transform}}.
#' \item \code{max_search_fraction} Specifies the maximum fraction of the
#' search graph to traverse. By default, this is set to \code{1.0}, so the
#' entire graph (i.e. all items in \code{X}) may be visited. You may want to
#' set this to a smaller value if you have a very large dataset (in
#' conjunction with \code{epsilon}) to avoid an inefficient exhaustive search
#' of the data in \code{X}. This parameter only affects neighbor search when
#' transforming new data with \code{\link{umap_transform}}.
#' }
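#'
#' As a sketch of passing these overrides (the values are illustrative only):
#' \preformatted{
#' # Override the top-level n_trees and search_k via nn_args for Annoy:
#' emb <- tumap(iris, nn_method = "annoy",
#'              nn_args = list(n_trees = 25, search_k = 1000))
#' }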
#' @param y Optional target data for supervised dimension reduction. Can be a
#' vector, matrix or data frame. Use the \code{target_metric} parameter to
#' specify the metrics to use, using the same syntax as \code{metric}. Usually
#' either a single numeric or factor column is used, but more complex formats
#' are possible. The following types are allowed:
#' \itemize{
#' \item Factor columns with the same length as \code{X}. \code{NA} is
#' allowed for any observation with an unknown level, in which case
#' UMAP operates as a form of semi-supervised learning. Each column is
#' treated separately.
#' \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the
#' parameter \code{target_n_neighbors} to set the number of neighbors used
#' with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors,
#' numeric columns are grouped into one block unless \code{target_metric}
#' specifies otherwise. For example, if you wish columns \code{a} and
#' \code{b} to be treated separately, specify
#' \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise,
#' the data will be effectively treated as a matrix with two columns.
#' \item Nearest neighbor data, consisting of a list of two matrices,
#' \code{idx} and \code{dist}. These represent the precalculated nearest
#' neighbor indices and distances, respectively. This
#' is the same format as that expected for precalculated data in
#' \code{nn_method}. This format assumes that the underlying data was a
#' numeric vector. Any user-supplied value of the \code{target_n_neighbors}
#' parameter is ignored in this case, because the number of columns in
#' the matrices is used for the value. Multiple nearest neighbor data using
#' different metrics can be supplied by passing a list of these lists.
#' }
#' Unlike \code{X}, all factor columns included in \code{y} are automatically
#' used.
#' @param target_n_neighbors Number of nearest neighbors to use to construct the
#' target simplicial set. Default value is \code{n_neighbors}. Applies only if
#' \code{y} is non-\code{NULL} and \code{numeric}.
#' @param target_metric The metric used to measure distance for \code{y} if
#' using supervised dimension reduction. Used only if \code{y} is numeric.
#' @param target_weight Weighting factor between data topology and target
#' topology. A value of 0.0 weights entirely on data, a value of 1.0 weights
#' entirely on target. The default of 0.5 balances the weighting equally
#' between data and target. Only applies if \code{y} is non-\code{NULL}.
#' @param pca If set to a positive integer value, reduce data to this number of
#' columns using PCA. Not applied if the distance \code{metric} is
#' \code{"hamming"}, or if the dimensions of the data are not larger than the
#' number specified (i.e. both the number of rows and of columns must be
#' larger than the value of this parameter). If you have > 100 columns in a
#' data frame or matrix, reducing the number of columns in this way may
#' substantially increase the performance of the nearest neighbor search at
#' the cost of a potential decrease in accuracy. In many t-SNE applications, a
#' value of 50 is recommended, although there's no guarantee that this is
#' appropriate for all settings.
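#'
#' A sketch of the per-block override described under \code{metric} (the
#' matrix \code{X} and its split into real-valued columns \code{1:100} and
#' binary columns \code{101:200} are assumed for illustration):
#' \preformatted{
#' # Apply PCA to both blocks but skip centering for the binary block:
#' emb <- tumap(X, pca = 20, metric = list(
#'   euclidean = 1:100,
#'   manhattan = list(101:200, pca_center = FALSE)
#' ))
#' }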
#' @param pca_center If \code{TRUE}, center the columns of \code{X} before
#' carrying out PCA. For binary data, it's recommended to set this to
#' \code{FALSE}.
#' @param pca_method Method to carry out any PCA dimensionality reduction when
#' the \code{pca} parameter is specified. Allowed values are:
#' \itemize{
#' \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the
#' \href{https://cran.r-project.org/package=irlba}{irlba} package.
#' \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from
#' the \href{https://cran.r-project.org/package=irlba}{irlba} package.
#' This is likely to give much faster but potentially less accurate results
#' than using \code{"irlba"}. For the purposes of nearest neighbor
#' calculation and coordinate initialization, any loss of accuracy doesn't
#' seem to matter much.
#' \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}}
#' from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr}
#' package. The SVD methods used in \code{bigstatsr} may be faster on
#' systems without access to efficient linear algebra libraries (e.g.
#' Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of
#' uwot: if you choose to use this package for PCA, you \emph{must} install
#' it yourself.
#' \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is
#' likely to be slow for all but the smallest datasets.
#' \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than
#' 50\% of the full set of singular vectors would be calculated, in which
#' case \code{"svd"} is used.
#' }
#' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill,
#' 2014) during optimization. Otherwise, use the faster (but probably less
#' statistically good) Tausworthe "taus88" generator. The default is
#' \code{TRUE}. This parameter has been superseded by \code{rng_type} -- if
#' both are set, \code{rng_type} takes precedence.
#' @param rng_type The type of random number generator to use during
#' optimization. One of:
#' \itemize{
#' \item{\code{"pcg"}}. Use the PCG random number generator (O'Neill, 2014).
#' \item{\code{"tausworthe"}}. Use the Tausworthe "taus88" generator.
#' \item{\code{"deterministic"}}. Use a deterministic number generator. This
#' isn't actually random, but may provide enough variation in the negative
#' sampling to give a good embedding and can provide a noticeable speed-up.
#' }
#' For backwards compatibility, by default this is unset and the choice of
#' \code{pcg_rand} is used (making "pcg" the effective default).
#' @param fast_sgd If \code{TRUE}, then the following combination of parameters
#' is set: \code{pcg_rand = TRUE} and \code{n_sgd_threads = "auto"}. The
#' default is \code{FALSE}. Setting this to \code{TRUE} will speed up the
#' stochastic optimization phase, but will give a potentially less accurate
#' embedding, which will not be exactly reproducible even with a fixed
#' seed. For visualization, \code{fast_sgd = TRUE} will give perfectly good
#' results. For more generic dimensionality reduction, it's safer to leave
#' \code{fast_sgd = FALSE}. If \code{fast_sgd = TRUE}, then user-supplied
#' values of \code{pcg_rand} and \code{n_sgd_threads} are ignored.
#' @param batch If \code{TRUE}, then embedding coordinates are updated at the
#' end of each epoch rather than during the epoch. In batch mode, results are
#' reproducible with a fixed random seed even with \code{n_sgd_threads > 1},
#' at the cost of a slightly higher memory use. You may also have to modify
#' \code{learning_rate} and increase \code{n_epochs}, so whether this provides
#' a speed increase over the single-threaded optimization is likely to be
#' dataset and hardware-dependent.
#' @param ret_model If \code{TRUE}, then return extra data that can be used to
#' add new data to an existing embedding via \code{\link{umap_transform}}. The
#' embedded coordinates are returned as the list item \code{embedding}. If
#' \code{FALSE}, just return the coordinates. This parameter can be used in
#' conjunction with \code{ret_nn} and \code{ret_extra}. Note that some
#' settings are incompatible with the production of a UMAP model: external
#' neighbor data (passed via a list to \code{nn_method}), and factor columns
#' that were included via the \code{metric} parameter. In the latter case, the
#' model produced is based only on the numeric data. A transformation using
#' new data is possible, but the factor columns in the new data are ignored.
#' Note that setting \code{ret_model = TRUE} forces the use of the approximate
#' nearest neighbors method. Because small datasets would otherwise use exact
#' nearest neighbor calculations, setting \code{ret_model = TRUE} means that
#' different results may be returned for small datasets in terms of both the
#' returned nearest neighbors (if requested) and the final embedded
#' coordinates, compared to \code{ret_model = FALSE}, even if the random
#' number seed is fixed. To avoid this, explicitly set
#' \code{nn_method = "annoy"} in the \code{ret_model = FALSE} case.
#' @param ret_nn If \code{TRUE}, then in addition to the embedding, also return
#' nearest neighbor data that can be used as input to \code{nn_method} to
#' avoid the overhead of repeatedly calculating the nearest neighbors when
#' manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs},
#' \code{init}). See the "Value" section for the names of the list items. If
#' \code{FALSE}, just return the coordinates. Note that the nearest neighbors
#' could be sensitive to data scaling, so be wary of reusing nearest neighbor
#' data if modifying the \code{scale} parameter. This parameter can be used in
#' conjunction with \code{ret_model} and \code{ret_extra}.
#' @param ret_extra A vector indicating what extra data to return. May contain
#' any combination of the following strings:
#' \itemize{
#' \item \code{"model"} Same as setting \code{ret_model = TRUE}.
#' \item \code{"nn"} Same as setting \code{ret_nn = TRUE}.
#' \item \code{"fgraph"} the high dimensional fuzzy graph (i.e. the fuzzy
#' simplicial set of the merged local views of the input data). The graph
#' is returned as a sparse symmetric N x N matrix of class
#' \link[Matrix]{dgCMatrix-class}, where a non-zero entry (i, j) gives the
#' membership strength of the edge connecting vertex i and vertex j. This
#' can be considered analogous to the input probability (or similarity or
#' affinity) used in t-SNE and LargeVis. Note that the graph is further
#' sparsified by removing edges with sufficiently low membership strength
#' that they would not be sampled by the probabilistic edge sampling
#' employed for optimization, and therefore the number of non-zero elements
#' in the matrix is dependent on \code{n_epochs}. If you are only
#' interested in the fuzzy input graph (e.g. for clustering), setting
#' \code{n_epochs = 0} will avoid any further sparsifying. Be aware that
#' setting \code{binary_edge_weights = TRUE} will affect this graph (all
#' non-zero edge weights will be 1).
#' \item \code{"sigma"} the normalization value for each observation in the
#' dataset when constructing the smoothed distances to each of its
#' neighbors. This gives some sense of the local density of each
#' observation in the high dimensional space: higher values of
#' \code{sigma} indicate a higher dispersion or lower density.
#' }
#' @param n_threads Number of threads to use (except during stochastic gradient
#' descent). Default is half the number of concurrent threads supported by the
#' system. For nearest neighbor search, only applies if
#' \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index
#' will be temporarily written to disk in the location determined by
#' \code{\link[base]{tempfile}}.
#' @param n_sgd_threads Number of threads to use during stochastic gradient
#' descent. If set to > 1, then be aware that if \code{batch = FALSE}, results
#' will \emph{not} be reproducible, even if \code{set.seed} is called with a
#' fixed seed before running. Set to \code{"auto"} to use the same value as
#' \code{n_threads}.
#' @param grain_size The minimum amount of work to do on each thread. If this
#' value is set high enough, then fewer than \code{n_threads} or
#' \code{n_sgd_threads} threads will be used for processing, which might give
#' a performance improvement if the overhead of thread management and context
#' switching was outweighing the improvement due to concurrent processing.
#' If left at the default (\code{1}), work will be spread evenly over all the
#' threads specified.
#' @param tmpdir Temporary directory to store nearest neighbor indexes during
#' nearest neighbor search. Default is \code{\link{tempdir}}. The index is
#' only written to disk if \code{n_threads > 1} and
#' \code{nn_method = "annoy"}; otherwise, this parameter is ignored.
#' @param verbose If \code{TRUE}, log details to the console.
#' @param opt_args A list of optimizer parameters, used when
#' \code{batch = TRUE}. The default optimization method used is Adam (Kingma
#' and Ba, 2014).
#' \itemize{
#' \item \code{method} The optimization method to use. Either \code{"adam"}
#' or \code{"sgd"} (stochastic gradient descent). Default: \code{"adam"}.
#' \item \code{beta1} (Adam only). The weighting parameter for the
#' exponential moving average of the first moment estimator. Effectively the
#' momentum parameter. Should be a floating point value between 0 and 1.
#' Higher values can smooth oscillatory updates in poorly-conditioned
#' situations and may allow for a larger \code{learning_rate} to be
#' specified, but too high can cause divergence. Default: \code{0.5}.
#' \item \code{beta2} (Adam only). The weighting parameter for the
#' exponential moving average of the uncentered second moment estimator.
#' Should be a floating point value between 0 and 1. Controls the degree of
#' adaptivity in the step-size. Higher values put more weight on previous
#' time steps. Default: \code{0.9}.
#' \item \code{eps} (Adam only). Intended to be a small value to prevent
#' division by zero, but in practice can also affect convergence due to its
#' interaction with \code{beta2}. Higher values reduce the effect of the
#' step-size adaptivity and bring the behavior closer to stochastic gradient
#' descent with momentum. Typical values are between 1e-8 and 1e-3. Default:
#' \code{1e-7}.
#' \item \code{alpha} The initial learning rate. Default: the value of the
#' \code{learning_rate} parameter.
#' }
#' @param epoch_callback A function which will be invoked at the end of every
#' epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where:
#' \itemize{
#' \item \code{epoch} The current epoch number (between \code{1} and
#' \code{n_epochs}).
#' \item \code{n_epochs} Number of epochs to use during the optimization of
#' the embedded coordinates.
#' \item \code{coords} The embedded coordinates as of the end of the current
#' epoch, as a matrix with dimensions (N, \code{n_components}).
#' }
#' @param binary_edge_weights If \code{TRUE} then edge weights in the input
#' graph are treated as binary (0/1) rather than real valued. This affects the
#' sampling frequency of neighbors and is the strategy used by the PaCMAP
#' method (Wang and co-workers, 2020). Practical (Böhm and co-workers, 2020)
#' and theoretical (Damrich and Hamprecht, 2021) work suggests this has little
#' effect on UMAP's performance.
#' @param seed Integer seed to use to initialize the random number generator
#' state. Combined with \code{n_sgd_threads = 1} or \code{batch = TRUE}, this
#' should give consistent output across multiple runs on a given installation.
#' Setting this value is equivalent to calling \code{\link[base]{set.seed}},
#' but it may be more convenient in some situations than having to call a
#' separate function. The default is to not set a seed. If
#' \code{ret_model = TRUE}, the seed will be stored in the output model and
#' then used to set the seed inside \code{\link{umap_transform}}.
#' @return A matrix of optimized coordinates, or:
#' \itemize{
#' \item if \code{ret_model = TRUE} (or \code{ret_extra} contains
#' \code{"model"}), returns a list containing extra information that can be
#' used to add new data to an existing embedding via
#' \code{\link{umap_transform}}. In this case, the coordinates are available
#' in the list item \code{embedding}. \bold{NOTE}: The contents of
#' the \code{model} list should \emph{not} be considered stable or part of
#' the public API, and are purposely left undocumented.
#' \item if \code{ret_nn = TRUE} (or \code{ret_extra} contains \code{"nn"}),
#' returns the nearest neighbor data as a list called \code{nn}. This
#' contains one list for each \code{metric} calculated, itself containing a
#' matrix \code{idx} with the integer ids of the neighbors; and a matrix
#' \code{dist} with the distances. The \code{nn} list (or a sub-list) can be
#' used as input to the \code{nn_method} parameter.
#' \item if \code{ret_extra} contains \code{"fgraph"}, returns the high
#' dimensional fuzzy graph as a sparse matrix called \code{fgraph}, of type
#' \link[Matrix]{dgCMatrix-class}.
#' \item if \code{ret_extra} contains \code{"sigma"}, returns a vector of the
#' smooth knn distance normalization terms for each observation as
#' \code{"sigma"} and a vector \code{"rho"} containing the largest
#' distance to the locally connected neighbors of each observation.
#' \item if \code{ret_extra} contains \code{"localr"}, returns a vector of
#' the estimated local radii, the sum of \code{"sigma"} and \code{"rho"}.
#' }
#' The returned list contains the combined data from any combination of
#' specifying \code{ret_model}, \code{ret_nn} and \code{ret_extra}.
#' @examples
#' iris_tumap <- tumap(iris, n_neighbors = 50, learning_rate = 0.5)
#'
#' @references
#' Belkin, M., & Niyogi, P. (2002).
#' Laplacian eigenmaps and spectral techniques for embedding and clustering.
#' In \emph{Advances in neural information processing systems}
#' (pp. 585-591).
#' \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf} #' #' Böhm, J. N., Berens, P., & Kobak, D. (2020). #' A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum. #' \emph{arXiv preprint} \emph{arXiv:2007.08902}. #' \url{https://arxiv.org/abs/2007.08902} #' #' Damrich, S., & Hamprecht, F. A. (2021). #' On UMAP's true loss function. #' \emph{Advances in Neural Information Processing Systems}, \emph{34}. #' \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html} #' #' Dong, W., Moses, C., & Li, K. (2011, March). #' Efficient k-nearest neighbor graph construction for generic similarity measures. #' In \emph{Proceedings of the 20th international conference on World Wide Web} #' (pp. 577-586). #' ACM. #' \doi{10.1145/1963405.1963487}. #' #' Kingma, D. P., & Ba, J. (2014). #' Adam: A method for stochastic optimization. #' \emph{arXiv preprint} \emph{arXiv}:1412.6980. #' \url{https://arxiv.org/abs/1412.6980} #' #' Malkov, Y. A., & Yashunin, D. A. (2018). #' Efficient and robust approximate nearest neighbor search using hierarchical #' navigable small world graphs. #' \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836. #' #' McInnes, L., Healy, J., & Melville, J. (2018). #' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction #' \emph{arXiv preprint} \emph{arXiv}:1802.03426. #' \url{https://arxiv.org/abs/1802.03426} #' #' O'Neill, M. E. (2014). #' \emph{PCG: A family of simple fast space-efficient statistically good #' algorithms for random number generation} #' (Report No. HMC-CS-2014-0905). Harvey Mudd College. #' #' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). #' Visualizing large-scale and high-dimensional data. #' In \emph{Proceedings of the 25th International Conference on World Wide Web} #' (pp. 287-297). #' International World Wide Web Conferences Steering Committee. #' \url{https://arxiv.org/abs/1602.00370} #' #' Van der Maaten, L., & Hinton, G. (2008). #' Visualizing data using t-SNE. #' \emph{Journal of Machine Learning Research}, \emph{9} (2579-2605). #' \url{https://www.jmlr.org/papers/v9/vandermaaten08a.html} #' #' Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021). #' Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization. #' \emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73. 
#' \url{https://www.jmlr.org/papers/v22/20-1061.html}
#'
#' @export
tumap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
                  n_epochs = NULL, learning_rate = 1, scale = FALSE,
                  init = "spectral", init_sdev = NULL, set_op_mix_ratio = 1.0,
                  local_connectivity = 1.0, bandwidth = 1.0,
                  repulsion_strength = 1.0, negative_sample_rate = 5.0,
                  nn_method = NULL, n_trees = 50,
                  search_k = 2 * n_neighbors * n_trees, n_threads = NULL,
                  n_sgd_threads = 0, grain_size = 1, y = NULL,
                  target_n_neighbors = n_neighbors, target_metric = "euclidean",
                  target_weight = 0.5, pca = NULL, pca_center = TRUE,
                  pcg_rand = TRUE, fast_sgd = FALSE, ret_model = FALSE,
                  ret_nn = FALSE, ret_extra = c(), tmpdir = tempdir(),
                  verbose = getOption("verbose", TRUE), batch = FALSE,
                  opt_args = NULL, epoch_callback = NULL, pca_method = NULL,
                  binary_edge_weights = FALSE, seed = NULL, nn_args = list(),
                  rng_type = NULL) {
  uwot(
    X = X, n_neighbors = n_neighbors, n_components = n_components,
    metric = metric, n_epochs = n_epochs, alpha = learning_rate, scale = scale,
    init = init, init_sdev = init_sdev, spread = NULL, min_dist = NULL,
    set_op_mix_ratio = set_op_mix_ratio,
    local_connectivity = local_connectivity, bandwidth = bandwidth,
    gamma = repulsion_strength, negative_sample_rate = negative_sample_rate,
    a = NULL, b = NULL, nn_method = nn_method, n_trees = n_trees,
    search_k = search_k, method = "tumap", n_threads = n_threads,
    n_sgd_threads = n_sgd_threads, grain_size = grain_size, y = y,
    target_n_neighbors = target_n_neighbors, target_weight = target_weight,
    target_metric = target_metric, pca = pca, pca_center = pca_center,
    pca_method = pca_method, pcg_rand = pcg_rand, fast_sgd = fast_sgd,
    ret_model = ret_model || "model" %in% ret_extra,
    ret_nn = ret_nn || "nn" %in% ret_extra,
    ret_fgraph = "fgraph" %in% ret_extra,
    ret_sigma = "sigma" %in% ret_extra,
    ret_localr = "localr" %in% ret_extra,
    batch = batch, opt_args = opt_args, epoch_callback = epoch_callback,
    binary_edge_weights = binary_edge_weights, seed = seed, tmpdir = tmpdir,
    verbose = verbose, nn_args = nn_args, rng_type = rng_type
  )
}

#' Dimensionality Reduction with a LargeVis-like method
#'
#' Carry out dimensionality reduction of a dataset using a method similar to
#' LargeVis (Tang et al., 2016).
#'
#' \code{lvish} differs from the official LargeVis implementation in the
#' following:
#'
#' \itemize{
#' \item Only the nearest-neighbor index search phase is multi-threaded.
#' \item Matrix input data is not normalized.
#' \item The \code{n_trees} parameter cannot be dynamically chosen based on
#' data set size.
#' \item Nearest neighbor results are not refined via the
#' neighbor-of-my-neighbor method. The \code{search_k} parameter is twice
#' as large as the default to compensate.
#' \item Gradient values are clipped to \code{4.0} rather than \code{5.0}.
#' \item Negative edges are generated by uniform sampling of vertices rather
#' than sampling in proportion to their degree ^ 0.75.
#' \item The default number of samples is much reduced. The default number of
#' epochs, \code{n_epochs}, is set to \code{5000}, much larger than for
#' \code{\link{umap}}, but may need to be increased further depending on your
#' dataset. Using \code{init = "spectral"} can help.
#' }
#'
#' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}},
#' \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}.
#' Matrix and data frames should contain one observation per row.
Data frames
#' will have any non-numeric columns removed, although factor columns will be
#' used if explicitly included via \code{metric} (see the help for
#' \code{metric} for details). A sparse matrix is interpreted as a distance
#' matrix, and is assumed to be symmetric, so you can also pass in an
#' explicitly upper or lower triangular sparse matrix to save storage. There
#' must be at least \code{n_neighbors} non-zero distances for each row. Both
#' implicit and explicit zero entries are ignored. Set zero distances you want
#' to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}).
#' \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is
#' passed to \code{nn_method}, and \code{init} is not \code{"spca"} or
#' \code{"pca"}.
#' @param perplexity Controls the size of the local neighborhood used for
#' manifold approximation. This is analogous to \code{n_neighbors} in
#' \code{\link{umap}}. Change this, rather than \code{n_neighbors}.
#' @param n_neighbors The number of neighbors to use when calculating the
#' \code{perplexity}. Usually set to three times the value of the
#' \code{perplexity}. Must be at least as large as \code{perplexity}.
#' @param n_components The dimension of the space to embed into. This defaults
#' to \code{2} to provide easy visualization, but can reasonably be set to any
#' integer value in the range \code{2} to \code{100}.
#' @param metric Type of distance metric to use to find nearest neighbors. For
#' \code{nn_method = "annoy"} this can be one of:
#' \itemize{
#' \item \code{"euclidean"} (the default)
#' \item \code{"cosine"}
#' \item \code{"manhattan"}
#' \item \code{"hamming"}
#' \item \code{"correlation"} (a distance based on the Pearson correlation)
#' \item \code{"categorical"} (see below)
#' }
#' For \code{nn_method = "hnsw"} this can be one of:
#' \itemize{
#' \item \code{"euclidean"}
#' \item \code{"cosine"}
#' \item \code{"correlation"}
#' }
#' If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is
#' installed and \code{nn_method = "nndescent"} is specified then many more
#' metrics are available, including:
#' \itemize{
#' \item \code{"braycurtis"}
#' \item \code{"canberra"}
#' \item \code{"chebyshev"}
#' \item \code{"dice"}
#' \item \code{"hamming"}
#' \item \code{"hellinger"}
#' \item \code{"jaccard"}
#' \item \code{"jensenshannon"}
#' \item \code{"kulsinski"}
#' \item \code{"rogerstanimoto"}
#' \item \code{"russellrao"}
#' \item \code{"sokalmichener"}
#' \item \code{"sokalsneath"}
#' \item \code{"spearmanr"}
#' \item \code{"symmetrickl"}
#' \item \code{"tsss"}
#' \item \code{"yule"}
#' }
#' For more details see the package documentation of \code{rnndescent}.
#' For \code{nn_method = "fnn"}, the distance metric is always "euclidean".
#'
#' If \code{X} is a data frame or matrix, then multiple metrics can be
#' specified, by passing a list to this argument, where the name of each item in
#' the list is one of the metric names above. The value of each list item should
#' be a vector giving the names or integer ids of the columns to be included in
#' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}.
#'
#' Each metric calculation results in a separate fuzzy simplicial set, which are
#' intersected together to produce the final set. Metric names can be repeated.
#' Because non-numeric columns are removed from the data frame, it is safer to
#' use column names than integer ids.
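#'
#' For example, a mixed-metric specification could look like the following
#' (a sketch; the data frame and column names are illustrative):
#' \preformatted{
#' res <- lvish(df, metric = list(
#'   euclidean = c("x1", "x2"),
#'   cosine = c("x3", "x4")
#' ))
#' }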
#'
#' Factor columns can also be used by specifying the metric name
#' \code{"categorical"}. Factor columns are treated differently from numeric
#' columns and although multiple factor columns can be specified in a vector,
#' each factor column specified is processed individually. If you specify
#' a non-factor column, it will be coerced to a factor.
#'
#' For a given data block, you may override the \code{pca} and \code{pca_center}
#' arguments for that block, by providing a list with one unnamed item
#' containing the column names or ids, and then any of the \code{pca} or
#' \code{pca_center} overrides as named items, e.g. \code{metric =
#' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This
#' exists to allow mixed binary and real-valued data to be included and to have
#' PCA applied to both, but with centering applied only to the real-valued data
#' (it is typical not to apply centering to binary data before PCA is applied).
#' @param n_epochs Number of epochs to use during the optimization of the
#' embedded coordinates. The default is to calculate the number of epochs
#' dynamically based on dataset size, to give the same number of edge samples
#' as the LargeVis defaults. This is usually substantially larger than the
#' UMAP defaults. If \code{n_epochs = 0}, then coordinates determined by
#' \code{"init"} will be returned.
#' @param learning_rate Initial learning rate used in optimization of the
#' coordinates.
#' @param scale Scaling to apply to \code{X} if it is a data frame or matrix:
#' \itemize{
#' \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling.
#' \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to
#' zero mean and variance 1.
#' \item{\code{"maxabs"}} Center each column to mean 0, then divide each
#' element by the maximum absolute value over the entire matrix.
#' \item{\code{"range"}} Range scale the entire matrix, so the smallest
#' element is 0 and the largest is 1.
#' \item{\code{"colrange"}} Scale each column in the range (0,1).
#' }
#' For lvish, the default is \code{"maxabs"}, for consistency with LargeVis.
#' @param init Type of initialization for the coordinates. Options are:
#' \itemize{
#' \item \code{"spectral"} Spectral embedding using the normalized Laplacian
#' of the fuzzy 1-skeleton, with Gaussian noise added.
#' \item \code{"normlaplacian"}. Spectral embedding using the normalized
#' Laplacian of the fuzzy 1-skeleton, without noise.
#' \item \code{"random"}. Coordinates assigned using a uniform random
#' distribution between -10 and 10.
#' \item \code{"lvrandom"}. Coordinates assigned using a Gaussian
#' distribution with standard deviation 1e-4, as used in LargeVis
#' (Tang et al., 2016) and t-SNE.
#' \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap
#' (Belkin and Niyogi, 2002).
#' \item \code{"pca"}. The first two principal components from PCA of
#' \code{X} if \code{X} is a data frame, and from a 2-dimensional classical
#' MDS if \code{X} is of class \code{"dist"}.
#' \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled
#' so the standard deviation is 1e-4, to give a distribution similar to that
#' used in t-SNE and LargeVis. This is an alias for \code{init = "pca",
#' init_sdev = 1e-4}.
#' \item \code{"agspectral"} An "approximate global" modification of
#' \code{"spectral"} which sets all edges in the graph to a value of 1, and
#' then sets a random number of edges (\code{negative_sample_rate} edges per
#' vertex) to 0.1, to approximate the effect of non-local affinities.
#' \item A matrix of initial coordinates.
#' }
#' For spectral initializations (\code{"spectral"}, \code{"normlaplacian"},
#' \code{"laplacian"}, \code{"agspectral"}), if more than one connected
#' component is identified, no spectral initialization is attempted. Instead
#' a PCA-based initialization is used. If \code{verbose = TRUE} the
#' number of connected components is logged to the console. The existence of
#' multiple connected components implies that a global view of the data cannot
#' be attained with this initialization. Increasing the value of
#' \code{n_neighbors} may help.
#' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized
#' coordinates (including any user-supplied matrix) to this standard
#' deviation. By default no scaling is carried out, except when \code{init =
#' "spca"}, in which case the value is \code{0.0001}. Scaling the input may
#' help if the unscaled versions result in initial coordinates with large
#' inter-point distances or outliers. This usually results in small gradients
#' during optimization and very little progress being made in the layout.
#' Shrinking the initial embedding by rescaling can help under these
#' circumstances. Scaling the result of \code{init = "pca"} is usually
#' recommended, hence \code{init = "spca"} exists as an alias for \code{init =
#' "pca", init_sdev = 1e-4}, but for the spectral initializations the scaled
#' versions usually aren't necessary unless you are using a large value of
#' \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). For
#' compatibility with recent versions of the Python UMAP package, if you are
#' using \code{init = "spectral"}, then you should also set
#' \code{init_sdev = "range"}, which will range scale each of the columns
#' containing the initial data between 0-10. This is not set by default to
#' maintain backwards compatibility with previous versions of uwot.
#' @param repulsion_strength Weighting applied to negative samples in low
#' dimensional embedding optimization. Values higher than one will result in
#' greater weight being given to negative samples.
#' @param negative_sample_rate The number of negative edge/1-simplex samples to
#' use per positive edge/1-simplex sample in optimizing the low dimensional
#' embedding.
#' @param nn_method Method for finding nearest neighbors. Options are:
#' \itemize{
#' \item \code{"fnn"}. Use exact nearest neighbors via the
#' \href{https://cran.r-project.org/package=FNN}{FNN} package.
#' \item \code{"annoy"} Use approximate nearest neighbors via the
#' \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package.
#' \item \code{"hnsw"} Use approximate nearest neighbors with the
#' Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin,
#' 2018) via the
#' \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package.
#' \code{RcppHNSW} is not a dependency of this package: this option is
#' only available if you have installed \code{RcppHNSW} yourself. Also,
#' HNSW only supports the following arguments for \code{metric}:
#' \code{"euclidean"}, \code{"cosine"} and \code{"correlation"}.
#' \item \code{"nndescent"} Use approximate nearest neighbors with the
#' Nearest Neighbor Descent method (Dong et al., 2011) via the
#' \href{https://cran.r-project.org/package=rnndescent}{rnndescent}
#' package. \code{rnndescent} is not a dependency of this package: this
#' option is only available if you have installed \code{rnndescent}
#' yourself.
#' }
#' By default, if \code{X} has fewer than 4,096 vertices, the exact nearest
#' neighbors are found.
Otherwise, approximate nearest neighbors are used.
#' You may also pass precalculated nearest neighbor data to this argument. It
#' must be a list consisting of two elements:
#' \itemize{
#' \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix
#' containing the integer indexes of the nearest neighbors in \code{X}. Each
#' vertex is considered to be its own nearest neighbor, i.e.
#' \code{idx[, 1] == 1:n_vertices}.
#' \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix
#' containing the distances of the nearest neighbors.
#' }
#' Multiple nearest neighbor data (e.g. from two different precomputed
#' metrics) can be passed by passing a list containing the nearest neighbor
#' data lists as items.
#' The \code{n_neighbors} parameter is ignored when using precomputed
#' nearest neighbor data.
#' @param n_trees Number of trees to build when constructing the nearest
#' neighbor index. The more trees specified, the larger the index, but the
#' better the results. With \code{search_k}, determines the accuracy of the
#' Annoy nearest neighbor search. Only used if the \code{nn_method} is
#' \code{"annoy"}. Sensible values are between \code{10} and \code{100}.
#' @param search_k Number of nodes to search during the neighbor retrieval. The
#' larger \code{search_k}, the more accurate the results, but the longer the
#' search takes. With \code{n_trees}, determines the accuracy of the Annoy
#' nearest neighbor search. Only used if the \code{nn_method} is
#' \code{"annoy"}.
#' @param nn_args A list containing additional arguments to pass to the nearest
#' neighbor method. For \code{nn_method = "annoy"}, you can specify
#' \code{"n_trees"} and \code{"search_k"}, and these will override the
#' \code{n_trees} and \code{search_k} parameters.
#' For \code{nn_method = "hnsw"}, you may specify the following arguments:
#' \itemize{
#' \item \code{M} The maximum number of neighbors to keep for each vertex.
#' Reasonable values are \code{2} to \code{100}. Higher values give better
#' recall at the cost of more memory. Default value is \code{16}.
#' \item \code{ef_construction} A positive integer specifying the size of
#' the dynamic list used during index construction. A higher value will
#' provide better results at the cost of a longer time to build the index.
#' Default is \code{200}.
#' \item \code{ef} A positive integer specifying the size of the dynamic
#' list used during search. This cannot be smaller than \code{n_neighbors}
#' and cannot be higher than the number of items in the index. Default is
#' \code{10}.
#' }
#' For \code{nn_method = "nndescent"}, you may specify the following
#' arguments:
#' \itemize{
#' \item \code{n_trees} The number of trees to use in a random projection
#' forest to initialize the search. A larger number will give more accurate
#' results at the cost of a longer computation time. The default of
#' \code{NULL} means that the number is chosen based on the number of
#' observations in \code{X}.
#' \item \code{max_candidates} The number of potential neighbors to explore
#' per iteration. By default, this is set to \code{n_neighbors} or \code{60},
#' whichever is smaller. A larger number will give more accurate results at
#' the cost of a longer computation time.
#' \item \code{n_iters} The number of iterations to run the search. A larger
#' number will give more accurate results at the cost of a longer computation
#' time. By default, this will be chosen based on the number of observations
#' in \code{X}. You may also need to modify the convergence criterion
#' \code{delta}.
#' \item \code{delta} The minimum relative change in the neighbor graph
#' allowed before early stopping. Should be a value between 0 and 1. The
#' smaller the value, the less progress is required between iterations for
#' the search to continue. Default value of \code{0.001} means that at least
#' 0.1% of the neighbor graph must be updated at each iteration.
#' \item \code{init} How to initialize the nearest neighbor descent. By
#' default this is set to \code{"tree"} and uses a random projection forest.
#' If you set this to \code{"rand"}, then a random selection is used. Usually
#' this is less accurate than using RP trees, but for high-dimensional cases,
#' there may be little difference in the quality of the initialization and
#' random initialization will be a lot faster. If you set this to
#' \code{"rand"}, then the \code{n_trees} parameter is ignored.
#' }
#' @param n_threads Number of threads to use (except during stochastic gradient
#' descent). Default is half the number of concurrent threads supported by the
#' system. For nearest neighbor search, only applies if
#' \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index
#' will be temporarily written to disk in the location determined by
#' \code{\link[base]{tempfile}}.
#' @param n_sgd_threads Number of threads to use during stochastic gradient
#' descent. If set to > 1, then be aware that if \code{batch = FALSE}, results
#' will \emph{not} be reproducible, even if \code{set.seed} is called with a
#' fixed seed before running. Set to \code{"auto"} to use the same value as
#' \code{n_threads}.
#' @param grain_size The minimum amount of work to do on each thread. If this
#' value is set high enough, then fewer than \code{n_threads} or
#' \code{n_sgd_threads} threads will be used for processing, which might give a
#' performance improvement if the overhead of thread management and context
#' switching was outweighing the improvement due to concurrent processing.
#' This should be left at default (\code{1}) and work will be spread evenly
#' over all the threads specified.
#' @param kernel Type of kernel function to create input probabilities. Can be
#' one of \code{"gauss"} (the default) or \code{"knn"}. \code{"gauss"} uses
#' the usual Gaussian weighted similarities. \code{"knn"} assigns equal
#' probabilities to every edge in the nearest neighbor graph, and zero
#' otherwise, using \code{perplexity} nearest neighbors. The \code{n_neighbors}
#' parameter is ignored in this case.
#' @param pca If set to a positive integer value, reduce data to this number of
#' columns using PCA. Not applied if the distance \code{metric} is
#' \code{"hamming"}, or if the dimensions of the data do not exceed the number
#' specified (i.e. the number of rows and the number of columns must both be
#' larger than the value of this parameter). If you have > 100 columns in a
#' data frame or matrix, reducing the number of columns in this way may
#' substantially increase the performance of the nearest neighbor search at
#' the cost of a potential decrease in accuracy. In many t-SNE applications, a
#' value of 50 is recommended, although there's no guarantee that this is
#' appropriate for all settings.
#' @param pca_center If \code{TRUE}, center the columns of \code{X} before
#' carrying out PCA. For binary data, it's recommended to set this to
#' \code{FALSE}.
#' @param pca_method Method to carry out any PCA dimensionality reduction when
#' the \code{pca} parameter is specified. Allowed values are:
#' \itemize{
#' \item{\code{"irlba"}}.
Uses \code{\link[irlba]{prcomp_irlba}} from the
#' \href{https://cran.r-project.org/package=irlba}{irlba} package.
#' \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from
#' the \href{https://cran.r-project.org/package=irlba}{irlba} package.
#' This is likely to give much faster but potentially less accurate results
#' than using \code{"irlba"}. For the purposes of nearest neighbor
#' calculation and coordinate initialization, any loss of accuracy doesn't
#' seem to matter much.
#' \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}}
#' from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr}
#' package. The SVD methods used in \code{bigstatsr} may be faster on
#' systems without access to efficient linear algebra libraries (e.g.
#' Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of
#' uwot: if you choose to use this package for PCA, you \emph{must} install
#' it yourself.
#' \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is
#' likely to be slow for all but the smallest datasets.
#' \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than
#' 50% of the full set of singular vectors would be calculated, in which
#' case \code{"svd"} is used.
#' }
#' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill,
#' 2014) during optimization. Otherwise, use the faster (but probably less
#' statistically good) Tausworthe "taus88" generator. The default is
#' \code{TRUE}. This parameter has been superseded by \code{rng_type} -- if
#' both are set, \code{rng_type} takes precedence.
#' @param rng_type The type of random number generator to use during
#' optimization. One of:
#' \itemize{
#' \item{\code{"pcg"}}. Use the PCG random number generator (O'Neill, 2014).
#' \item{\code{"tausworthe"}}. Use the Tausworthe "taus88" generator.
#' \item{\code{"deterministic"}}. Use a deterministic number generator. This
#' isn't actually random, but may provide enough variation in the negative
#' sampling to give a good embedding and can provide a noticeable speed-up.
#' }
#' For backwards compatibility, by default this is unset and the choice of
#' \code{pcg_rand} is used (making "pcg" the effective default).
#' @param fast_sgd If \code{TRUE}, then the following combination of parameters
#' is set: \code{pcg_rand = TRUE} and \code{n_sgd_threads = "auto"}. The
#' default is \code{FALSE}. Setting this to \code{TRUE} will speed up the
#' stochastic optimization phase, but give a potentially less accurate
#' embedding, which will also not be exactly reproducible even with a fixed
#' seed. For visualization, \code{fast_sgd = TRUE} will give perfectly good
#' results. For more generic dimensionality reduction, it's safer to leave
#' \code{fast_sgd = FALSE}. If \code{fast_sgd = TRUE}, then user-supplied
#' values of \code{pcg_rand} and \code{n_sgd_threads} are ignored.
#' @param batch If \code{TRUE}, then embedding coordinates are updated at the
#' end of each epoch rather than during the epoch. In batch mode, results are
#' reproducible with a fixed random seed even with \code{n_sgd_threads > 1},
#' at the cost of a slightly higher memory use. You may also have to modify
#' \code{learning_rate} and increase \code{n_epochs}, so whether this provides
#' a speed increase over the single-threaded optimization is likely to be
#' dataset and hardware-dependent.
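#'
#' For example, a reproducible multi-threaded run might be set up like this
#' (a minimal sketch; \code{X} stands for your input data and the optimizer
#' settings are illustrative -- see \code{opt_args} below):
#' \preformatted{
#' set.seed(42)
#' res <- lvish(X, batch = TRUE, n_sgd_threads = 4,
#'              opt_args = list(alpha = 1, beta1 = 0.5))
#' }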
#' @param ret_nn If \code{TRUE}, then in addition to the embedding, also return #' nearest neighbor data that can be used as input to \code{nn_method} to #' avoid the overhead of repeatedly calculating the nearest neighbors when #' manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs}, #' \code{init}). See the "Value" section for the names of the list items. If #' \code{FALSE}, just return the coordinates. Note that the nearest neighbors #' could be sensitive to data scaling, so be wary of reusing nearest neighbor #' data if modifying the \code{scale} parameter. #' @param ret_extra A vector indicating what extra data to return. May contain #' any combination of the following strings: #' \itemize{ #' \item \code{"nn"} same as setting \code{ret_nn = TRUE}. #' \item \code{"P"} the high dimensional probability matrix. The graph #' is returned as a sparse symmetric N x N matrix of class #' \link[Matrix]{dgCMatrix-class}, where a non-zero entry (i, j) gives the #' input probability (or similarity or affinity) of the edge connecting #' vertex i and vertex j. Note that the graph is further sparsified by #' removing edges with sufficiently low membership strength that they #' would not be sampled by the probabilistic edge sampling employed for #' optimization and therefore the number of non-zero elements in the #' matrix is dependent on \code{n_epochs}. If you are only interested in #' the fuzzy input graph (e.g. for clustering), setting #' \code{n_epochs = 0} will avoid any further sparsifying. Be aware that #' setting \code{binary_edge_weights = TRUE} will affect this graph (all #' non-zero edge weights will be 1). #' \item \code{sigma} a vector of the bandwidths used to calibrate the input #' Gaussians to reproduce the target \code{"perplexity"}. #' } #' @param tmpdir Temporary directory to store nearest neighbor indexes during #' nearest neighbor search. Default is \code{\link{tempdir}}. The index is #' only written to disk if \code{n_threads > 1} and #' \code{nn_method = "annoy"}; otherwise, this parameter is ignored. #' @param verbose If \code{TRUE}, log details to the console. #' @param opt_args A list of optimizer parameters, used when #' \code{batch = TRUE}. The default optimization method used is Adam (Kingma #' and Ba, 2014). #' \itemize{ #' \item \code{method} The optimization method to use. Either \code{"adam"} #' or \code{"sgd"} (stochastic gradient descent). Default: \code{"adam"}. #' \item \code{beta1} (Adam only). The weighting parameter for the #' exponential moving average of the first moment estimator. Effectively the #' momentum parameter. Should be a floating point value between 0 and 1. #' Higher values can smooth oscillatory updates in poorly-conditioned #' situations and may allow for a larger \code{learning_rate} to be #' specified, but too high can cause divergence. Default: \code{0.5}. #' \item \code{beta2} (Adam only). The weighting parameter for the #' exponential moving average of the uncentered second moment estimator. #' Should be a floating point value between 0 and 1. Controls the degree of #' adaptivity in the step-size. Higher values put more weight on previous #' time steps. Default: \code{0.9}. #' \item \code{eps} (Adam only). Intended to be a small value to prevent #' division by zero, but in practice can also affect convergence due to its #' interaction with \code{beta2}. Higher values reduce the effect of the #' step-size adaptivity and bring the behavior closer to stochastic gradient #' descent with momentum. 
Typical values are between 1e-8 and 1e-3. Default:
#' \code{1e-7}.
#' \item \code{alpha} The initial learning rate. Default: the value of the
#' \code{learning_rate} parameter.
#' }
#' @param epoch_callback A function which will be invoked at the end of every
#' epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where:
#' \itemize{
#' \item \code{epoch} The current epoch number (between \code{1} and
#' \code{n_epochs}).
#' \item \code{n_epochs} Number of epochs to use during the optimization of
#' the embedded coordinates.
#' \item \code{coords} The embedded coordinates as of the end of the current
#' epoch, as a matrix with dimensions (N, \code{n_components}).
#' }
#' @param binary_edge_weights If \code{TRUE} then edge weights in the input
#' graph are treated as binary (0/1) rather than real valued. This affects the
#' sampling frequency of neighbors and is the strategy used by the PaCMAP
#' method (Wang and co-workers, 2021). Practical (Böhm and co-workers, 2020)
#' and theoretical (Damrich and Hamprecht, 2021) work suggests this has little
#' effect on UMAP's performance.
#' @return A matrix of optimized coordinates, or:
#' \itemize{
#' \item if \code{ret_nn = TRUE} (or \code{ret_extra} contains \code{"nn"}),
#' returns the nearest neighbor data as a list called \code{nn}. This
#' contains one list for each \code{metric} calculated, itself containing a
#' matrix \code{idx} with the integer ids of the neighbors; and a matrix
#' \code{dist} with the distances. The \code{nn} list (or a sub-list) can be
#' used as input to the \code{nn_method} parameter.
#' \item if \code{ret_extra} contains \code{"P"}, returns the high
#' dimensional probability matrix as a sparse matrix called \code{P}, of
#' type \link[Matrix]{dgCMatrix-class}.
#' \item if \code{ret_extra} contains \code{"sigma"}, returns a vector of
#' the high dimensional Gaussian bandwidths for each point, and
#' \code{"dint"} a vector of estimates of the intrinsic dimensionality at
#' each point, based on the method given by Lee and co-workers (2015).
#' }
#' The returned list contains the combined data from any combination of
#' specifying \code{ret_nn} and \code{ret_extra}.
#'
#' @examples
#' # The default number of epochs is much larger than for UMAP, and assumes a
#' # random initialization. Use perplexity rather than n_neighbors to control
#' # the size of the local neighborhood. 20 epochs may be too small for a
#' # random initialization.
#' iris_lvish <- lvish(iris,
#'   perplexity = 50, learning_rate = 0.5,
#'   init = "random", n_epochs = 20
#' )
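#'
#' # A sketch of reusing nearest neighbor data from a previous run (returned
#' # via ret_extra = "nn") to avoid recomputing the neighbors when changing
#' # optimization settings:
#' iris_lvish_nn <- lvish(iris,
#'   perplexity = 25, init = "random", n_epochs = 0,
#'   ret_extra = c("nn")
#' )
#' iris_lvish2 <- lvish(
#'   X = NULL, nn_method = iris_lvish_nn$nn$euclidean,
#'   perplexity = 25, init = "random", n_epochs = 20
#' )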
#'
#' @references
#' Belkin, M., & Niyogi, P. (2002).
#' Laplacian eigenmaps and spectral techniques for embedding and clustering.
#' In \emph{Advances in neural information processing systems}
#' (pp. 585-591).
#' \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf}
#'
#' Böhm, J. N., Berens, P., & Kobak, D. (2020).
#' A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum.
#' \emph{arXiv preprint} \emph{arXiv:2007.08902}.
#' \url{https://arxiv.org/abs/2007.08902}
#'
#' Damrich, S., & Hamprecht, F. A. (2021).
#' On UMAP's true loss function.
#' \emph{Advances in Neural Information Processing Systems}, \emph{34}.
#' \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html}
#'
#' Dong, W., Moses, C., & Li, K. (2011, March).
#' Efficient k-nearest neighbor graph construction for generic similarity measures.
#' In \emph{Proceedings of the 20th international conference on World Wide Web}
#' (pp. 577-586).
#' ACM.
#' \doi{10.1145/1963405.1963487}.
#'
#' Kingma, D. P., & Ba, J. (2014).
#' Adam: A method for stochastic optimization.
#' \emph{arXiv preprint} \emph{arXiv}:1412.6980.
#' \url{https://arxiv.org/abs/1412.6980}
#'
#' Lee, J. A., Peluffo-Ordóñez, D. H., & Verleysen, M. (2015).
#' Multi-scale similarities in stochastic neighbour embedding: Reducing
#' dimensionality while preserving both local and global structure.
#' \emph{Neurocomputing}, \emph{169}, 246-261.
#'
#' Malkov, Y. A., & Yashunin, D. A. (2018).
#' Efficient and robust approximate nearest neighbor search using hierarchical
#' navigable small world graphs.
#' \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836.
#'
#' McInnes, L., Healy, J., & Melville, J. (2018).
#' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction.
#' \emph{arXiv preprint} \emph{arXiv}:1802.03426.
#' \url{https://arxiv.org/abs/1802.03426}
#'
#' O'Neill, M. E. (2014).
#' \emph{PCG: A family of simple fast space-efficient statistically good
#' algorithms for random number generation}
#' (Report No. HMC-CS-2014-0905). Harvey Mudd College.
#'
#' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April).
#' Visualizing large-scale and high-dimensional data.
#' In \emph{Proceedings of the 25th International Conference on World Wide Web}
#' (pp. 287-297).
#' International World Wide Web Conferences Steering Committee.
#' \url{https://arxiv.org/abs/1602.00370}
#'
#' Van der Maaten, L., & Hinton, G. (2008).
#' Visualizing data using t-SNE.
#' \emph{Journal of Machine Learning Research}, \emph{9} (2579-2605).
#' \url{https://www.jmlr.org/papers/v9/vandermaaten08a.html}
#'
#' Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021).
#' Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization.
#' \emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73.
#' \url{https://www.jmlr.org/papers/v22/20-1061.html}
#'
#' @export
lvish <- function(X, perplexity = 50, n_neighbors = perplexity * 3,
                  n_components = 2, metric = "euclidean", n_epochs = -1,
                  learning_rate = 1, scale = "maxabs", init = "lvrandom",
                  init_sdev = NULL, repulsion_strength = 7,
                  negative_sample_rate = 5.0, nn_method = NULL, n_trees = 50,
                  search_k = 2 * n_neighbors * n_trees, n_threads = NULL,
                  n_sgd_threads = 0, grain_size = 1, kernel = "gauss",
                  pca = NULL, pca_center = TRUE, pcg_rand = TRUE,
                  fast_sgd = FALSE, ret_nn = FALSE, ret_extra = c(),
                  tmpdir = tempdir(), verbose = getOption("verbose", TRUE),
                  batch = FALSE, opt_args = NULL, epoch_callback = NULL,
                  pca_method = NULL, binary_edge_weights = FALSE,
                  nn_args = list(), rng_type = NULL) {
  uwot(X,
    n_neighbors = n_neighbors, n_components = n_components, metric = metric,
    n_epochs = n_epochs, alpha = learning_rate, scale = scale, init = init,
    init_sdev = init_sdev, gamma = repulsion_strength,
    negative_sample_rate = negative_sample_rate, nn_method = nn_method,
    n_trees = n_trees, search_k = search_k, method = "largevis",
    perplexity = perplexity, pca = pca, pca_center = pca_center,
    pca_method = pca_method, n_threads = n_threads,
    n_sgd_threads = n_sgd_threads, grain_size = grain_size, kernel = kernel,
    ret_nn = ret_nn || "nn" %in% ret_extra,
    ret_fgraph = "P" %in% ret_extra,
    ret_sigma = "sigma" %in% ret_extra,
    pcg_rand = pcg_rand, fast_sgd = fast_sgd, batch = batch,
    opt_args = opt_args, epoch_callback = epoch_callback, tmpdir = tmpdir,
    binary_edge_weights = binary_edge_weights, verbose = verbose,
    nn_args = nn_args, rng_type = rng_type
  )
}

#' Similarity Graph
#'
#' Create a graph (as a sparse symmetric weighted adjacency matrix) representing
#' the similarities between items in a data set. No dimensionality reduction is
#' carried out. By default, the similarities are calculated using the merged
#' fuzzy simplicial set approach in the Uniform Manifold Approximation and
#' Projection (UMAP) method (McInnes et al., 2018), but the approach from
#' LargeVis (Tang et al., 2016) can also be used.
#'
#' This is equivalent to running \code{\link{umap}} with the
#' \code{ret_extra = c("fgraph")} parameter, but without the overhead of
#' calculating (or returning) the optimized low-dimensional coordinates.
#'
#' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}},
#' \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}.
#' Matrix and data frames should contain one observation per row. Data frames
#' will have any non-numeric columns removed, although factor columns will be
#' used if explicitly included via \code{metric} (see the help for
#' \code{metric} for details). A sparse matrix is interpreted as a distance
#' matrix, and is assumed to be symmetric, so you can also pass in an
#' explicitly upper or lower triangular sparse matrix to save storage. There
#' must be at least \code{n_neighbors} non-zero distances for each row. Both
#' implicit and explicit zero entries are ignored. Set zero distances you want
#' to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}).
#' \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is
#' passed to \code{nn_method}.
#' @param n_neighbors The size of local neighborhood (in terms of number of
#' neighboring sample points) used for manifold approximation. Larger values
#' result in more global views of the manifold, while smaller values result in
#' more local data being preserved.
In general values should be in the range
#' \code{2} to \code{100}.
#' @param metric Type of distance metric to use to find nearest neighbors. For
#' \code{nn_method = "annoy"} this can be one of:
#' \itemize{
#' \item \code{"euclidean"} (the default)
#' \item \code{"cosine"}
#' \item \code{"manhattan"}
#' \item \code{"hamming"}
#' \item \code{"correlation"} (a distance based on the Pearson correlation)
#' \item \code{"categorical"} (see below)
#' }
#' For \code{nn_method = "hnsw"} this can be one of:
#' \itemize{
#' \item \code{"euclidean"}
#' \item \code{"cosine"}
#' \item \code{"correlation"}
#' }
#' If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is
#' installed and \code{nn_method = "nndescent"} is specified then many more
#' metrics are available, including:
#' \itemize{
#' \item \code{"braycurtis"}
#' \item \code{"canberra"}
#' \item \code{"chebyshev"}
#' \item \code{"dice"}
#' \item \code{"hamming"}
#' \item \code{"hellinger"}
#' \item \code{"jaccard"}
#' \item \code{"jensenshannon"}
#' \item \code{"kulsinski"}
#' \item \code{"rogerstanimoto"}
#' \item \code{"russellrao"}
#' \item \code{"sokalmichener"}
#' \item \code{"sokalsneath"}
#' \item \code{"spearmanr"}
#' \item \code{"symmetrickl"}
#' \item \code{"tsss"}
#' \item \code{"yule"}
#' }
#' For more details see the package documentation of \code{rnndescent}.
#' For \code{nn_method = "fnn"}, the distance metric is always "euclidean".
#'
#' If \code{X} is a data frame or matrix, then multiple metrics can be
#' specified, by passing a list to this argument, where the name of each item in
#' the list is one of the metric names above. The value of each list item should
#' be a vector giving the names or integer ids of the columns to be included in
#' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}.
#'
#' Each metric calculation results in a separate fuzzy simplicial set, which are
#' intersected together to produce the final set. Metric names can be repeated.
#' Because non-numeric columns are removed from the data frame, it is safer to
#' use column names than integer ids.
#'
#' Factor columns can also be used by specifying the metric name
#' \code{"categorical"}. Factor columns are treated differently from numeric
#' columns and although multiple factor columns can be specified in a vector,
#' each factor column specified is processed individually. If you specify
#' a non-factor column, it will be coerced to a factor.
#'
#' For a given data block, you may override the \code{pca} and \code{pca_center}
#' arguments for that block, by providing a list with one unnamed item
#' containing the column names or ids, and then any of the \code{pca} or
#' \code{pca_center} overrides as named items, e.g. \code{metric =
#' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This
#' exists to allow mixed binary and real-valued data to be included and to have
#' PCA applied to both, but with centering applied only to the real-valued data
#' (it is typical not to apply centering to binary data before PCA is applied).
#' @param scale Scaling to apply to \code{X} if it is a data frame or matrix:
#' \itemize{
#' \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling.
#' \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to
#' zero mean and variance 1.
#' \item{\code{"maxabs"}} Center each column to mean 0, then divide each
#' element by the maximum absolute value over the entire matrix.
#' \item{\code{"range"}} Range scale the entire matrix, so the smallest #' element is 0 and the largest is 1. #' \item{\code{"colrange"}} Scale each column in the range (0,1). #' } #' For \code{method} \code{"umap"}, the default is \code{"none"}. For #' \code{"largevis"}, the default is \code{"maxabs"}. #' @param set_op_mix_ratio Interpolate between (fuzzy) union and intersection as #' the set operation used to combine local fuzzy simplicial sets to obtain a #' global fuzzy simplicial sets. Both fuzzy set operations use the product #' t-norm. The value of this parameter should be between \code{0.0} and #' \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while #' \code{0.0} will use a pure fuzzy intersection. Ignored if #' \code{method = "largevis"} #' @param local_connectivity The local connectivity required -- i.e. the number #' of nearest neighbors that should be assumed to be connected at a local #' level. The higher this value the more connected the manifold becomes #' locally. In practice this should be not more than the local intrinsic #' dimension of the manifold. Ignored if \code{method = "largevis"}. #' @param nn_method Method for finding nearest neighbors. Options are: #' \itemize{ #' \item \code{"fnn"}. Use exact nearest neighbors via the #' \href{https://cran.r-project.org/package=FNN}{FNN} package. #' \item \code{"annoy"} Use approximate nearest neighbors via the #' \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package. #' \item \code{"hnsw"} Use approximate nearest neighbors with the #' Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin, #' 2018) via the #' \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package. #' \code{RcppHNSW} is not a dependency of this package: this option is #' only available if you have installed \code{RcppHNSW} yourself. Also, #' HNSW only supports the following arguments for \code{metric} and #' \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and #' \code{"correlation"}. #' \item \code{"nndescent"} Use approximate nearest neighbors with the #' Nearest Neighbor Descent method (Dong et al., 2011) via the #' \href{https://cran.r-project.org/package=rnndescent}{rnndescent} #' package. \code{rnndescent} is not a dependency of this package: this #' option is only available if you have installed \code{rnndescent} #' yourself. #' } #' By default, if \code{X} has less than 4,096 vertices, the exact nearest #' neighbors are found. Otherwise, approximate nearest neighbors are used. #' You may also pass pre-calculated nearest neighbor data to this argument. It #' must be one of two formats, either a list consisting of two elements: #' \itemize{ #' \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix #' containing the integer indexes of the nearest neighbors in \code{X}. Each #' vertex is considered to be its own nearest neighbor, i.e. #' \code{idx[, 1] == 1:n_vertices}. #' \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix #' containing the distances of the nearest neighbors. #' } #' or a sparse distance matrix of type \code{dgCMatrix}, with dimensions #' \code{n_vertices x n_vertices}. Distances should be arranged by column, #' i.e. a non-zero entry in row \code{j} of the \code{i}th column indicates #' that the \code{j}th observation in \code{X} is a nearest neighbor of the #' \code{i}th observation with the distance given by the value of that #' element. #' The \code{n_neighbors} parameter is ignored when using precomputed #' nearest neighbor data. 
#' @param n_trees Number of trees to build when constructing the nearest
#' neighbor index. The more trees specified, the larger the index, but the
#' better the results. With \code{search_k}, determines the accuracy of the
#' Annoy nearest neighbor search. Only used if the \code{nn_method} is
#' \code{"annoy"}. Sensible values are between \code{10} and \code{100}.
#' @param search_k Number of nodes to search during the neighbor retrieval. The
#' larger \code{search_k}, the more accurate the results, but the longer the
#' search takes. With \code{n_trees}, determines the accuracy of the Annoy
#' nearest neighbor search. Only used if the \code{nn_method} is
#' \code{"annoy"}.
#' @param nn_args A list containing additional arguments to pass to the nearest
#' neighbor method. For \code{nn_method = "annoy"}, you can specify
#' \code{"n_trees"} and \code{"search_k"}, and these will override the
#' \code{n_trees} and \code{search_k} parameters.
#' For \code{nn_method = "hnsw"}, you may specify the following arguments:
#' \itemize{
#' \item \code{M} The maximum number of neighbors to keep for each vertex.
#' Reasonable values are \code{2} to \code{100}. Higher values give better
#' recall at the cost of more memory. Default value is \code{16}.
#' \item \code{ef_construction} A positive integer specifying the size of
#' the dynamic list used during index construction. A higher value will
#' provide better results at the cost of a longer time to build the index.
#' Default is \code{200}.
#' \item \code{ef} A positive integer specifying the size of the dynamic
#' list used during search. This cannot be smaller than \code{n_neighbors}
#' and cannot be higher than the number of items in the index. Default is
#' \code{10}.
#' }
#' For \code{nn_method = "nndescent"}, you may specify the following
#' arguments:
#' \itemize{
#' \item \code{n_trees} The number of trees to use in a random projection
#' forest to initialize the search. A larger number will give more accurate
#' results at the cost of a longer computation time. The default of
#' \code{NULL} means that the number is chosen based on the number of
#' observations in \code{X}.
#' \item \code{max_candidates} The number of potential neighbors to explore
#' per iteration. By default, this is set to \code{n_neighbors} or \code{60},
#' whichever is smaller. A larger number will give more accurate results at
#' the cost of a longer computation time.
#' \item \code{n_iters} The number of iterations to run the search. A larger
#' number will give more accurate results at the cost of a longer computation
#' time. By default, this will be chosen based on the number of observations
#' in \code{X}. You may also need to modify the convergence criterion
#' \code{delta}.
#' \item \code{delta} The minimum relative change in the neighbor graph
#' allowed before early stopping. Should be a value between 0 and 1. The
#' smaller the value, the less progress is required between iterations for
#' the search to continue. Default value of \code{0.001} means that at least
#' 0.1% of the neighbor graph must be updated at each iteration.
#' \item \code{init} How to initialize the nearest neighbor descent. By
#' default this is set to \code{"tree"} and uses a random projection forest.
#' If you set this to \code{"rand"}, then a random selection is used.
Usually
#' this is less accurate than using RP trees, but for high-dimensional cases,
#' there may be little difference in the quality of the initialization and
#' random initialization will be a lot faster. If you set this to
#' \code{"rand"}, then the \code{n_trees} parameter is ignored.
#' \item \code{pruning_degree_multiplier} The maximum number of edges per node
#' to retain in the search graph, relative to \code{n_neighbors}. A larger
#' value will give more accurate results at the cost of a longer computation
#' time. Default is \code{1.5}. This parameter only affects neighbor search
#' when transforming new data with \code{\link{umap_transform}}.
#' \item \code{epsilon} Controls the degree of the back-tracking when
#' traversing the search graph. Setting this to \code{0.0} will do a greedy
#' search with no back-tracking. A larger value will give more accurate
#' results at the cost of a longer computation time. Default is \code{0.1}.
#' This parameter only affects neighbor search when transforming new data with
#' \code{\link{umap_transform}}.
#' \item \code{max_search_fraction} Specifies the maximum fraction of the
#' search graph to traverse. By default, this is set to \code{1.0}, so the
#' entire graph (i.e. all items in \code{X}) may be visited. You may want to
#' set this to a smaller value if you have a very large dataset (in
#' conjunction with \code{epsilon}) to avoid an inefficient exhaustive search
#' of the data in \code{X}. This parameter only affects neighbor search when
#' transforming new data with \code{\link{umap_transform}}.
#' }
#' @param perplexity Used only if \code{method = "largevis"}. Controls the size
#' of the local neighborhood used for manifold approximation. Should be a
#' value between 1 and one less than the number of items in \code{X}. If
#' specified, you should \emph{not} specify a value for \code{n_neighbors}
#' unless you know what you are doing.
#' @param kernel Used only if \code{method = "largevis"}. Type of kernel
#' function to create input similarities. Can be one of \code{"gauss"} (the
#' default) or \code{"knn"}. \code{"gauss"} uses the usual Gaussian weighted
#' similarities. \code{"knn"} assigns equal similarities to every edge in the
#' nearest neighbor graph, and zero otherwise, using \code{perplexity} nearest
#' neighbors. The \code{n_neighbors} parameter is ignored in this case.
#' @param method How to generate the similarities between items. One of:
#' \itemize{
#' \item \code{"umap"} The UMAP method of McInnes et al. (2018).
#' \item \code{"largevis"} The LargeVis method of Tang et al. (2016).
#' }
#' @param y Optional target data to add supervised or semi-supervised weighting
#' to the similarity graph. Can be a vector, matrix or data frame. Use the
#' \code{target_metric} parameter to specify the metrics to use, using the
#' same syntax as \code{metric}. Usually either a single numeric or factor
#' column is used, but more complex formats are possible. The following types
#' are allowed:
#' \itemize{
#' \item Factor columns with the same length as \code{X}. \code{NA} is
#' allowed for any observation with an unknown level, in which case
#' UMAP operates as a form of semi-supervised learning. Each column is
#' treated separately.
#' \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the
#' parameter \code{target_n_neighbors} to set the number of neighbors used
#' with \code{y}. If unset, \code{n_neighbors} is used.
Unlike factors,
#' numeric columns are grouped into one block unless \code{target_metric}
#' specifies otherwise. For example, if you wish columns \code{a} and
#' \code{b} to be treated separately, specify
#' \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise,
#' the data will be effectively treated as a matrix with two columns.
#' \item Nearest neighbor data, consisting of a list of two matrices,
#' \code{idx} and \code{dist}. These represent the precalculated nearest
#' neighbor indices and distances, respectively. This
#' is the same format as that expected for precalculated data in
#' \code{nn_method}. This format assumes that the underlying data was a
#' numeric vector. Any user-supplied value of the \code{target_n_neighbors}
#' parameter is ignored in this case, because the number of columns in
#' the matrices is used for the value. Multiple nearest neighbor data using
#' different metrics can be supplied by passing a list of these lists.
#' }
#' Unlike \code{X}, all factor columns included in \code{y} are automatically
#' used. This parameter is ignored if \code{method = "largevis"}.
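#'
#' For example, adding a factor column as a supervised target might look
#' like this (a sketch; only applies when \code{method = "umap"}):
#' \preformatted{
#' sim <- similarity_graph(iris[, -5], y = iris$Species)
#' }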
#' @param target_n_neighbors Number of nearest neighbors to use to construct the
#' target simplicial set. Default value is \code{n_neighbors}. Applies only if
#' \code{y} is non-\code{NULL} and \code{numeric}. This parameter is ignored
#' if \code{method = "largevis"}.
#' @param target_metric The metric used to measure distance for \code{y} if
#' using supervised dimension reduction. Used only if \code{y} is numeric.
#' This parameter is ignored if \code{method = "largevis"}.
#' @param target_weight Weighting factor between data topology and target
#' topology. A value of 0.0 weights entirely on data, a value of 1.0 weights
#' entirely on target. The default of 0.5 balances the weighting equally
#' between data and target. Only applies if \code{y} is non-\code{NULL}. This
#' parameter is ignored if \code{method = "largevis"}.
#' @param pca If set to a positive integer value, reduce data to this number of
#' columns using PCA. Not applied if the distance \code{metric} is
#' \code{"hamming"}, or if the dimensions of the data do not exceed the number
#' specified (i.e. the number of rows and the number of columns must both be
#' larger than the value of this parameter). If you have > 100 columns in a
#' data frame or matrix, reducing the number of columns in this way may
#' substantially increase the performance of the nearest neighbor search at
#' the cost of a potential decrease in accuracy. In many t-SNE applications, a
#' value of 50 is recommended, although there's no guarantee that this is
#' appropriate for all settings.
#' @param pca_center If \code{TRUE}, center the columns of \code{X} before
#' carrying out PCA. For binary data, it's recommended to set this to
#' \code{FALSE}.
#' @param pca_method Method to carry out any PCA dimensionality reduction when
#' the \code{pca} parameter is specified. Allowed values are:
#' \itemize{
#' \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the
#' \href{https://cran.r-project.org/package=irlba}{irlba} package.
#' \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from
#' the \href{https://cran.r-project.org/package=irlba}{irlba} package.
#' This is likely to give much faster but potentially less accurate results
#' than using \code{"irlba"}. For the purposes of nearest neighbor
#' calculation and coordinate initialization, any loss of accuracy doesn't
#' seem to matter much.
#' \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}}
#' from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr}
#' package. The SVD methods used in \code{bigstatsr} may be faster on
#' systems without access to efficient linear algebra libraries (e.g.
#' Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of
#' uwot: if you choose to use this package for PCA, you \emph{must} install
#' it yourself.
#' \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is
#' likely to be slow for all but the smallest datasets.
#' \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than
#' 50% of the full set of singular vectors would be calculated, in which
#' case \code{"svd"} is used.
#' }
#' @param ret_extra A vector indicating what extra data to return. May contain
#' any combination of the following strings:
#' \itemize{
#' \item \code{"nn"} nearest neighbor data that can be used as input to
#' \code{nn_method} to avoid the overhead of repeatedly calculating the
#' nearest neighbors when manipulating unrelated parameters. See the
#' "Value" section for the names of the list items. Note that the nearest
#' neighbors could be sensitive to data scaling, so be wary of reusing
#' nearest neighbor data if modifying the \code{scale} parameter.
#' \item \code{"sigma"} the normalization value for each observation in the
#' dataset when constructing the smoothed distances to each of its
#' neighbors. This gives some sense of the local density of each
#' observation in the high dimensional space: higher values of
#' \code{sigma} indicate a higher dispersion or lower density.
#' }
#' @param n_threads Number of threads to use. Default is half the number of
#' concurrent threads supported by the system. For nearest neighbor search,
#' only applies if \code{nn_method = "annoy"}. If \code{n_threads > 1}, then
#' the Annoy index will be temporarily written to disk in the location
#' determined by \code{\link[base]{tempfile}}.
#' @param grain_size The minimum amount of work to do on each thread. If this
#' value is set high enough, then fewer than \code{n_threads} threads will be
#' used for processing, which might give a performance improvement if the
#' overhead of thread management and context switching was outweighing the
#' improvement due to concurrent processing. This should be left at default
#' (\code{1}) and work will be spread evenly over all the threads specified.
#' @param tmpdir Temporary directory to store nearest neighbor indexes during
#' nearest neighbor search. Default is \code{\link{tempdir}}. The index is
#' only written to disk if \code{n_threads > 1} and
#' \code{nn_method = "annoy"}; otherwise, this parameter is ignored.
#' @param verbose If \code{TRUE}, log details to the console.
#' @param binary_edge_weights If \code{TRUE} then edge weights of the returned
#' graph are binary (0/1) rather than reflecting the degree of similarity.
#' @return A sparse symmetrized matrix of the similarities between the items in
#' \code{X} or if \code{nn_method} contains pre-computed nearest neighbor
#' data, the items in \code{nn_method}. Because of the symmetrization, there
#' may be more non-zero items in each column than the specified value of
#' \code{n_neighbors} (or pre-computed neighbors in \code{nn_method}).
#' If \code{ret_extra} is specified then the return value will be a list
#' containing:
#' \itemize{
#' \item \code{similarity_graph} the similarity graph as a sparse matrix
#' as described above.
#' \item \code{nn} (if \code{ret_extra} contained \code{"nn"}) the nearest #' neighbor data as a list called \code{nn}. This contains one list for each #' \code{metric} calculated, itself containing a matrix \code{idx} with the #' integer ids of the neighbors; and a matrix \code{dist} with the #' distances. The \code{nn} list (or a sub-list) can be used as input to the #' \code{nn_method} parameter. #' \item \code{sigma} (if \code{ret_extra} contains \code{"sigma"}), #' a vector of calibrated parameters, one for each item in the input data, #' reflecting the local data density for that item. The exact definition of #' the values depends on the choice of the \code{method} parameter. #' \item \code{rho} (if \code{ret_extra} contains \code{"sigma"}), a #' vector containing the largest distance to the locally connected neighbors #' of each item in the input data. This will exist only if #' \code{method = "umap"}. #' \item \code{localr} (if \code{ret_extra} contains \code{"localr"}) a #' vector of the estimated local radii, the sum of \code{"sigma"} and #' \code{"rho"}. This will exist only if \code{method = "umap"}. #' } #' @examples #' #' iris30 <- iris[c(1:10, 51:60, 101:110), ] #' #' # return a 30 x 30 sparse matrix with similarity data based on 10 nearest #' # neighbors per item #' iris30_sim_graph <- similarity_graph(iris30, n_neighbors = 10) #' #' # Default is to use the UMAP method of calculating similarities, but LargeVis #' # is also available: for that method, use perplexity instead of n_neighbors #' # to control neighborhood size. Use ret_extra = "nn" to return nearest #' # neighbor data as well as the similarity graph. Return value is a list #' # containing 'similarity_graph' and 'nn' items. #' iris30_lv_graph <- similarity_graph(iris30, #' perplexity = 10, #' method = "largevis", ret_extra = "nn" #' ) #' # If you have the neighbor information you don't need the original data #' iris30_lv_graph_nn <- similarity_graph( #' nn_method = iris30_lv_graph$nn, #' perplexity = 10, method = "largevis" #' ) #' all(iris30_lv_graph_nn == iris30_lv_graph$similarity_graph) #' #' @references #' Dong, W., Moses, C., & Li, K. (2011, March). #' Efficient k-nearest neighbor graph construction for generic similarity measures. #' In \emph{Proceedings of the 20th international conference on World Wide Web} #' (pp. 577-586). #' ACM. #' \doi{10.1145/1963405.1963487}. #' #' Malkov, Y. A., & Yashunin, D. A. (2018). #' Efficient and robust approximate nearest neighbor search using hierarchical #' navigable small world graphs. #' \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836. #' #' McInnes, L., Healy, J., & Melville, J. (2018). #' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction #' \emph{arXiv preprint} \emph{arXiv}:1802.03426. #' \url{https://arxiv.org/abs/1802.03426} #' #' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). #' Visualizing large-scale and high-dimensional data. #' In \emph{Proceedings of the 25th International Conference on World Wide Web} #' (pp. 287-297). #' International World Wide Web Conferences Steering Committee. 
#' \url{https://arxiv.org/abs/1602.00370} #' #' @export similarity_graph <- function(X = NULL, n_neighbors = NULL, metric = "euclidean", scale = NULL, set_op_mix_ratio = 1.0, local_connectivity = 1.0, nn_method = NULL, n_trees = 50, search_k = 2 * n_neighbors * n_trees, perplexity = 50, method = "umap", y = NULL, target_n_neighbors = n_neighbors, target_metric = "euclidean", target_weight = 0.5, pca = NULL, pca_center = TRUE, ret_extra = c(), n_threads = NULL, grain_size = 1, kernel = "gauss", tmpdir = tempdir(), verbose = getOption("verbose", TRUE), pca_method = NULL, binary_edge_weights = FALSE, nn_args = list()) { if (is.null(n_neighbors)) { if (method == "largevis") { n_neighbors <- perplexity * 3 scale <- "maxabs" } else { n_neighbors <- 15 scale <- FALSE } } uwot_res <- uwot( X = X, n_neighbors = n_neighbors, metric = metric, n_epochs = 0, scale = scale, init = NULL, set_op_mix_ratio = set_op_mix_ratio, local_connectivity = local_connectivity, nn_method = nn_method, n_trees = n_trees, search_k = search_k, method = method, n_threads = n_threads, grain_size = grain_size, kernel = kernel, perplexity = perplexity, y = y, target_n_neighbors = target_n_neighbors, target_weight = target_weight, target_metric = target_metric, pca = pca, pca_center = pca_center, pca_method = pca_method, ret_model = FALSE, ret_nn = "nn" %in% ret_extra, ret_fgraph = TRUE, ret_sigma = "sigma" %in% ret_extra, ret_localr = "localr" %in% ret_extra, binary_edge_weights = binary_edge_weights, tmpdir = tmpdir, verbose = verbose, nn_args = nn_args ) res <- list() for (name in names(uwot_res)) { if (name == "embedding") { # embedding will be NULL so remove it next } if (name == "P" || name == "fgraph") { res$similarity_graph <- uwot_res[[name]] } else { res[[name]] <- uwot_res[[name]] } } if (length(names(res)) == 1 && !is.null(res$similarity_graph)) { # return just the similarity graph if no extras were requested res <- res$similarity_graph } res } #' Optimize Graph Layout #' #' Carry out dimensionality reduction on an input graph, where the distances in #' the low dimensional space attempt to reproduce the neighbor relations in the #' input data. By default, the cost function used to optimize the output #' coordinates is that of the Uniform Manifold Approximation and Projection (UMAP) #' method (McInnes et al., 2018), but the approach from LargeVis (Tang et al., #' 2016) can also be used. This function can be used to produce a low #' dimensional representation of the graph produced by #' \code{\link{similarity_graph}}. #' #' @param graph A sparse, symmetric N x N weighted adjacency matrix #' representing a graph. Non-zero entries indicate an edge between two nodes #' with a given edge weight. There can be a varying number of non-zero entries #' in each row/column. #' @param X Optional input data. Used only for PCA-based initialization. #' @param n_components The dimension of the space to embed into. This defaults #' to \code{2} to provide easy visualization, but can reasonably be set to any #' integer value in the range \code{2} to \code{100}. #' @param n_epochs Number of epochs to use during the optimization of the #' embedded coordinates. By default, this value is set to \code{500} for #' datasets containing 10,000 vertices or fewer, and \code{200} otherwise. #' If \code{n_epochs = 0}, then coordinates determined by \code{"init"} will #' be returned. #' @param learning_rate Initial learning rate used in optimization of the #' coordinates. 
#' @param init Type of initialization for the coordinates. Options are: #' \itemize{ #' \item \code{"spectral"} Spectral embedding using the normalized Laplacian #' of the fuzzy 1-skeleton, with Gaussian noise added. #' \item \code{"normlaplacian"}. Spectral embedding using the normalized #' Laplacian of the fuzzy 1-skeleton, without noise. #' \item \code{"random"}. Coordinates assigned using a uniform random #' distribution between -10 and 10. #' \item \code{"lvrandom"}. Coordinates assigned using a Gaussian #' distribution with standard deviation 1e-4, as used in LargeVis #' (Tang et al., 2016) and t-SNE. #' \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap. #' \item \code{"pca"}. The first two principal components from PCA of #' \code{X} if \code{X} is a data frame, and from a 2-dimensional classical #' MDS if \code{X} is of class \code{"dist"}. #' \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled #' so the standard deviation is 1e-4, to give a distribution similar to that #' used in t-SNE. This is an alias for \code{init = "pca", init_sdev = #' 1e-4}. #' \item \code{"agspectral"} An "approximate global" modification of #' \code{"spectral"} which sets all edges in the graph to a value of 1, and then #' sets a random number of edges (\code{negative_sample_rate} edges per #' vertex) to 0.1, to approximate the effect of non-local affinities. #' \item A matrix of initial coordinates. #' } #' For spectral initializations (\code{"spectral"}, \code{"normlaplacian"}, #' \code{"laplacian"}, \code{"agspectral"}), if more than one connected #' component is identified, no spectral initialization is attempted. Instead #' a PCA-based initialization is attempted. If \code{verbose = TRUE} the #' number of connected components is logged to the console. The existence of #' multiple connected components implies that a global view of the data cannot #' be attained with this initialization. Increasing the value of #' \code{n_neighbors} may help. #' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized #' coordinates (including any user-supplied matrix) to this standard #' deviation. By default no scaling is carried out, except when \code{init = #' "spca"}, in which case the value is \code{0.0001}. Scaling the input may #' help if the unscaled versions result in initial coordinates with large #' inter-point distances or outliers. This usually results in small gradients #' during optimization and very little progress being made to the layout. #' Shrinking the initial embedding by rescaling can help under these #' circumstances. Scaling the result of \code{init = "pca"} is usually #' recommended, hence \code{init = "spca"} exists as an alias for \code{init = "pca", #' init_sdev = 1e-4}, but for the spectral initializations the scaled versions #' usually aren't necessary unless you are using a large value of #' \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). For #' compatibility with recent versions of the Python UMAP package, if you are #' using \code{init = "spectral"}, then you should also set #' \code{init_sdev = "range"}, which will range scale each of the columns #' containing the initial data between 0-10. This is not set by default to #' maintain backwards compatibility with previous versions of uwot. #' @param spread The effective scale of embedded points. In combination with #' \code{min_dist}, this determines how clustered/clumped the embedded points #' are. 
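#'   As an illustrative sketch of the interplay with \code{min_dist} (the
#'   values here are not prescriptive, and \code{graph} is a similarity graph
#'   as described above):
#'   \preformatted{
#'   # tighter, more clumped embedding
#'   emb_tight <- optimize_graph_layout(graph, spread = 1, min_dist = 0.001)
#'   # looser, more evenly dispersed embedding
#'   emb_loose <- optimize_graph_layout(graph, spread = 5, min_dist = 0.5)
#'   }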
#' @param min_dist The effective minimum distance between embedded points. #' Smaller values will result in a more clustered/clumped embedding where #' nearby points on the manifold are drawn closer together, while larger #' values will result in a more even dispersal of points. The value should be #' set relative to the \code{spread} value, which determines the scale at #' which embedded points will be spread out. #' @param repulsion_strength Weighting applied to negative samples in low #' dimensional embedding optimization. Values higher than one will result in #' greater weight being given to negative samples. #' @param negative_sample_rate The number of negative edge/1-simplex samples to #' use per positive edge/1-simplex sample in optimizing the low dimensional #' embedding. #' @param a More specific parameters controlling the embedding. If \code{NULL} #' these values are set automatically as determined by \code{min_dist} and #' \code{spread}. #' @param b More specific parameters controlling the embedding. If \code{NULL} #' these values are set automatically as determined by \code{min_dist} and #' \code{spread}. #' @param approx_pow If \code{TRUE}, use an approximation to the power function #' in the UMAP gradient, from #' \url{https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/}. #' @param method Cost function to optimize. One of: #' \itemize{ #' \item{\code{"umap"}}. The UMAP method of McInnes and co-workers (2018). #' \item{\code{"tumap"}}. UMAP with the \code{a} and \code{b} parameters fixed #' to 1. #' \item{\code{"largevis"}}. The LargeVis method of Tang and co-workers (2016). #' } #' @param pca_method Method to carry out any PCA dimensionality reduction when #' the \code{pca} parameter is specified. Allowed values are: #' \itemize{ #' \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the #' \href{https://cran.r-project.org/package=irlba}{irlba} package. #' \item{\code{"svdr"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from #' the \href{https://cran.r-project.org/package=irlba}{irlba} package. #' This is likely to give much faster but potentially less accurate results #' than using \code{"irlba"}. For the purposes of nearest neighbor #' calculation and coordinate initialization, any loss of accuracy doesn't #' seem to matter much. #' \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}} #' from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr} #' package. The SVD methods used in \code{bigstatsr} may be faster on #' systems without access to efficient linear algebra libraries (e.g. #' Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of #' uwot: if you choose to use this package for PCA, you \emph{must} install #' it yourself. #' \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is #' likely to be slow for all but the smallest datasets. #' \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than #' 50\% of the full set of singular vectors would be calculated, in which #' case \code{"svd"} is used. #' } #' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill, #' 2014) during optimization. Otherwise, use the faster (but probably less #' statistically good) Tausworthe "taus88" generator. The default is #' \code{TRUE}. This parameter has been superseded by \code{rng_type} -- if #' both are set, \code{rng_type} takes precedence. #' @param rng_type The type of random number generator to use during #' optimization. 
One of: #' \itemize{ #' \item{\code{"pcg"}}. Use the PCG random number generator (O'Neill, 2014). #' \item{\code{"tausworthe"}}. Use the Tausworthe "taus88" generator. #' \item{\code{"deterministic"}}. Use a deterministic number generator. This #' isn't actually random, but may provide enough variation in the negative #' sampling to give a good embedding and can provide a noticeable speed-up. #' } #' For backwards compatibility, by default this is unset and the choice of #' \code{pcg_rand} is used (making "pcg" the effective default). #' @param fast_sgd If \code{TRUE}, then the following combination of parameters #' is set: \code{pcg_rand = FALSE}, \code{n_sgd_threads = "auto"} and #' \code{approx_pow = TRUE}. The default is \code{FALSE}. Setting this to #' \code{TRUE} will speed up the stochastic optimization phase, but give a #' potentially less accurate embedding, which will not be exactly #' reproducible even with a fixed seed. For visualization, \code{fast_sgd = #' TRUE} will give perfectly good results. For more generic dimensionality #' reduction, it's safer to leave \code{fast_sgd = FALSE}. If \code{fast_sgd = #' TRUE}, then user-supplied values of \code{pcg_rand}, \code{n_sgd_threads}, #' and \code{approx_pow} are ignored. #' @param batch If \code{TRUE}, then embedding coordinates are updated at the #' end of each epoch rather than during the epoch. In batch mode, results are #' reproducible with a fixed random seed even with \code{n_sgd_threads > 1}, #' at the cost of a slightly higher memory use. You may also have to modify #' \code{learning_rate} and increase \code{n_epochs}, so whether this provides #' a speed increase over the single-threaded optimization is likely to be #' dataset and hardware-dependent. #' @param n_sgd_threads Number of threads to use during stochastic gradient #' descent. If set to > 1, then be aware that if \code{batch = FALSE}, results #' will \emph{not} be reproducible, even if \code{set.seed} is called with a #' fixed seed before running. If set to \code{"auto"} then half the number of #' concurrent threads supported by the system will be used. #' @param grain_size The minimum amount of work to do on each thread. If this #' value is set high enough, then fewer than \code{n_threads} or #' \code{n_sgd_threads} threads will be used for processing, which might give a #' performance improvement if the overhead of thread management and context #' switching is outweighing the improvement due to concurrent processing. #' This should usually be left at the default (\code{1}), in which case work #' will be spread evenly over all the threads specified. #' @param verbose If \code{TRUE}, log details to the console. #' @param opt_args A list of optimizer parameters, used when #' \code{batch = TRUE}. The default optimization method used is Adam (Kingma #' and Ba, 2014). #' \itemize{ #' \item \code{method} The optimization method to use. Either \code{"adam"} #' or \code{"sgd"} (stochastic gradient descent). Default: \code{"adam"}. #' \item \code{beta1} (Adam only). The weighting parameter for the #' exponential moving average of the first moment estimator. Effectively the #' momentum parameter. Should be a floating point value between 0 and 1. #' Higher values can smooth oscillatory updates in poorly-conditioned #' situations and may allow for a larger \code{learning_rate} to be #' specified, but values that are too high can cause divergence. Default: \code{0.5}. #' \item \code{beta2} (Adam only). The weighting parameter for the #' exponential moving average of the uncentered second moment estimator. 
#' Should be a floating point value between 0 and 1. Controls the degree of #' adaptivity in the step-size. Higher values put more weight on previous #' time steps. Default: \code{0.9}. #' \item \code{eps} (Adam only). Intended to be a small value to prevent #' division by zero, but in practice can also affect convergence due to its #' interaction with \code{beta2}. Higher values reduce the effect of the #' step-size adaptivity and bring the behavior closer to stochastic gradient #' descent with momentum. Typical values are between 1e-8 and 1e-3. Default: #' \code{1e-7}. #' \item \code{alpha} The initial learning rate. Default: the value of the #' \code{learning_rate} parameter. #' } #' @param epoch_callback A function which will be invoked at the end of every #' epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where: #' \itemize{ #' \item \code{epoch} The current epoch number (between \code{1} and #' \code{n_epochs}). #' \item \code{n_epochs} Number of epochs to use during the optimization of #' the embedded coordinates. #' \item \code{coords} The embedded coordinates as of the end of the current #' epoch, as a matrix with dimensions (N, \code{n_components}). #' } #' @param binary_edge_weights If \code{TRUE} then edge weights in the input #' graph are treated as binary (0/1) rather than real valued. #' @return A matrix of optimized coordinates. #' #' @examples #' #' iris30 <- iris[c(1:10, 51:60, 101:110), ] #' #' # return a 30 x 30 sparse matrix with similarity data based on 10 nearest #' # neighbors per item #' iris30_sim_graph <- similarity_graph(iris30, n_neighbors = 10) #' # produce 2D coordinates replicating the neighbor relations in the similarity #' # graph #' set.seed(42) #' iris30_opt <- optimize_graph_layout(iris30_sim_graph, X = iris30) #' #' # the above two steps are the same as: #' # set.seed(42); iris_umap <- umap(iris30, n_neighbors = 10) #' #' @references #' Kingma, D. P., & Ba, J. (2014). #' Adam: A method for stochastic optimization. #' \emph{arXiv preprint} \emph{arXiv}:1412.6980. #' \url{https://arxiv.org/abs/1412.6980} #' #' McInnes, L., Healy, J., & Melville, J. (2018). #' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction #' \emph{arXiv preprint} \emph{arXiv}:1802.03426. #' \url{https://arxiv.org/abs/1802.03426} #' #' O'Neill, M. E. (2014). #' \emph{PCG: A family of simple fast space-efficient statistically good #' algorithms for random number generation} #' (Report No. HMC-CS-2014-0905). Harvey Mudd College. #' #' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). #' Visualizing large-scale and high-dimensional data. #' In \emph{Proceedings of the 25th International Conference on World Wide Web} #' (pp. 287-297). #' International World Wide Web Conferences Steering Committee. 
#' \url{https://arxiv.org/abs/1602.00370} #' #' @export optimize_graph_layout <- function(graph, X = NULL, n_components = 2, n_epochs = NULL, learning_rate = 1, init = "spectral", init_sdev = NULL, spread = 1, min_dist = 0.01, repulsion_strength = 1.0, negative_sample_rate = 5.0, a = NULL, b = NULL, method = "umap", approx_pow = FALSE, pcg_rand = TRUE, fast_sgd = FALSE, n_sgd_threads = 0, grain_size = 1, verbose = getOption("verbose", TRUE), batch = FALSE, opt_args = NULL, epoch_callback = NULL, pca_method = NULL, binary_edge_weights = FALSE, rng_type = NULL) { if (!is_sparse_matrix(graph)) { stop("graph should be a sparse matrix") } if (nrow(graph) != ncol(graph)) { stop("graph should be a square matrix") } if (!Matrix::isSymmetric(graph)) { stop("graph should be symmetric") } if (!all(diff(graph@p) > 0)) { stop("All items must have at least one neighbor similarity defined") } # Just do things the UMAP way or we will have a very slow largevis # optimization if (is.null(n_epochs)) { n_vertices <- nrow(graph) if (n_vertices <= 10000) { n_epochs <- 500 } else { n_epochs <- 200 } } uwot( X = X, nn_method = graph, is_similarity_graph = TRUE, n_components = n_components, n_epochs = n_epochs, alpha = learning_rate, init = init, init_sdev = init_sdev, spread = spread, min_dist = min_dist, gamma = repulsion_strength, negative_sample_rate = negative_sample_rate, a = a, b = b, method = method, approx_pow = approx_pow, pcg_rand = pcg_rand, fast_sgd = fast_sgd, n_sgd_threads = n_sgd_threads, grain_size = grain_size, verbose = verbose, batch = batch, opt_args = opt_args, epoch_callback = epoch_callback, pca_method = pca_method, rng_type = rng_type ) } #' Merge Similarity Graph by Simplicial Set Union #' #' Combine two similarity graphs by treating them as fuzzy topological sets and #' forming the union. #' #' @param x A sparse matrix representing the first similarity graph in the union #' operation. #' @param y A sparse matrix representing the second similarity graph in the #' union operation. #' @param n_threads Number of threads to use when resetting the local metric. #' Default is half the number of concurrent threads supported by the system. #' @param verbose If \code{TRUE}, log progress to the console. #' @returns A sparse matrix containing the union of \code{x} and \code{y}. #' @examples #' #' # Form two different "views" of the same data #' iris30 <- iris[c(1:10, 51:60, 101:110), ] #' iris_sg12 <- similarity_graph(iris30[, 1:2], n_neighbors = 5) #' iris_sg34 <- similarity_graph(iris30[, 3:4], n_neighbors = 5) #' #' # Combine the two representations into one #' iris_combined <- simplicial_set_union(iris_sg12, iris_sg34) #' #' # Optimize the layout based on the combined view #' iris_combined_umap <- optimize_graph_layout(iris_combined, n_epochs = 100) #' @export simplicial_set_union <- function(x, y, n_threads = NULL, verbose = FALSE) { if (!is_sparse_matrix(x)) { stop("similarity graph x must be a sparse matrix") } if (!is_sparse_matrix(y)) { stop("similarity graph y must be a sparse matrix") } if (!all(dim(x) == dim(y))) { stop("x and y must have identical dimensions") } z <- methods::as(x + y, "TsparseMatrix") z@x <- general_sset_union_cpp( x@p, x@i, x@x, y@p, y@i, y@x, z@i, z@j, z@x ) z <- Matrix::drop0(z) reset_local_connectivity( z, reset_local_metric = TRUE, n_threads = n_threads, verbose = verbose ) } #' Merge Similarity Graph by Simplicial Set Intersection #' #' Combine two similarity graphs by treating them as fuzzy topological sets and #' forming the intersection. 
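#'
#' As a hedged sketch of how the \code{weight} parameter skews the result
#' (reusing the graphs built in the examples below; the weight values are
#' illustrative only):
#' \preformatted{
#' # lean towards the first view
#' g_x <- simplicial_set_intersect(iris_sg12, iris_sg34, weight = 0.2)
#' # lean towards the second view
#' g_y <- simplicial_set_intersect(iris_sg12, iris_sg34, weight = 0.8)
#' }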
#' #' @param x A sparse matrix representing the first similarity graph in the #' intersection operation. #' @param y A sparse matrix representing the second similarity graph in the #' intersection operation. #' @param weight A value between \code{0} and \code{1}, controlling the relative #' influence of \code{x} and \code{y} in the intersection. Default #' (\code{0.5}) gives equal influence. Values smaller than \code{0.5} put more #' weight on \code{x}. Values greater than \code{0.5} put more weight on #' \code{y}. #' @param n_threads Number of threads to use when resetting the local metric. #' Default is half the number of concurrent threads supported by the system. #' @param verbose If \code{TRUE}, log progress to the console. #' @returns A sparse matrix containing the intersection of \code{x} and #' \code{y}. #' @examples #' #' # Form two different "views" of the same data #' iris30 <- iris[c(1:10, 51:60, 101:110), ] #' iris_sg12 <- similarity_graph(iris30[, 1:2], n_neighbors = 5) #' iris_sg34 <- similarity_graph(iris30[, 3:4], n_neighbors = 5) #' #' # Combine the two representations into one #' iris_combined <- simplicial_set_intersect(iris_sg12, iris_sg34) #' #' # Optimize the layout based on the combined view #' iris_combined_umap <- optimize_graph_layout(iris_combined, n_epochs = 100) #' @export simplicial_set_intersect <- function(x, y, weight = 0.5, n_threads = NULL, verbose = FALSE) { if (weight < 0 || weight > 1) { stop("weight must be between 0 and 1") } if (!is_sparse_matrix(x)) { stop("similarity graph x must be a sparse matrix") } if (!is_sparse_matrix(y)) { stop("similarity graph y must be a sparse matrix") } if (!all(dim(x) == dim(y))) { stop("x and y must have identical dimensions") } set_intersect( A = x, B = y, weight = weight, reset_connectivity = TRUE, reset_local_metric = TRUE, n_threads = n_threads, verbose = verbose ) } # Function that does all the real work uwot <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", n_epochs = NULL, alpha = 1, scale = FALSE, init = "spectral", init_sdev = NULL, spread = 1, min_dist = 0.01, set_op_mix_ratio = 1.0, local_connectivity = 1.0, bandwidth = 1.0, gamma = 1.0, negative_sample_rate = 5.0, a = NULL, b = NULL, nn_method = NULL, n_trees = 50, search_k = 2 * n_neighbors * n_trees, method = "umap", perplexity = 50, approx_pow = FALSE, y = NULL, target_n_neighbors = n_neighbors, target_metric = "euclidean", target_weight = 0.5, n_threads = NULL, n_sgd_threads = 0, grain_size = 1, kernel = "gauss", ret_model = FALSE, ret_nn = FALSE, ret_fgraph = FALSE, ret_sigma = FALSE, ret_localr = FALSE, pca = NULL, pca_center = TRUE, pca_method = NULL, pcg_rand = TRUE, fast_sgd = FALSE, batch = FALSE, opt_args = NULL, tmpdir = tempdir(), verbose = getOption("verbose", TRUE), epoch_callback = NULL, binary_edge_weights = FALSE, dens_scale = NULL, is_similarity_graph = FALSE, seed = NULL, nn_args = list(), sparse_X_is_distance_matrix = TRUE, rng_type = "pcg") { if (is.null(n_threads)) { n_threads <- default_num_threads() } method <- match.arg(tolower(method), c("umap", "tumap", "largevis")) if (method == "umap") { if (is.null(a) || is.null(b)) { ab_res <- find_ab_params(spread = spread, min_dist = min_dist) a <- ab_res[1] b <- ab_res[2] tsmessage("UMAP embedding parameters a = ", formatC(a), " b = ", formatC(b)) } else { # set min_dist and spread to NULL so if ret_model = TRUE, their default # values are not mistaken for having been used for anything min_dist <- NULL spread <- NULL } } if (n_neighbors < 2) { stop("n_neighbors must be >= 2") 
} if (set_op_mix_ratio < 0.0 || set_op_mix_ratio > 1.0) { stop("set_op_mix_ratio must be between 0.0 and 1.0") } if (local_connectivity < 1.0) { stop("local_connectivity cannot be < 1.0") } if (!is.null(y) && is.numeric(y) && any(is.na(y))) { stop("numeric y cannot contain NA") } if (!is.numeric(n_components) || n_components < 1) { stop("'n_components' must be a positive integer") } if (!is.null(pca)) { if (!is.numeric(pca) || pca < 1) { stop("'pca' must be a positive integer") } if (pca < n_components) { stop("'pca' must be >= n_components") } if (pca > min(nrow(X), ncol(X))) { tsmessage("pca = ", pca, " is greater than the number of rows or columns in X, ", "ignoring pca") pca <- NULL } } if (is.null(pca_method)) { pca_method <- "auto" } pca_method <- match.arg(pca_method, choices = c("irlba", "svdr", "bigstatsr", "svd", "auto") ) if (fast_sgd) { n_sgd_threads <- "auto" pcg_rand <- FALSE approx_pow <- TRUE } if (n_threads < 0) { stop("n_threads cannot be < 0") } if (n_threads %% 1 != 0) { n_threads <- round(n_threads) tsmessage("Non-integer 'n_threads' provided. Setting to ", n_threads) } if (n_sgd_threads == "auto") { n_sgd_threads <- n_threads } if (n_sgd_threads < 0) { stop("n_sgd_threads cannot be < 0") } if (n_sgd_threads %% 1 != 0) { n_sgd_threads <- round(n_sgd_threads) tsmessage("Non-integer 'n_sgd_threads' provided. Setting to ", n_sgd_threads) } if (!is.null(dens_scale) && approx_pow) { warning("approx_pow parameter is ignored when using dens_scale") approx_pow <- FALSE } # 110: for more consistent reproducibility set a user-supplied seed if (!is.null(seed)) { tsmessage("Setting random seed ", seed) set.seed(seed) } if (is.null(rng_type)) { rng_type <- ifelse(pcg_rand, "pcg", "tausworthe") } rng_type <- match.arg(rng_type, c("pcg", "tausworthe", "deterministic")) if (is.character(nn_method) && nn_method == "hnsw") { if (!is_installed("RcppHNSW")) { stop("RcppHNSW is required for nn_method = 'hnsw', please install it") } if (!is_ok_hnsw_metric(metric)) { stop( "bad metric: hnsw only supports 'euclidean', 'cosine' or ", "'correlation' metrics" ) } if (!is_ok_hnsw_metric(target_metric)) { stop( "bad target_metric: hnsw only supports 'euclidean', 'cosine' or ", "'correlation' metrics" ) } } if (is.character(nn_method) && nn_method == "nndescent") { if (!is_installed("rnndescent")) { stop("rnndescent is required for nn_method = 'nndescent',", " please install it") } } ret_extra <- ret_model || ret_nn || ret_fgraph || ret_sigma || ret_localr # Store categorical columns to be used to generate the graph Xcat <- NULL # number of original columns in data frame (or matrix) # will be used only if using df or matrix and ret_model = TRUE norig_col <- NULL # row names for the input data, which we will apply to the embedding if # needed Xnames <- NULL num_precomputed_nns <- 0 if (is.null(X)) { if (!nn_is_precomputed(nn_method)) { stop("If X is NULL, must provide NN data in nn_method") } if (is.character(init) && tolower(init) %in% c("spca", "pca")) { stop("init = 'pca' and 'spca' can't be used with X = NULL") } if (length(nn_method) == 0) { stop("Incorrect format for precalculated neighbor data") } n_vertices <- x2nv(nn_method) stopifnot(n_vertices > 0) num_precomputed_nns <- check_graph_list(nn_method, n_vertices, bipartite = FALSE ) Xnames <- nn_graph_row_names_list(nn_method) } else if (inherits(X, "dist")) { if (ret_model) { stop("Can only create models with dense matrix or data frame input") } checkna(X) n_vertices <- attr(X, "Size") tsmessage("Read ", n_vertices, " rows") Xnames <- 
labels(X) } else if (is_sparse_matrix(X) && sparse_X_is_distance_matrix) { if (ret_model) { stop("Can only create models with dense matrix or data frame input") } checkna(X) n_vertices <- nrow(X) if (ncol(X) != n_vertices) { stop("Sparse matrices are only supported as distance matrices") } tsmessage("Read ", n_vertices, " rows of sparse distance matrix") Xnames <- row.names(X) } else { cat_ids <- NULL norig_col <- ncol(X) if (methods::is(X, "data.frame") || methods::is(X, "matrix") || is_sparse_matrix(X)) { cat_res <- find_categoricals(metric) metric <- cat_res$metrics cat_ids <- cat_res$categoricals # Convert categorical columns to factors if they aren't already if (!is.null(cat_ids)) { X[, cat_ids] <- sapply(X[, cat_ids, drop = FALSE], factor, simplify = methods::is(X, "matrix") ) Xcat <- X[, cat_ids, drop = FALSE] } if (methods::is(X, "data.frame")) { indexes <- which(vapply(X, is.numeric, logical(1))) if (length(indexes) == 0) { stop("No numeric columns found") } tsmessage("Converting dataframe to numerical matrix") if (length(indexes) != ncol(X)) { X <- X[, indexes] } X <- as.matrix(X) } if (n_components > ncol(X)) { warning( "n_components ", "> number of columns in input data: ", n_components, " > ", ncol(X), ", this may give poor or unexpected results" ) } } else { stop("Unknown input data format") } checkna(X) n_vertices <- nrow(X) tsmessage( "Read ", n_vertices, " rows and found ", ncol(X), " numeric columns", appendLF = is.null(cat_ids) ) if (length(cat_ids) > 0) { tsmessage(" and ", pluralize("categorical column", length(cat_ids)), time_stamp = FALSE ) } Xnames <- row.names(X) X <- remove_scaling_attrs(X) X <- scale_input(X, scale_type = scale, ret_model = ret_model, verbose = verbose ) } # Store number of precomputed nn if X is non-NULL (NULL X case handled above) if (nn_is_precomputed(nn_method) && num_precomputed_nns == 0) { num_precomputed_nns <- check_graph_list(nn_method, n_vertices, bipartite = FALSE ) if (is.null(Xnames)) { Xnames <- nn_graph_row_names_list(nn_method) } } if (method == "largevis" && kernel == "knn") { n_neighbors <- perplexity } if (max(n_neighbors) > n_vertices) { # pre-calculated nearest neighbors ignores the user-supplied n_neighbors # which is handled later if (!is.list(nn_method)) { if (method == "largevis") { # for LargeVis, n_neighbors normally determined from perplexity not an # error to be too large tsmessage("Setting n_neighbors to ", n_vertices) n_neighbors <- n_vertices } else { stop("n_neighbors must be smaller than the dataset size") } } } if (!is.list(metric)) { metrics <- list(c()) names(metrics) <- metric } else { metrics <- metric } # For typical case of numeric matrix X and not using binary metric, save # PCA results here in case initialization uses PCA too pca_models <- NULL pca_shortcut <- FALSE if (!is.null(pca) && length(metric) == 1 && !is_binary_metric(metric) && is.matrix(X) && ncol(X) > pca) { tsmessage("Reducing X column dimension to ", pca, " via PCA") pca_res <- pca_init(X, ndim = pca, center = pca_center, pca_method = pca_method, ret_extra = ret_model, verbose = verbose ) if (ret_model) { X <- pca_res$scores pca_models[["1"]] <- pca_res[c("center", "rotation")] pca_res <- NULL } else { X <- pca_res } pca_shortcut <- TRUE } if (is_similarity_graph) { d2sr <- list( V = nn_method, nns = NULL, pca_models = NULL, sigma = NULL, rho = NULL ) need_sigma <- FALSE } else { need_sigma <- ret_sigma || ret_localr || !is.null(dens_scale) d2sr <- data2set( X, Xcat, n_neighbors, metrics, nn_method, n_trees, search_k, method, 
set_op_mix_ratio, local_connectivity, bandwidth, perplexity, kernel, need_sigma, n_threads, grain_size, ret_model, pca = pca, pca_center = pca_center, pca_method = pca_method, n_vertices = n_vertices, nn_args = nn_args, tmpdir = tmpdir, sparse_is_distance = sparse_X_is_distance_matrix, verbose = verbose ) } V <- d2sr$V nns <- d2sr$nns if (is.null(pca_models)) { pca_models <- d2sr$pca_models } # Calculate approximate local radii sigma <- NULL rho <- NULL localr <- NULL dint <- NULL if (need_sigma) { sigma <- d2sr$sigma rho <- d2sr$rho dint <- d2sr$dint } if (!is.null(dens_scale) || ret_localr) { localr <- sigma + rho } if (!is.null(y)) { tsmessage("Processing y data") if (!is.list(target_metric)) { target_metrics <- list(c()) names(target_metrics) <- target_metric } else { target_metrics <- target_metric } ycat <- NULL ycat_ids <- NULL if (methods::is(y, "data.frame")) { ycat_res <- find_categoricals(target_metric) target_metric <- ycat_res$metrics ycat_ids <- ycat_res$categoricals if (!is.null(ycat_ids)) { ycat <- y[, ycat_ids, drop = FALSE] } else { ycindexes <- which(vapply(y, is.factor, logical(1))) if (length(ycindexes) > 0) { ycat <- (y[, ycindexes, drop = FALSE]) } } yindexes <- which(vapply(y, is.numeric, logical(1))) if (length(yindexes) > 0) { y <- as.matrix(y[, yindexes]) } else { y <- NULL } } else if (is.list(y)) { nn_method <- y } else if (is.numeric(y)) { y <- as.matrix(y) } else if (is.factor(y)) { ycat <- data.frame(y) y <- NULL } if (!is.null(y)) { yd2sr <- data2set(y, ycat, target_n_neighbors, target_metrics, nn_method, n_trees, search_k, method, set_op_mix_ratio = 1.0, local_connectivity = 1.0, bandwidth = 1.0, perplexity = perplexity, kernel = kernel, ret_sigma = FALSE, n_threads = n_threads, grain_size = grain_size, ret_model = FALSE, pca = pca, pca_center = TRUE, pca_method = pca_method, n_vertices = n_vertices, tmpdir = tmpdir, verbose = verbose ) tsmessage( "Intersecting X and Y sets with target weight = ", formatC(target_weight) ) # behavior for supervised UMAP: do reset local connectivity # don't reset metric (same as Python UMAP as of 0.5.3) V <- set_intersect(V, yd2sr$V, target_weight, reset_connectivity = TRUE) yd2sr$V <- NULL yd2sr$nns <- NULL } else if (!is.null(ycat)) { V <- categorical_intersection_df(ycat, V, weight = target_weight, verbose = verbose ) } } if (!(ret_model || ret_nn)) { nns <- NULL gc() } if (methods::is(init, "matrix")) { if (nrow(init) != n_vertices || ncol(init) != n_components) { stop("init matrix does not match necessary configuration for X: ", "should have dimensions (", n_vertices, ", ", n_components, ")") } tsmessage("Initializing from user-supplied matrix") embedding <- scale_coords(init, init_sdev, verbose = verbose) } else if (!(methods::is(init, "character") && length(init) == 1)) { if (is.null(init) && !is.null(n_epochs) && n_epochs == 0) { embedding <- NULL if (!ret_extra) { warning( "Neither high-dimensional nor low-dimensional data will be ", "returned with this combination of settings" ) } if (ret_model) { warning( "Returning a model but it will not be valid for transforming ", "new data" ) } } else { stop( "init should be either a matrix or string describing the ", "initialization method" ) } } else { init <- match.arg(tolower(init), c( "spectral", "random", "lvrandom", "normlaplacian", "laplacian", "spca", "pca", "inormlaplacian", "ispectral", "agspectral", "irlba_spectral", "irlba_laplacian", "pacpca" )) if (init_is_spectral(init) && init != "agspectral") { connected <- connected_components(V) if (connected$n_components 
> 1) { tsmessage("Found ", connected$n_components, " connected components, ", appendLF = FALSE) if (is.null(X)) { tsmessage("falling back to random initialization", time_stamp = FALSE) init <- "random" } else { tsmessage("falling back to 'spca' initialization with init_sdev = 1", time_stamp = FALSE ) init <- "spca" init_sdev <- 1 } } } # Don't repeat PCA initialization if we've already done it once if (pca_shortcut && init %in% c("spca", "pca", "pacpca") && pca >= n_components) { embedding <- X[, 1:n_components] switch(init, spca = tsmessage("Initializing from scaled PCA"), pca = tsmessage("Initializing from PCA"), pacpca = tsmessage("Initializing from PaCMAP-style PCA"), stop("Unknown init method '", init, "'") ) } else { embedding <- switch(init, spectral = spectral_init(V, ndim = n_components, verbose = verbose), random = rand_init(n_vertices, n_components, verbose = verbose), lvrandom = rand_init_lv(n_vertices, n_components, verbose = verbose), normlaplacian = normalized_laplacian_init(V, ndim = n_components, verbose = verbose ), laplacian = laplacian_eigenmap(V, ndim = n_components, verbose = verbose), # we handle scaling pca below spca = pca_init(X, ndim = n_components, pca_method = pca_method, verbose = verbose ), pca = pca_init(X, ndim = n_components, pca_method = pca_method, verbose = verbose ), pacpca = pca_init(X, ndim = n_components, pca_method = pca_method, verbose = verbose ), ispectral = irlba_spectral_init(V, ndim = n_components, verbose = verbose), inormlaplacian = irlba_normalized_laplacian_init(V, ndim = n_components, verbose = verbose ), agspectral = agspectral_init(V, n_neg_nbrs = negative_sample_rate, ndim = n_components, verbose = verbose ), irlba_spectral = spectral_init(V, ndim = n_components, verbose = verbose, force_irlba = TRUE), irlba_laplacian = laplacian_eigenmap(V, ndim = n_components, verbose = verbose, force_irlba = TRUE), stop("Unknown initialization method: '", init, "'") ) } if (init == "pacpca") { embedding <- 0.01 * embedding } if (!is.null(init_sdev) || init == "spca") { if (is.null(init_sdev)) { init_sdev <- 1e-4 } embedding <- scale_coords(embedding, init_sdev, verbose = verbose) } } if (any(is.na(embedding))) { stop("Initial data contains NA values: is n_components too high?") } if (is.null(n_epochs) || n_epochs < 0) { if (method == "largevis") { n_epochs <- lvish_epochs(n_vertices, V) } else { if (n_vertices <= 10000) { n_epochs <- 500 } else { n_epochs <- 200 } } } full_opt_args <- get_opt_args(opt_args, alpha) if (binary_edge_weights) { V@x <- rep(1, length(V@x)) } if (n_epochs > 0) { if (any(apply(embedding, 2, stats::sd) > 10.0)) { warning( "Initial embedding standard deviation > 10.0, this can lead to ", "poor optimization" ) } # remove edges which can't be sampled due to n_epochs V@x[V@x < max(V@x) / n_epochs] <- 0 V <- Matrix::drop0(V) # Create the (0-indexed) indices of the head and tail of each directed edge # in V. 
Graph is symmetric, so both (i->j) and (j->i) are present if (batch) { V <- Matrix::t(V) # head is ordered in non-decreasing order of node index positive_head <- Matrix::which(V != 0, arr.ind = TRUE)[, 2] - 1 # tail is unordered positive_tail <- V@i } else { # Use the Python UMAP ordering # head is unordered positive_head <- V@i # tail is ordered in non-decreasing order of node index positive_tail <- Matrix::which(V != 0, arr.ind = TRUE)[, 2] - 1 } # start/end pointers into the ordered vector positive_ptr <- V@p epochs_per_sample <- make_epochs_per_sample(V@x, n_epochs) tsmessage( "Commencing optimization for ", n_epochs, " epochs, with ", length(positive_head), " positive edges", pluralize("thread", n_sgd_threads, " using") ) ai <- NULL if (!is.null(dens_scale)) { ai <- scale_radii(localr, dens_scale, a) method <- "leopold" if (ret_model) { # store the linear transform from localr to ai for transforming new data lai2 <- 2 * log(range(ai)) llr <- -log(rev(range(localr))) rad_coeff <- stats::lm(lai2 ~ llr)$coefficients } } method <- tolower(method) method_args <- switch(method, umap = list(a = a, b = b, gamma = gamma, approx_pow = approx_pow), tumap = list(gamma = gamma), # a = 1 b = 10 for final phase of PaCMAP optimization pacmap = list(a = a, b = b), largevis = list(gamma = gamma), leopold = list(ai = ai, b = b, ndim = n_components), stop("Unknown dimensionality reduction method '", method, "'") ) tsmessage("Using rng type: ", rng_type) embedding <- t(embedding) embedding <- optimize_layout_r( head_embedding = embedding, tail_embedding = NULL, positive_head = positive_head, positive_tail = positive_tail, positive_ptr = positive_ptr, n_epochs = n_epochs, n_head_vertices = n_vertices, n_tail_vertices = n_vertices, epochs_per_sample = epochs_per_sample, method = method, method_args = method_args, initial_alpha = alpha, opt_args = full_opt_args, negative_sample_rate = negative_sample_rate, rng_type = rng_type, batch = batch, n_threads = n_sgd_threads, grain_size = grain_size, move_other = TRUE, epoch_callback = epoch_callback, verbose = verbose ) embedding <- t(embedding) gc() # Center the points before returning embedding <- scale(embedding, center = TRUE, scale = FALSE) if (is.null(row.names(embedding)) && !is.null(Xnames) && length(Xnames) == nrow(embedding)) { row.names(embedding) <- Xnames } tsmessage("Optimization finished") } if (ret_extra) { nblocks <- length(nns) res <- list(embedding = embedding) if (ret_model) { res <- append(res, list( scale_info = if (!is.null(X)) { attr_to_scale_info(X) } else { NULL }, search_k = search_k, local_connectivity = local_connectivity, n_epochs = n_epochs, alpha = alpha, negative_sample_rate = negative_sample_rate, method = method, a = a, b = b, gamma = gamma, approx_pow = approx_pow, metric = metrics, norig_col = norig_col, pcg_rand = pcg_rand, batch = batch, opt_args = full_opt_args, num_precomputed_nns = num_precomputed_nns, # #95: min_dist and spread are exported for documentation purposes only min_dist = min_dist, spread = spread, binary_edge_weights = binary_edge_weights, seed = seed, nn_method = nn_method, nn_args = nn_args )) if (nn_is_precomputed(nn_method)) { res$n_neighbors <- nn_graph_nbrs_list(nn_method) } else { res$n_neighbors <- n_neighbors } if (method == "leopold") { res$dens_scale <- dens_scale res$ai <- ai res$rad_coeff <- rad_coeff } if (nblocks > 1) { if (!nn_is_precomputed(nn_method)) { res$nn_index <- list() for (i in 1:nblocks) { res$nn_index[[i]] <- nns[[i]]$index } } } else { if (!is.null(nns[[1]]$index)) { res$nn_index <- 
nns[[1]]$index if (is.null(res$metric[[1]])) { # 31: Metric usually lists column indices or names, NULL means use all # of them, but for loading the NN index we need the number of # columns explicitly (we don't have access to the column dimension of # the input data at load time) if (res$nn_index$type %in% c("annoyv2", "hnswv1", "nndescentv1")) { res$metric[[1]] <- list(ndim = res$nn_index$ndim) } else { # To be sure of the dimensionality, fetch the first item from the # index and see how many elements are in the returned vector. if (!is.null(X)) { rcppannoy <- get_rcppannoy(res$nn_index) res$metric[[1]] <- list(ndim = length(rcppannoy$getItemsVector(0))) } else { res$metric[[1]] <- list() } } } } else { if (nn_is_precomputed(nn_method)) { tsmessage( "Note: model requested with precomputed neighbors. ", "For transforming new data, distance data must be ", "provided separately" ) } } } if (!is.null(pca_models)) { res$pca_models <- pca_models } } if (ret_nn) { res$nn <- list() for (i in 1:nblocks) { if (is.list(nns[[i]])) { res$nn[[i]] <- list(idx = nns[[i]]$idx, dist = nns[[i]]$dist) if (!is.null(Xnames) && nrow(res$nn[[i]]$idx) == length(Xnames)) { row.names(res$nn[[i]]$idx) <- Xnames row.names(res$nn[[i]]$dist) <- Xnames } } else if (is_sparse_matrix(nns[[i]])) { res$nn[[i]] <- nns[[i]] if (!is.null(Xnames) && nrow(res$nn[[i]]) == length(Xnames)) { row.names(res$nn[[i]]) <- Xnames colnames(res$nn[[i]]) <- Xnames } } } names(res$nn) <- names(nns) } if (ret_fgraph) { if (method == "largevis") { res$P <- V } else { res$fgraph <- V } } if (ret_sigma) { res$sigma <- sigma res$rho <- rho res$dint <- dint } if (ret_localr && !is.null(localr)) { res$localr <- localr } } else { res <- embedding } res } #' Save or Load a Model #' #' Functions to write a UMAP model to a file, and to restore it. #' #' @param model a UMAP model created by \code{\link{umap}}. #' @param file name of the file where the model is to be saved or read from. #' @param unload if \code{TRUE}, unload all nearest neighbor indexes for the #' model. The \code{model} will no longer be valid for use in #' \code{\link{umap_transform}} and the temporary working directory used #' during model saving will be deleted. You will need to reload the model with #' \code{load_uwot} to use it again. If \code{FALSE}, then the model can be #' re-used without reloading, but you must manually unload the NN index when #' you are finished using it if you want to delete the temporary working #' directory. To unload manually, use \code{\link{unload_uwot}}. The absolute #' path of the working directory is found in the \code{mod_dir} item of the #' return value. #' @param verbose if \code{TRUE}, log information to the console. #' @return \code{model} with one extra item: \code{mod_dir}, which contains the #' path to the working directory. If \code{unload = FALSE} then this directory #' still exists after this function returns, and can be cleaned up with #' \code{\link{unload_uwot}}. If you don't care about cleaning up this #' directory, or \code{unload = TRUE}, then you can ignore the return value. 
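#' As a short sketch of the implied cleanup flow (hedged: \code{model} is a
#' model created as in the examples below):
#' \preformatted{
#' model <- save_uwot(model, file = tempfile("iris_umap"))
#' model$mod_dir      # absolute path of the temporary working directory
#' unload_uwot(model) # invalidates the model and deletes mod_dir
#' }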
#' @examples #' iris_train <- iris[c(1:10, 51:60), ] #' iris_test <- iris[100:110, ] #' #' # create model #' model <- umap(iris_train, ret_model = TRUE, n_epochs = 20) #' #' # save without unloading: this leaves behind a temporary working directory #' model_file <- tempfile("iris_umap") #' model <- save_uwot(model, file = model_file) #' #' # The model can continue to be used #' test_embedding <- umap_transform(iris_test, model) #' #' # To manually unload the model from memory when finished and to clean up #' # the working directory (this doesn't touch your model file) #' unload_uwot(model) #' #' # At this point, model cannot be used with umap_transform, this would fail: #' # test_embedding2 <- umap_transform(iris_test, model) #' #' # restore the model: this also creates a temporary working directory #' model2 <- load_uwot(file = model_file) #' test_embedding2 <- umap_transform(iris_test, model2) #' #' # Unload and clean up the loaded model temp directory #' unload_uwot(model2) #' #' # clean up the model file #' unlink(model_file) #' #' # save with unloading: this deletes the temporary working directory but #' # doesn't allow the model to be re-used #' model3 <- umap(iris_train, ret_model = TRUE, n_epochs = 20) #' model_file3 <- tempfile("iris_umap") #' model3 <- save_uwot(model3, file = model_file3, unload = TRUE) #' #' @seealso \code{\link{load_uwot}}, \code{\link{unload_uwot}} #' @export save_uwot <- function(model, file, unload = FALSE, verbose = FALSE) { if (!all_nn_indices_are_loaded(model)) { stop("cannot save: NN index is unloaded") } wd <- getwd() model_file <- abspath(file) if (file.exists(model_file)) { stop("model file ", model_file, " already exists") } tmp_model_file <- NULL tryCatch( { # create directory to store files in mod_dir <- tempfile(pattern = "dir") tsmessage("Creating temp model dir ", mod_dir) dir.create(mod_dir) # create the tempdir/uwot subdirectory uwot_dir <- file.path(mod_dir, "uwot") tsmessage("Creating dir ", uwot_dir) dir.create(uwot_dir) # save model file to tempdir/uwot/model model_tmpfname <- file.path(uwot_dir, "model") saveRDS(model, file = model_tmpfname) # save each nn index inside tempdir/uwot/ metrics <- names(model$metric) n_metrics <- length(metrics) for (i in 1:n_metrics) { if (n_metrics == 1) { nn_index <- model$nn_index } else { nn_index <- model$nn_index[[i]] } if (startsWith(nn_index$type, "annoy") || startsWith(nn_index$type, "hnsw")) { nn_tmpfname <- file.path(uwot_dir, paste0("nn", i)) nn_meta_tmpfname <- file.path(uwot_dir, paste0("nn-meta", i)) nn_index$ann$save(nn_tmpfname) # save metadata wrapper around the index separately meta_data <- nn_index meta_data$ann <- NULL saveRDS(meta_data, file = nn_meta_tmpfname) } else if (startsWith(nn_index$type, "nndescent")) { nn_tmpfname <- file.path(uwot_dir, paste0("nn", i)) saveRDS(nn_index, file = nn_tmpfname) } else { stop("unsupported nn index type: ", nn_index$type) } } # archive the files under the temp dir into the single target file # change directory so the archive only contains one directory tmp_model_file <- abspath(file) tsmessage("Changing to ", mod_dir) setwd(mod_dir) tsmessage("Creating ", tmp_model_file) # #109: Windows 7 tar needs "--force-local" to avoid interpreting colon # as indicating a remote machine extra_flags <- "" if (is_win7()) { extra_flags <- "--force-local" } utils::tar( tarfile = tmp_model_file, extra_flags = extra_flags, files = "uwot/" ) }, finally = { setwd(wd) if (!is.null(tmp_model_file) && model_file != tmp_model_file) { tsmessage("Copying ", 
tmp_model_file, " to ", model_file) file.copy(from = tmp_model_file, to = model_file) } model$mod_dir <- mod_dir if (unload) { unload_uwot(model, cleanup = TRUE, verbose = verbose) } } ) model } #' Save or Load a Model #' #' Functions to write a UMAP model to a file, and to restore it. #' #' @param file name of the file where the model is to be saved or read from. #' @param verbose if \code{TRUE}, log information to the console. #' @return The model saved at \code{file}, for use with #' \code{\link{umap_transform}}. Additionally, it contains an extra item: #' \code{mod_dir}, which contains the path to the temporary working directory #' used during loading of the model. This directory cannot be removed until #' this model has been unloaded by using \code{\link{unload_uwot}}. #' @examples #' library(RSpectra) #' #' iris_train <- iris[c(1:10, 51:60), ] #' iris_test <- iris[100:110, ] #' #' # create model #' model <- umap(iris_train, ret_model = TRUE, n_epochs = 20) #' #' # save without unloading: this leaves behind a temporary working directory #' model_file <- tempfile("iris_umap") #' model <- save_uwot(model, file = model_file) #' #' # The model can continue to be used #' test_embedding <- umap_transform(iris_test, model) #' #' # To manually unload the model from memory when finished and to clean up #' # the working directory (this doesn't touch your model file) #' unload_uwot(model) #' #' # At this point, model cannot be used with umap_transform, this would fail: #' # test_embedding2 <- umap_transform(iris_test, model) #' #' # restore the model: this also creates a temporary working directory #' model2 <- load_uwot(file = model_file) #' test_embedding2 <- umap_transform(iris_test, model2) #' #' # Unload and clean up the loaded model temp directory #' unload_uwot(model2) #' #' # clean up the model file #' unlink(model_file) #' #' # save with unloading: this deletes the temporary working directory but #' # doesn't allow the model to be re-used #' model3 <- umap(iris_train, ret_model = TRUE, n_epochs = 20) #' model_file3 <- tempfile("iris_umap") #' model3 <- save_uwot(model3, file = model_file3, unload = TRUE) #' #' @seealso \code{\link{save_uwot}}, \code{\link{unload_uwot}} #' @export load_uwot <- function(file, verbose = FALSE) { # create directory to store files in mod_dir <- tempfile(pattern = "dir") tsmessage("Creating temp directory ", mod_dir) dir.create(mod_dir) # #109: Windows 7 tar needs "--force-local" to avoid interpreting colon # as indicating a remote machine extras <- NULL if (is_win7()) { extras <- "--force-local" } utils::untar(abspath(file), exdir = mod_dir, extras = extras, verbose = verbose ) model_fname <- file.path(mod_dir, "uwot/model") if (!file.exists(model_fname)) { stop("Can't find model in ", file) } model <- readRDS(file = model_fname) metrics <- names(model$metric) n_metrics <- length(metrics) nn_method <- model$nn_method if (is.null(nn_method)) { nn_method <- "annoy" } for (i in 1:n_metrics) { nn_fname <- file.path(mod_dir, paste0("uwot/nn", i)) if (!file.exists(nn_fname)) { stop("Can't find nearest neighbor index ", nn_fname, " in ", file) } metric <- metrics[[i]] # 31: need to specify the index dimensionality when creating the index if (is.list(model$metric[[i]])) { # in the case where there is only one metric, the value is a one-item list # named 'ndim' giving the number of dimensions directly: all columns # are used in this metric ndim <- model$metric[[i]]$ndim } else { # otherwise, metric specifies the name or index used for each metric, # so the dimension is the 
number of them ndim <- length(model$metric[[i]]) } if (nn_method == "annoy") { annoy_metric <- metric ann <- create_ann(annoy_metric, ndim = ndim) ann$load(nn_fname) idx <- list( ann = ann, type = "annoyv1", metric = annoy_metric, ndim = ndim ) if (n_metrics == 1) { model$nn_index <- idx } else { model$nn_index[[i]] <- idx } } else if (nn_method == "hnsw") { ann <- hnsw_load(metric, ndim = ndim, filename = nn_fname) nn_meta_tmpfname <- file.path(mod_dir, paste0("uwot/nn-meta", i)) idx <- readRDS(nn_meta_tmpfname) idx$ann <- ann if (n_metrics == 1) { model$nn_index <- idx } else { model$nn_index[[i]] <- idx } } else if (nn_method == "nndescent") { idx <- readRDS(nn_fname) if (n_metrics == 1) { model$nn_index <- idx } else { model$nn_index[[i]] <- idx } } else { stop("Unknown nearest neighbor method ", nn_method) } } model$mod_dir <- mod_dir model } #' Unload a Model #' #' Unloads the UMAP model. This prevents the model from being used with #' \code{\link{umap_transform}}, but allows the temporary working directory #' associated with saving or loading the model to be removed. #' #' @param model a UMAP model created by \code{\link{umap}}. #' @param cleanup if \code{TRUE}, attempt to delete the temporary working #' directory that was used in either the save or load of the model. #' @param verbose if \code{TRUE}, log information to the console. #' #' @examples #' iris_train <- iris[c(1:10, 51:60), ] #' iris_test <- iris[100:110, ] #' #' # create model #' model <- umap(iris_train, ret_model = TRUE, n_epochs = 20) #' #' # save without unloading: this leaves behind a temporary working directory #' model_file <- tempfile("iris_umap") #' model <- save_uwot(model, file = model_file) #' #' # The model can continue to be used #' test_embedding <- umap_transform(iris_test, model) #' #' # To manually unload the model from memory when finished and to clean up #' # the working directory (this doesn't touch your model file) #' unload_uwot(model) #' #' # At this point, model cannot be used with umap_transform, this would fail: #' # test_embedding2 <- umap_transform(iris_test, model) #' #' # restore the model: this also creates a temporary working directory #' model2 <- load_uwot(file = model_file) #' test_embedding2 <- umap_transform(iris_test, model2) #' #' # Unload and clean up the loaded model temp directory #' unload_uwot(model2) #' #' # clean up the model file #' unlink(model_file) #' #' # save with unloading: this deletes the temporary working directory but #' # doesn't allow the model to be re-used #' model3 <- umap(iris_train, ret_model = TRUE, n_epochs = 20) #' model_file3 <- tempfile("iris_umap") #' model3 <- save_uwot(model3, file = model_file3, unload = TRUE) #' #' @seealso \code{\link{save_uwot}}, \code{\link{load_uwot}} #' @export unload_uwot <- function(model, cleanup = TRUE, verbose = FALSE) { if (is.null(model$nn_method) || model$nn_method == "annoy") { tsmessage("Unloading NN index: model will be invalid") metrics <- names(model$metric) n_metrics <- length(metrics) for (i in 1:n_metrics) { if (n_metrics == 1) { rcppannoy <- get_rcppannoy(model$nn_index) rcppannoy$unload() } else { rcppannoy <- get_rcppannoy(model$nn_index[[i]]) rcppannoy$unload() } } } if (cleanup) { if (is.null(model$mod_dir)) { tsmessage("Model is missing temp dir location, can't clean up") return() } else { mod_dir <- model$mod_dir if (!file.exists(mod_dir)) { tsmessage("model temp dir location '", mod_dir, "' no longer exists") return() } tsmessage("Deleting temp model dir ", mod_dir) res <- unlink(mod_dir, recursive = TRUE) 
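# unlink() returns 0 on success and a non-zero value if removal failed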
if (res != 0) { tsmessage("Unable to delete tempdir ", mod_dir) } } } } all_nn_indices_are_loaded <- function(model) { if (is.null(model$nn_index)) { stop("Invalid model: has no 'nn_index'") } if (is.list(model$nn_index)) { if (is.null(model$nn_index$type)) { for (i in 1:length(model$nn_index)) { rcppannoy <- get_rcppannoy(model$nn_index[[i]]) if (rcppannoy$getNTrees() == 0) { return(FALSE) } } } else if (model$nn_index$type == "annoyv1") { rcppannoy <- get_rcppannoy(model$nn_index) if (rcppannoy$getNTrees() == 0) { return(FALSE) } } else if (model$nn_index$type == "hnswv1") { return(TRUE) } else if (model$nn_index$type == "nndescentv1") { return(TRUE) } else { stop("Invalid model: has unknown 'nn_index' type ", model$nn_index$type) } } else { rcppannoy <- get_rcppannoy(model$nn_index) if (rcppannoy$getNTrees() == 0) { return(FALSE) } } TRUE } abspath <- function(filename) { file.path(normalizePath(dirname(filename)), basename(filename)) } # Half of whatever the C++ implementation thinks is the number of concurrent # threads supported, but at least 1 default_num_threads <- function() { max(1, hardware_concurrency() / 2) } # Get the number of vertices in X x2nv <- function(X) { if (is.list(X)) { if (!is.null(X$idx)) { n_vertices <- x2nv(X$idx) } else { if (length(X) > 0) { n_vertices <- x2nv(X[[1]]) } else { stop("Can't find n_vertices for list X") } } } else if (inherits(X, "dist")) { n_vertices <- attr(X, "Size") } else if (is_sparse_matrix(X)) { # older code path where distance matrix was part of X rather than nn_method # used nrow, but transform was not supported so nrow == ncol n_vertices <- ncol(X) } else if (methods::is(X, "data.frame") || methods::is(X, "matrix")) { n_vertices <- nrow(X) } else if (is.numeric(X)) { n_vertices <- length(X) } else { stop("Can't find number of vertices for X of type '", class(X)[1], "'") } n_vertices } data2set <- function(X, Xcat, n_neighbors, metrics, nn_method, n_trees, search_k, method, set_op_mix_ratio, local_connectivity, bandwidth, perplexity, kernel, ret_sigma, n_threads, grain_size, ret_model, n_vertices = x2nv(X), tmpdir = tempdir(), pca = NULL, pca_center = TRUE, pca_method = "irlba", nn_args = list(), sparse_is_distance = TRUE, verbose = FALSE) { V <- NULL nns <- list() nblocks <- length(metrics) sigma <- NULL # Check for precalculated NN data in nn_method if (is.list(nn_method)) { if (is.null(nn_method$idx)) { nblocks <- length(nn_method) if (nblocks == 0) { stop("Incorrect format for precalculated neighbor data") } } else { nblocks <- 1 # wrap nn data in a list so data is always a list of lists nn_method <- list(nn_method) } metrics <- replicate(nblocks, NULL, simplify = FALSE) names(metrics) <- rep("precomputed", nblocks) } if (nblocks > 1) { tsmessage("Found ", nblocks, " blocks of data") } mnames <- tolower(names(metrics)) if (is.null(nn_method)) { if (methods::is(X, "matrix")) { if (n_vertices < 4096 && !ret_model && all(mnames == "euclidean")) { tsmessage("Using FNN for neighbor search, n_neighbors = ", n_neighbors) nn_method <- "fnn" } else { tsmessage("Using Annoy for neighbor search, n_neighbors = ", n_neighbors) nn_method <- "annoy" } } else { # It's a dist, or an actual distance matrix (sparse or triangular) nn_method <- "matrix" } } pca_models <- list() for (i in 1:nblocks) { metric <- mnames[[i]] if (is.character(nn_method) && nn_method == "annoy") { metric <- match.arg(metric, c( "euclidean", "cosine", "manhattan", "hamming", "correlation", "precomputed" )) } # Defaults for this block which can be overridden pca_i <- pca
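# A metric entry may also be a list holding the column subset as its single
# unnamed item plus per-block overrides, e.g. (illustrative):
#   metric = list(euclidean = 1:4, euclidean = list(5:10, pca_center = FALSE))
# in which case pca_i / pca_center_i are replaced for this block only below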
pca_center_i <- pca_center subset <- metrics[[i]] if (is.null(subset)) { Xsub <- X } else if (is.list(subset)) { # e.g. "euclidean" = list(1:10, pca_center = FALSE), lsres <- lsplit_unnamed(subset) if (is.null(lsres$unnamed)) { stop("Error: no subset provided for block ", i) } if (length(lsres$unnamed) != 1) { stop("Error: only one unnamed item should be provided for block ", i) } subset <- lsres$unnamed[[1]] # possible overrides if (!is.null(lsres$named)) { lsnamed <- lsres$named lsnames <- names(lsnamed) if (!is.null(lsnamed$pca_center)) { pca_center_i <- lsnamed$pca_center } # PCA argument can be NULL, so need to check if it was explicitly provided if ("pca" %in% lsnames) { pca_i <- lsnamed$pca } } Xsub <- X[, subset, drop = FALSE] } else { Xsub <- X[, subset, drop = FALSE] } if (!is.null(X) && is.matrix(X)) { block_size <- ncol(Xsub) if (block_size == 0) { stop("Block ", i, " has zero size") } if (nblocks > 1) { tsmessage( "Processing block ", i, " of ", nblocks, " with size ", block_size, " using metric '", metric, "'" ) } } else { # X is NULL or dist or something like that if (nblocks > 1) { tsmessage( "Processing block ", i, " of ", nblocks, " using metric '", metric, "'" ) } } if (!is.null(pca_i) && is.matrix(X) && metric != "hamming" && ncol(X) > pca_i && nrow(X) > pca_i) { tsmessage("Reducing column dimension to ", pca_i, " via PCA") pca_res <- pca_init(Xsub, pca_i, ret_extra = ret_model, center = pca_center_i, pca_method = pca_method, verbose = verbose ) if (ret_model) { Xsub <- pca_res$scores pca_models[[as.character(i)]] <- pca_res[c("center", "rotation")] pca_res <- NULL } else { Xsub <- pca_res } } nn_sub <- nn_method # Extract this block of nn data from list of lists if (metric == "precomputed") { nn_sub <- nn_method[[i]] n_neighbors <- NULL } x2set_res <- x2set( Xsub, n_neighbors, metric, nn_method = nn_sub, n_trees, search_k, method, set_op_mix_ratio, local_connectivity, bandwidth, perplexity, kernel, ret_sigma, n_threads, grain_size, ret_model, n_vertices = n_vertices, nn_args = nn_args, tmpdir = tmpdir, sparse_is_distance = sparse_is_distance, verbose = verbose ) Vblock <- x2set_res$V nn <- x2set_res$nn nns[[i]] <- nn names(nns)[[i]] <- metric if (is.null(V)) { V <- Vblock } else { # TODO: should at least offer the option to reset the local metric here # TODO: make reset_local_metric = TRUE the default (breaking change) V <- set_intersect(V, Vblock, weight = 0.5, reset_connectivity = TRUE) } if (ret_sigma && is.null(sigma)) { # No idea how to combine different neighborhood sizes so just return the # first set sigma <- x2set_res$sigma rho <- x2set_res$rho dint <- x2set_res$dint } } if (!is.null(Xcat)) { V <- categorical_intersection_df(Xcat, V, weight = 0.5, verbose = verbose) } res <- list(V = V, nns = nns, pca_models = pca_models) if (!is.null(sigma)) { res$sigma <- sigma res$rho <- rho res$dint <- dint } res } x2nn <- function(X, n_neighbors, metric, nn_method, n_trees, search_k, tmpdir = tempdir(), n_threads, grain_size, ret_model, n_vertices = x2nv(X), nn_args = list(), sparse_is_distance = TRUE, verbose = FALSE) { if (is.list(nn_method)) { validate_nn(nn_method, n_vertices) nn <- nn_method } else { nn_method <- match.arg(tolower(nn_method), c("annoy", "fnn", "matrix", "hnsw", "nndescent")) if (nn_method == "fnn" && metric != "euclidean") { stop( "nn_method = 'FNN' is only compatible with distance metric ", "'euclidean'" ) } if (nn_method == "fnn" && ret_model) { stop("nn_method = 'FNN' is incompatible with ret_model = TRUE") } nn <- find_nn( X, n_neighbors,
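# Illustrative sketch (not run) of the pre-computed neighbor format that the
# validate_nn() helper accepts: a list of integer indices and matching
# distances, one row per observation, self-neighbor in column 1. Here built
# with FNN (which excludes the self-neighbor, so we prepend it):
#   nbrs <- FNN::get.knn(as.matrix(iris[1:10, -5]), k = 3)
#   nn <- list(idx = cbind(1:10, nbrs$nn.index),
#              dist = cbind(rep(0, 10), nbrs$nn.dist))
#   umap(iris[1:10, -5], nn_method = nn)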
method = nn_method, metric = metric, n_trees = n_trees, search_k = search_k, nn_args = nn_args, tmpdir = tmpdir, n_threads = n_threads, grain_size = grain_size, ret_index = ret_model, sparse_is_distance = sparse_is_distance, verbose = verbose ) } nn } validate_nn <- function(nn_method, n_vertices) { if (!is.matrix(nn_method$idx)) { stop("Couldn't find precalculated 'idx' matrix") } if (nrow(nn_method$idx) != n_vertices) { stop( "Precalculated 'idx' matrix must have ", n_vertices, " rows, but found ", nrow(nn_method$idx) ) } if (!is.matrix(nn_method$dist)) { stop("Couldn't find precalculated 'dist' matrix") } if (nrow(nn_method$dist) != n_vertices) { stop("Precalculated 'dist' matrix must have ", n_vertices, " rows, but found ", nrow(nn_method$dist)) } if (ncol(nn_method$dist) != ncol(nn_method$idx)) { stop("Precalculated 'dist' matrix must have ", ncol(nn_method$idx), " cols, but found ", ncol(nn_method$dist)) } } nn2set <- function(method, nn, set_op_mix_ratio, local_connectivity, bandwidth, perplexity, kernel, ret_sigma, n_threads, grain_size, verbose = FALSE) { sigma <- NULL res <- list() if (method == "largevis") { n_vertices <- nrow(nn$dist) if (perplexity >= n_vertices) { stop("perplexity can be no larger than ", n_vertices - 1) } Vres <- perplexity_similarities( nn = nn, perplexity = perplexity, ret_sigma = ret_sigma, n_threads = n_threads, grain_size = grain_size, kernel = kernel, verbose = verbose ) res$V <- Vres$matrix if (ret_sigma && !is.null(Vres$sigma)) { res$sigma <- Vres$sigma res$dint <- Vres$dint } } else { Vres <- fuzzy_simplicial_set( nn = nn, set_op_mix_ratio = set_op_mix_ratio, local_connectivity = local_connectivity, bandwidth = bandwidth, ret_sigma = ret_sigma, n_threads = n_threads, grain_size = grain_size, verbose = verbose ) if (ret_sigma) { res$V <- Vres$matrix res$sigma <- Vres$sigma res$rho <- Vres$rho } else { res$V <- Vres } } res } x2set <- function(X, n_neighbors, metric, nn_method, n_trees, search_k, method, set_op_mix_ratio, local_connectivity, bandwidth, perplexity, kernel, ret_sigma, n_threads, grain_size, ret_model, n_vertices = x2nv(X), tmpdir = tempdir(), nn_args = list(), sparse_is_distance = TRUE, verbose = FALSE) { if (is_sparse_matrix(nn_method)) { nn <- nn_method if (nrow(nn) != ncol(nn)) { stop("Sparse distance matrix must have same number of rows and cols") } if (nrow(nn) != n_vertices) { stop("Sparse distance matrix must have same dimensions as input data") } } else { nn <- x2nn( X, n_neighbors = n_neighbors, metric = metric, nn_method = nn_method, n_trees = n_trees, search_k = search_k, tmpdir = tmpdir, n_threads = n_threads, grain_size = grain_size, ret_model = ret_model, nn_args = nn_args, n_vertices = n_vertices, sparse_is_distance = sparse_is_distance, verbose = verbose ) if (any(is.infinite(nn$dist))) { stop("Infinite distances found in nearest neighbors") } } gc() nn2set_res <- nn2set( method, nn, set_op_mix_ratio, local_connectivity, bandwidth, perplexity, kernel, ret_sigma, n_threads, grain_size, verbose = verbose ) V <- nn2set_res$V if (any(is.na(V))) { stop("Non-finite entries in the input matrix") } gc() res <- list( nn = nn, V = V ) if (ret_sigma && !is.null(nn2set_res$sigma)) { res$sigma <- nn2set_res$sigma res$rho <- nn2set_res$rho res$dint <- nn2set_res$dint } res } set_intersect <- function(A, B, weight = 0.5, reset_connectivity = TRUE, reset_local_metric = FALSE, n_threads = NULL, verbose = FALSE) { A <- general_simplicial_set_intersection( A, B, weight ) A <- Matrix::drop0(A) #
https://github.com/lmcinnes/umap/issues/58#issuecomment-437633658 # For now always reset if (reset_connectivity) { A <- reset_local_connectivity(A, reset_local_metric = reset_local_metric, n_threads = n_threads, verbose = verbose ) } A } categorical_intersection_df <- function(X, V, weight = 0.5, verbose = FALSE) { tsmessage( "Carrying out categorical intersection for ", pluralize("column", ncol(X)) ) for (i in 1:ncol(X)) { V <- categorical_intersection(X[, i], V, weight = weight, verbose = (verbose && i == 1) ) } V } categorical_intersection <- function(x, V, weight, verbose = FALSE) { if (is.null(V)) { stop("V cannot be null for categorical intersection") } if (weight < 1.0) { far_dist <- 2.5 * (1.0 / (1.0 - weight)) } else { far_dist <- 1.0e12 } tsmessage( "Applying categorical set intersection, weight = ", formatC(weight), " far distance = ", formatC(far_dist) ) V <- categorical_simplicial_set_intersection(V, x, far_dist = far_dist, verbose = verbose ) V } # Creates the number of epochs per sample for each weight # weights are the non-zero input affinities (1-simplex) # n_epochs the total number of epochs # There is an inverse relationship between the weights and the return vector. make_epochs_per_sample <- function(weights, n_epochs) { result <- rep(-1, length(weights)) n_samples <- n_epochs * (weights / max(weights)) result[n_samples > 0] <- n_epochs / n_samples[n_samples > 0] result } # Create the a/b parameters from spread and min_dist find_ab_params <- function(spread = 1, min_dist = 0.001) { xv <- seq(from = 0, to = spread * 3, length.out = 300) yv <- rep(0, length(xv)) yv[xv < min_dist] <- 1 yv[xv >= min_dist] <- exp(-(xv[xv >= min_dist] - min_dist) / spread) result <- try( { stats::nls(yv ~ 1 / (1 + a * xv^(2 * b)), start = list(a = 1, b = 1) )$m$getPars() }, silent = TRUE ) if (inherits(result, "try-error")) { stop( "Can't find a, b for provided spread = ", spread, " min_dist = ", min_dist ) } result } # The default number of edge samples used by LargeVis lvish_samples <- function(n_vertices) { n_samples <- 0 if (n_vertices < 10000) { n_samples <- 1000 } else if (n_vertices < 1000000) { n_samples <- (n_vertices - 10000) * 9000 / (1000000 - 10000) + 1000 } else { n_samples <- n_vertices / 100 } round(n_samples * 1000000) } # Returns the number of epochs required to generate the default number of edge samples # used in LargeVis lvish_epochs <- function(n_vertices, V) { n_samples <- lvish_samples(n_vertices) round(n_samples * max(V) / sum(V)) } # Scale X according to various strategies scale_input <- function(X, scale_type, ret_model = FALSE, verbose = FALSE) { if (is.null(scale_type)) { scale_type <- "none" } else if (is.logical(scale_type)) { scale_type <- ifelse(scale_type, "scale", "none") } else if (tolower(scale_type) == "z") { scale_type <- "scale" } scale_type <- match.arg( tolower(scale_type), c("none", "scale", "range", "colrange", "maxabs") ) switch(scale_type, range = { tsmessage("Range scaling X") min_X <- min(X) X <- X - min_X max_X <- max(X) X <- X / max_X if (ret_model) { attr(X, "scaled:range:min") <- min_X attr(X, "scaled:range:max") <- max_X } }, colrange = { tsmessage("Column range scaling X") min_X <- apply(X, 2, min) X <- sweep(X, 2, min_X) max_X <- apply(X, 2, max) X <- sweep(X, 2, max_X, `/`) if (ret_model) { attr(X, "scaled:colrange:min") <- min_X attr(X, "scaled:colrange:max") <- max_X } }, maxabs = { tsmessage("Normalizing by max-abs") X <- base::scale(X, scale = FALSE) max_abs <- max(abs(X)) X <- X / max_abs if (ret_model) { attr(X, "scaled:maxabs") <-
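# Worked example (illustrative, not run) for find_ab_params() and
# make_epochs_per_sample() defined earlier:
#   find_ab_params(spread = 1, min_dist = 0.1)
#   # -> a ~ 1.577, b ~ 0.895 (the well-known UMAP defaults)
#   make_epochs_per_sample(c(1, 0.5, 0.1), n_epochs = 200)
#   # -> c(1, 2, 10): the strongest edge is sampled every epoch, the
#   #    weakest only every 10 epochs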
max_abs } }, scale = { tsmessage("Scaling to zero mean and unit variance") varf <- function(x) { sum((x - sum(x) / length(x))^2) } non_zero_var_cols <- apply(X, 2, varf) >= .Machine$double.xmin if (sum(non_zero_var_cols) == 0) { stop("Matrix has zero variance") } X <- X[, non_zero_var_cols] tsmessage("Kept ", ncol(X), " non-zero-variance columns") X <- base::scale(X, scale = TRUE) if (ret_model) { attr(X, "scaled:nzvcols") <- which(non_zero_var_cols) } } ) X } attr_to_scale_info <- function(X) { Xattr <- attributes(X) Xattr <- Xattr[startsWith(names(Xattr), "scaled:")] if (length(Xattr) == 0) { Xattr <- NULL } Xattr } get_opt_args <- function(opt_args, alpha) { default_batch_opt <- "adam" default_opt_args <- list( sgd = list(alpha = alpha), adam = list(alpha = alpha, beta1 = 0.5, beta2 = 0.9, eps = 1e-7) ) if (is.null(opt_args)) { opt_args <- list() } if (is.null(opt_args$method)) { opt_args$method <- "adam" } if (!(opt_args$method %in% names(default_opt_args))) { stop("Unknown optimization method '", opt_args$method, "'") } lmerge(default_opt_args[[opt_args$method]], opt_args) } # Takes local radii from the input dimensions and converts to approximate # densities in the output space by mapping them to a vector of a parameters # as used in the UMAP output weight: 1/(1 + a * d^(2 * b)). # Based on testing a rough range of usable a values is 0.01-100. To get that # we want each a value to be the product of the local density of i and j, so # a = sqrt(a_i * a_j) # Also, we want dens_scale to control the spread of a values and for # dens_scale = 0, the vector of a_i gives the user-selected scalar value of # a, so we scale the log of the reciprocal of localr to be within [log(a * # 10^(-2 * dens_scale)) ... log(a * 10^(2 * dens_scale))]. We take the sqrt of the a_i in # this function to avoid repeatedly calling it inside the optimization loop. scale_radii <- function(localr, dens_scale, a) { log_denso <- -log(localr) min_densl <- a * (10^(-2 * dens_scale)) log_min_densl <- log(min_densl) max_densl <- a * (10^(2 * dens_scale)) log_max_densl <- log(max_densl) log_denso_scale <- range_scale(log_denso, log_min_densl, log_max_densl) sqrt(exp(log_denso_scale)) } #' @useDynLib uwot, .registration=TRUE #' @importFrom Rcpp sourceCpp .onUnload <- function(libpath) { library.dynam.unload("uwot", libpath) } # Remove scaling attributes from a matrix # if the `scale` parameter is set then these attributes are assumed to have # been applied by uwot's internals and the equivalent scaling will be applied # to new data in umap_transform. However, these attributes could have been # applied by manually scaling the data before running any code in uwot, in which # case we should not save them as part of the model. This function is called # before applying any other scaling remove_scaling_attrs <- function(X) { uwot_attrs <- c( "scaled:range:min", "scaled:range:max", "scaled:colrange:min", "scaled:colrange:max", "scaled:maxabs", "scaled:nzvcols", "scaled:center", "scaled:scale" ) attrs <- names(attributes(X)) for (attr in attrs) { if (attr %in% uwot_attrs) { attributes(X)[[attr]] <- NULL } } X } uwot/R/transform.R0000644000176200001440000010700114733074465013623 0ustar liggesusers#' Add New Points to an Existing Embedding #' #' Carry out an embedding of new data using an existing embedding. Requires #' using the result of calling \code{\link{umap}} or \code{\link{tumap}} with #' \code{ret_model = TRUE}.
#' #' Note that some settings are incompatible with the production of a UMAP model #' via \code{\link{umap}}: external neighbor data (passed via a list to the #' \code{nn_method} parameter), and factor columns that were #' included in the UMAP calculation via the \code{metric} parameter. In the #' latter case, the model produced is based only on the numeric data. #' A transformation is possible, but factor columns in the new data are ignored. #' #' @param X The new data to be transformed, either a matrix or data frame. Must #' have the same columns in the same order as the input data used to generate #' the \code{model}. #' @param model Data associated with an existing embedding. #' @param nn_method Optional pre-calculated nearest neighbor data. There are #' two supported formats. The first is a list consisting of two elements: #' \itemize{ #' \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix where #' \code{n_vertices} is the number of observations in \code{X}. The contents #' of the matrix should be the integer indexes of the data used to generate #' the \code{model}, which are the \code{n_neighbors}-nearest neighbors of #' the data to be transformed. #' \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix #' containing the distances of the nearest neighbors. #' } #' The second supported format is a sparse distance matrix of type #' \code{dgCMatrix}, with dimensions \code{n_model_vertices x n_vertices}, #' where \code{n_model_vertices} is the number of observations in the original #' data that generated the model. Distances should be arranged by column, i.e. #' a non-zero entry in row \code{j} of the \code{i}th column indicates that #' the \code{j}th observation in the original data used to generate the #' \code{model} is a nearest neighbor of the \code{i}th observation in the new #' data, with the distance given by the value of that element. In this format, #' a different number of neighbors is allowed for each observation, i.e. #' each column can contain a different number of non-zero values. #' Multiple nearest neighbor data (e.g. from two different pre-calculated #' metrics) can be supplied by passing a list containing the nearest neighbor #' data lists as items. #' @param init_weighted If \code{TRUE}, then initialize the embedded coordinates #' of \code{X} using a weighted average of the coordinates of the nearest #' neighbors from the original embedding in \code{model}, where the weights #' used are the edge weights from the UMAP smoothed knn distances. Otherwise, #' use an un-weighted average. #' This parameter will be deprecated and removed at version 1.0 of this #' package. Use the \code{init} parameter as a replacement, replacing #' \code{init_weighted = TRUE} with \code{init = "weighted"} and #' \code{init_weighted = FALSE} with \code{init = "average"}. #' @param search_k Number of nodes to search during the neighbor retrieval. The #' larger \code{search_k}, the more accurate the results, but the longer the search takes. #' The default is the value used in building the \code{model}. #' @param tmpdir Temporary directory to store nearest neighbor indexes during #' nearest neighbor search. Default is \code{\link{tempdir}}. The index is #' only written to disk if \code{n_threads > 1}; otherwise, this parameter is #' ignored. #' @param n_epochs Number of epochs to use during the optimization of the #' embedded coordinates. A value between \code{30 - 100} is a reasonable trade-off #' between speed and thoroughness.
By default, this value is set to one #' third the number of epochs used to build the \code{model}. #' @param n_threads Number of threads to use (except during stochastic gradient #' descent). Default is half the number of concurrent threads supported by the #' system. #' @param n_sgd_threads Number of threads to use during stochastic gradient #' descent. If set to > 1, then be aware that if \code{batch = FALSE}, results #' will \emph{not} be reproducible, even if \code{set.seed} is called with a #' fixed seed before running. Set to \code{"auto"} to use the same value as #' \code{n_threads}. #' @param grain_size Minimum batch size for multithreading. If the number of #' items to process in a thread falls below this number, then no threads will #' be used. Used in conjunction with \code{n_threads} and #' \code{n_sgd_threads}. #' @param verbose If \code{TRUE}, log details to the console. #' @param init how to initialize the transformed coordinates. One of: #' \itemize{ #' \item \code{"weighted"} (The default). Use a weighted average of the #' coordinates of the nearest neighbors from the original embedding in #' \code{model}, where the weights used are the edge weights from the UMAP #' smoothed knn distances. Equivalent to \code{init_weighted = TRUE}. #' \item \code{"average"}. Use the mean average of the coordinates of #' the nearest neighbors from the original embedding in \code{model}. #' Equivalent to \code{init_weighted = FALSE}. #' \item A matrix of user-specified input coordinates, which must have #' dimensions the same as \code{(nrow(X), ncol(model$embedding))}. #' } #' This parameter should be used in preference to \code{init_weighted}. #' @param batch If \code{TRUE}, then embedding coordinates are updated at the #' end of each epoch rather than during the epoch. In batch mode, results are #' reproducible with a fixed random seed even with \code{n_sgd_threads > 1}, #' at the cost of a slightly higher memory use. You may also have to modify #' \code{learning_rate} and increase \code{n_epochs}, so whether this provides #' a speed increase over the single-threaded optimization is likely to be #' dataset and hardware-dependent. If \code{NULL}, the transform will use the #' value provided in the \code{model}, if available. Default: \code{FALSE}. #' @param learning_rate Initial learning rate used in optimization of the #' coordinates. This overrides the value associated with the \code{model}. #' This should be left unspecified under most circumstances. #' @param opt_args A list of optimizer parameters, used when #' \code{batch = TRUE}. The default optimization method used is Adam (Kingma #' and Ba, 2014). #' \itemize{ #' \item \code{method} The optimization method to use. Either \code{"adam"} #' or \code{"sgd"} (stochastic gradient descent). Default: \code{"adam"}. #' \item \code{beta1} (Adam only). The weighting parameter for the #' exponential moving average of the first moment estimator. Effectively the #' momentum parameter. Should be a floating point value between 0 and 1. #' Higher values can smooth oscillatory updates in poorly-conditioned #' situations and may allow for a larger \code{learning_rate} to be #' specified, but too high can cause divergence. Default: \code{0.5}. #' \item \code{beta2} (Adam only). The weighting parameter for the #' exponential moving average of the uncentered second moment estimator. #' Should be a floating point value between 0 and 1. Controls the degree of #' adaptivity in the step-size. Higher values put more weight on previous #' time steps.
Default: \code{0.9}. #' \item \code{eps} (Adam only). Intended to be a small value to prevent #' division by zero, but in practice can also affect convergence due to its #' interaction with \code{beta2}. Higher values reduce the effect of the #' step-size adaptivity and bring the behavior closer to stochastic gradient #' descent with momentum. Typical values are between 1e-8 and 1e-3. Default: #' \code{1e-7}. #' \item \code{alpha} The initial learning rate. Default: the value of the #' \code{learning_rate} parameter. #' } #' If \code{NULL}, the transform will use the value provided in the #' \code{model}, if available. #' @param epoch_callback A function which will be invoked at the end of every #' epoch. Its signature should be: #' \code{(epoch, n_epochs, coords, fixed_coords)}, where: #' \itemize{ #' \item \code{epoch} The current epoch number (between \code{1} and #' \code{n_epochs}). #' \item \code{n_epochs} Number of epochs to use during the optimization of #' the embedded coordinates. #' \item \code{coords} The embedded coordinates as of the end of the current #' epoch, as a matrix with dimensions (N, \code{n_components}). #' \item \code{fixed_coords} The originally embedded coordinates from the #' \code{model}. These are fixed and do not change. A matrix with dimensions #' (Nmodel, \code{n_components}) where \code{Nmodel} is the number of #' observations in the original data. #' } #' @param ret_extra A vector indicating what extra data to return. May contain #' any combination of the following strings: #' \itemize{ #' \item \code{"fgraph"} the high dimensional fuzzy graph (i.e. the fuzzy #' simplicial set of the merged local views of the input data). The graph #' is returned as a sparse matrix of class \link[Matrix]{dgCMatrix-class} #' with dimensions \code{NX} x \code{Nmodel}, where \code{NX} is the number #' of items in the data to transform in \code{X}, and \code{Nmodel} is #' the number of items in the data used to build the UMAP \code{model}. #' A non-zero entry (i, j) gives the membership strength of the edge #' connecting the vertex representing the ith item in \code{X} to the #' jth item in the data used to build the \code{model}. Note that the #' graph is further sparsified by removing edges with sufficiently low #' membership strength that they would not be sampled by the probabilistic #' edge sampling employed for optimization and therefore the number of #' non-zero elements in the matrix is dependent on \code{n_epochs}. If you #' are only interested in the fuzzy input graph (e.g. for clustering), #' setting \code{n_epochs = 0} will avoid any further sparsifying. #' \item \code{"nn"} the nearest neighbor graph for \code{X} with respect to #' the observations in the \code{model}. The graph will be returned as a #' list of two items: \code{idx} a matrix of indices, with as many rows #' as there are items in \code{X} and as many columns as there are nearest #' neighbors to be computed (this value is determined by the \code{model}). #' The indices are those of the rows of the data used to build the #' \code{model}, so they're not necessarily of much use unless you have #' access to that data. The second item, \code{dist}, is a matrix of the #' equivalent distances, with the same dimensions as \code{idx}. #' } #' @param seed Integer seed to use to initialize the random number generator #' state. Combined with \code{n_sgd_threads = 1} or \code{batch = TRUE}, this #' should give consistent output across multiple runs on a given installation.
#' Setting this value is equivalent to calling \code{\link[base]{set.seed}}, #' but it may be more convenient in some situations than having to call a #' separate function. The default is to not set a seed, in which case this #' function uses the behavior specified by the supplied \code{model}: If the #' model specifies a seed, then the model seed will be used to seed the #' random number generator, and results will still be consistent (if #' \code{n_sgd_threads = 1}). If you want to force the seed to not be set, #' even if it is set in \code{model}, set \code{seed = FALSE}. #' @return A matrix of coordinates for \code{X} transformed into the space #' of the \code{model}, or if \code{ret_extra} is specified, a list #' containing: #' \itemize{ #' \item \code{embedding} the matrix of optimized coordinates. #' \item if \code{ret_extra} contains \code{"fgraph"}, an item of the same #' name containing the high-dimensional fuzzy graph as a sparse matrix, of #' type \link[Matrix]{dgCMatrix-class}. #' \item if \code{ret_extra} contains \code{"sigma"}, returns a vector of #' the smooth knn distance normalization terms for each observation as #' \code{"sigma"} and a vector \code{"rho"} containing the largest #' distance to the locally connected neighbors of each observation. #' \item if \code{ret_extra} contains \code{"localr"}, an item of the same #' name containing a vector of the estimated local radii, the sum of #' \code{"sigma"} and \code{"rho"}. #' \item if \code{ret_extra} contains \code{"nn"}, an item of the same name #' containing the nearest neighbors of each item in \code{X} (with respect #' to the items that created the \code{model}). #' } #' @examples #' #' iris_train <- iris[1:100, ] #' iris_test <- iris[101:150, ] #' #' # You must set ret_model = TRUE to return extra data needed #' iris_train_umap <- umap(iris_train, ret_model = TRUE) #' iris_test_umap <- umap_transform(iris_test, iris_train_umap) #' @export umap_transform <- function(X = NULL, model = NULL, nn_method = NULL, init_weighted = TRUE, search_k = NULL, tmpdir = tempdir(), n_epochs = NULL, n_threads = NULL, n_sgd_threads = 0, grain_size = 1, verbose = FALSE, init = "weighted", batch = NULL, learning_rate = NULL, opt_args = NULL, epoch_callback = NULL, ret_extra = NULL, seed = NULL) { if (is.null(n_threads)) { n_threads <- default_num_threads() } if (is.character(n_sgd_threads) && n_sgd_threads == "auto") { n_sgd_threads <- n_threads } if (!is.numeric(n_sgd_threads)) { stop("Unknown n_sgd_threads value: ", n_sgd_threads, " should be a positive integer or 'auto'") } if (is.null(nn_method)) { if (is.null(X)) { stop('argument "X" is missing, with no default') } if (is.null(model)) { stop('argument "model" is missing, with no default') } if (!all_nn_indices_are_loaded(model)) { stop( "cannot use model: NN index is unloaded.", " Try reloading with `load_uwot`" ) } } else { if (!is.null(X)) { tsmessage('argument "nn_method" is provided, ignoring argument "X"') X <- NULL } } if (is.character(model$nn_method) && model$nn_method == "hnsw" && !is_installed("RcppHNSW")) { stop( "This model requires the RcppHNSW package to be installed." ) } if (is.character(model$nn_method) && model$nn_method == "nndescent" && !is_installed("rnndescent")) { stop( "This model requires the rnndescent package to be installed."
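# Illustrative (not run): pre-computed neighbors can replace X entirely.
# `nn` would be a list with `idx` and `dist` matrices, one row per new
# observation, whose indices refer to rows of the model's training data:
#   emb <- umap_transform(X = NULL, model = model, nn_method = nn)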
) } if (is.null(n_epochs)) { n_epochs <- model$n_epochs if (is.null(n_epochs)) { # base the default on the model size: the fuzzy graph has not been built yet if (nrow(model$embedding) <= 10000) { n_epochs <- 100 } else { n_epochs <- 30 } } else { n_epochs <- max(2, round(n_epochs / 3)) } } # Handle setting the random number seed internally: # 1. If the user specifies seed = FALSE, definitely don't set the seed, even # if the model has a seed. # 2. If the user specifies seed = integer, then use that seed, even if the # model has a seed. # 3. If the user does not specify a seed, then use the model seed, if it # exists. Otherwise don't set a seed. Also use this code path if the user # sets seed = TRUE if (is.logical(seed) && !seed) { # do nothing } # handle the seed = TRUE case in this clause too else if (is.logical(seed) || is.null(seed)) { if (!is.null(model$seed)) { tsmessage("Setting model random seed ", model$seed) set.seed(model$seed) } # otherwise no model seed, so do nothing } else { tsmessage("Setting random seed ", seed) set.seed(seed) } if (is.null(search_k)) { search_k <- model$search_k } nn_index <- model$nn_index n_neighbors <- model$n_neighbors local_connectivity <- model$local_connectivity train_embedding <- model$embedding if (!is.matrix(train_embedding)) { # this should only happen if the user set # `n_epochs = 0, init = NULL, ret_model = TRUE` stop( "Invalid embedding coordinates: should be a matrix, but got ", paste0(class(train_embedding), collapse = " ") ) } if (any(is.na(train_embedding))) { stop("Model embedding coordinates contains NA values") } n_train_vertices <- nrow(train_embedding) ndim <- ncol(train_embedding) row.names(train_embedding) <- NULL # uwot model format should be changed so train embedding is stored transposed train_embedding <- t(train_embedding) method <- model$method scale_info <- model$scale_info metric <- model$metric nblocks <- length(metric) pca_models <- model$pca_models if (method == "leopold") { dens_scale <- model$dens_scale aj <- model$ai rad_coeff <- model$rad_coeff } if (is.null(batch)) { if (!is.null(model$batch)) { batch <- model$batch } else { batch <- FALSE } } if (is.null(opt_args)) { if (!is.null(model$opt_args)) { opt_args <- model$opt_args } else { opt_args <- list() } } a <- model$a b <- model$b gamma <- model$gamma if (is.null(learning_rate)) { alpha <- model$alpha } else { alpha <- learning_rate } if (!is.numeric(alpha) || length(alpha) > 1 || alpha < 0) { stop("learning rate should be a positive number, not ", alpha) } negative_sample_rate <- model$negative_sample_rate approx_pow <- model$approx_pow norig_col <- model$norig_col rng_type <- model$rng_type if (is.null(rng_type)) { pcg_rand <- model$pcg_rand if (is.null(pcg_rand)) { rng_type <- "pcg" } else { if (pcg_rand) { rng_type <- "pcg" } else { rng_type <- "tausworthe" } } } num_precomputed_nns <- model$num_precomputed_nns binary_edge_weights <- model$binary_edge_weights if (is.null(binary_edge_weights)) { binary_edge_weights <- FALSE } # the number of vertices in the new data to be transformed n_vertices <- NULL Xnames <- NULL if (!is.null(X)) { if (!(methods::is(X, "data.frame") || methods::is(X, "matrix") || is_sparse_matrix(X))) { stop("Unknown input data format") } if (!is.null(norig_col) && ncol(X) != norig_col) { stop("Incorrect dimensions: X must have ", norig_col, " columns") } if (methods::is(X, "data.frame")) { indexes <- which(vapply(X, is.numeric, logical(1))) if (length(indexes) == 0) { stop("No numeric columns found") } X <- as.matrix(X[, indexes]) } n_vertices <- nrow(X) if (n_vertices < 1) { stop("Not enough rows in X") } if (!is.null(row.names(X))) { Xnames <-
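# Illustrative (not run): the epoch_callback documented above receives
# (epoch, n_epochs, coords, fixed_coords), e.g. to log progress:
#   cb <- function(epoch, n_epochs, coords, fixed_coords) {
#     if (epoch %% 10 == 0) message("epoch ", epoch, "/", n_epochs)
#   }
#   umap_transform(iris_test, model, epoch_callback = cb)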
row.names(X) } checkna(X) } else if (nn_is_precomputed(nn_method)) { # https://github.com/jlmelville/uwot/issues/97 # In the case where the training model didn't use pre-computed neighbors # we treat it like it had one block if (num_precomputed_nns == 0) { num_precomputed_nns <- 1 } # store single nn graph as a one-item list if (num_precomputed_nns == 1 && nn_is_single(nn_method)) { nn_method <- list(nn_method) } if (length(nn_method) != num_precomputed_nns) { stop( "Wrong # pre-computed neighbor data blocks, expected: ", num_precomputed_nns, " but got: ", length(nn_method) ) } if (length(n_neighbors) != num_precomputed_nns) { stop( "Wrong # n_neighbor values (one per neighbor block), expected: ", num_precomputed_nns, " but got: ", length(n_neighbors) ) } for (i in 1:num_precomputed_nns) { graph <- nn_method[[i]] if (is.list(graph)) { check_graph(graph, expected_rows = n_vertices, expected_cols = n_neighbors[[i]], bipartite = TRUE ) if (is.null(n_vertices)) { n_vertices <- nrow(graph$idx) } if (is.null(Xnames)) { Xnames <- nn_graph_row_names(graph) } } else if (is_sparse_matrix(graph)) { # nn graph should have dims n_train_obs x n_test_obs graph <- Matrix::drop0(graph) if (is.null(n_vertices)) { n_vertices <- ncol(graph) } if (is.null(Xnames)) { Xnames <- colnames(graph) } } else { stop("Error: unknown neighbor graph format") } } nblocks <- num_precomputed_nns } if (!is.null(init)) { if (is.logical(init)) { init_weighted <- init } else if (is.character(init)) { init <- tolower(init) if (init == "average") { init_weighted <- FALSE } else if (init == "weighted") { init_weighted <- TRUE } else { stop("Unknown option for init: '", init, "'") } } else if (is.matrix(init)) { indim <- dim(init) xdim <- c(n_vertices, ndim) if (!all(indim == xdim)) { stop( "Initial embedding matrix has wrong dimensions, expected (", xdim[1], ", ", xdim[2], "), but was (", indim[1], ", ", indim[2], ")" ) } if (any(is.na(init))) { stop("Initial embedding matrix coordinates contains NA values") } if (is.null(Xnames) && !is.null(row.names(init))) { Xnames <- row.names(init) } init_weighted <- NULL } else { stop("Invalid input format for 'init'") } } if (is.null(n_vertices)) { stop("Failed to read input correctly: invalid input format") } if (verbose) { x_is_matrix <- methods::is(X, "matrix") tsmessage("Read ", n_vertices, " rows", appendLF = !x_is_matrix) if (x_is_matrix) { tsmessage(" and found ", ncol(X), " numeric columns", time_stamp = FALSE) } } if (!is.null(scale_info)) { X <- apply_scaling(X, scale_info = scale_info, verbose = verbose) } adjusted_local_connectivity <- max(0, local_connectivity - 1.0) graph <- NULL embedding <- NULL localr <- NULL sigma <- NULL rho <- NULL export_nns <- NULL ret_nn <- FALSE if ("nn" %in% ret_extra) { ret_nn <- TRUE export_nns <- list() } need_sigma <- (method == "leopold" && nblocks == 1) || "sigma" %in% ret_extra for (i in 1:nblocks) { tsmessage("Processing block ", i, " of ", nblocks) if (!is.null(X)) { if (nblocks == 1) { Xsub <- X ann <- nn_index } else { subset <- metric[[i]] if (is.list(subset)) { subset <- lsplit_unnamed(subset)$unnamed[[1]] } Xsub <- X[, subset, drop = FALSE] ann <- nn_index[[i]] } if (!is.null(pca_models) && !is.null(pca_models[[as.character(i)]])) { Xsub <- apply_pca( X = Xsub, pca_res = pca_models[[as.character(i)]], verbose = verbose ) } if (is.null(ann$type) || startsWith(ann$type, "annoy")) { nn <- annoy_search( Xsub, k = n_neighbors, ann = ann, search_k = search_k, prep_data = TRUE, tmpdir = tmpdir, n_threads = n_threads, grain_size = grain_size, 
verbose = verbose ) } else if (startsWith(ann$type, "hnsw")) { if (is.list(model$nn_args)) { nn_args <- model$nn_args nn_args_names <- names(nn_args) if ("ef" %in% nn_args_names) { ef <- nn_args$ef } else { ef <- 10 } } nn <- hnsw_search( X, k = n_neighbors, ann = ann, ef = ef, n_threads = n_threads, verbose = verbose ) # We use the L2 HNSW index for Euclidean so we need to process the # distances if (names(model$metric)[[1]] == "euclidean") { nn$dist <- sqrt(nn$dist) } } else if (startsWith(ann$type, "nndescent")) { nn <- nndescent_search( X, k = n_neighbors, ann = ann, nn_args = model$nn_args, n_threads = n_threads, verbose = verbose ) } else { stop("Unknown nn method: ", ann$type) } if (ret_nn) { export_nns[[i]] <- nn names(export_nns)[[i]] <- ann$metric } } else if (is.list(nn_method)) { # otherwise we expect a list of NN graphs nn <- nn_method[[i]] if (ret_nn) { export_nns[[i]] <- nn names(export_nns)[[i]] <- "precomputed" } } else { stop( "Can't transform new data if X is NULL ", "and no sparse distance matrix available" ) } osparse <- NULL if (is_sparse_matrix(nn)) { nn <- Matrix::drop0(nn) osparse <- order_sparse(nn) nn_idxv <- osparse$i + 1 nn_distv <- osparse$x nn_ptr <- osparse$p n_nbrs <- diff(nn_ptr) if (any(n_nbrs < 1)) { stop("All observations need at least one neighbor") } target <- log2(n_nbrs) skip_first <- TRUE } else { nnt <- nn_graph_t(nn) if (length(n_neighbors) == nblocks) { # if model came from multiple different external neighbor data n_nbrs <- n_neighbors[[i]] } else { # multiple internal blocks n_nbrs <- n_neighbors } if (is.na(n_nbrs) || n_nbrs != nrow(nnt$idx)) { # original neighbor data was sparse, but we are using dense knn format # or n_neighbors doesn't match n_nbrs <- nrow(nnt$idx) tsmessage( "Possible mismatch with original vs new neighbor data ", "format, using ", n_nbrs, " nearest neighbors" ) } target <- log2(n_nbrs) nn_ptr <- n_nbrs nn_distv <- as.vector(nnt$dist) nn_idxv <- as.vector(nnt$idx) skip_first <- TRUE } sknn_res <- smooth_knn( nn_dist = nn_distv, nn_ptr = nn_ptr, skip_first = skip_first, target = target, local_connectivity = adjusted_local_connectivity, n_threads = n_threads, grain_size = grain_size, verbose = verbose, ret_sigma = TRUE ) if (is.null(localr) && need_sigma) { # because of the adjusted local connectivity rho is too small compared # to that used to generate the "training" data but sigma is larger, so # let's just stick with sigma + rho even though it tends to be an # underestimate sigma <- sknn_res$sigma rho <- sknn_res$rho localr <- sknn_res$sigma + sknn_res$rho } graph_blockv <- sknn_res$matrix if (is_sparse_matrix(nn)) { graph_block <- Matrix::sparseMatrix( j = osparse$i, p = osparse$p, x = graph_blockv, dims = rev(osparse$dims), index1 = FALSE ) } else { graph_block <- nn_to_sparse(nn_idxv, n_vertices, graph_blockv, self_nbr = FALSE, max_nbr_id = n_train_vertices, by_row = FALSE ) } if (is.logical(init_weighted)) { embedding_block <- init_new_embedding( train_embedding = train_embedding, nn_idx = nn_idxv, n_test_vertices = n_vertices, graph = graph_blockv, weighted = init_weighted, n_threads = n_threads, grain_size = grain_size, verbose = verbose ) if (is.null(embedding)) { embedding <- embedding_block } else { embedding <- embedding + embedding_block } } if (is.null(graph)) { graph <- graph_block } else { graph <- set_intersect(graph, graph_block, weight = 0.5, reset_connectivity = FALSE ) } } if (is.logical(init_weighted)) { if (nblocks > 1) { embedding <- embedding / nblocks } } else { tsmessage("Initializing from 
user-supplied matrix") embedding <- t(init) } if (binary_edge_weights) { tsmessage("Using binary edge weights") graph@x <- rep(1, length(graph@x)) } if (batch) { # This is the same arrangement as Python UMAP graph <- Matrix::t(graph) } if (n_epochs > 0) { graph@x[graph@x < max(graph@x) / n_epochs] <- 0 graph <- Matrix::drop0(graph) # Edges are (i->j) where i (head) is from the new data and j (tail) is # in the model data # Unlike embedding of initial data, the edge list is therefore NOT symmetric # i.e. the presence of (i->j) does NOT mean (j->i) is also present because # i and j now come from different data if (batch) { # ordered indices of the new data nodes. Coordinates are updated # during optimization positive_head <- Matrix::which(graph != 0, arr.ind = TRUE)[, 2] - 1 # unordered indices of the model nodes (some may not have any incoming # edges), these coordinates will NOT update during the optimization positive_tail <- graph@i } else { # unordered indices of the new data nodes. Coordinates are updated # during optimization positive_head <- graph@i # ordered indices of the model nodes (some may not have any incoming edges) # these coordinates will NOT update during the optimization positive_tail <- Matrix::which(graph != 0, arr.ind = TRUE)[, 2] - 1 } n_head_vertices <- ncol(embedding) n_tail_vertices <- n_train_vertices # if batch = TRUE points into the head (length == n_tail_vertices) # if batch = FALSE, points into the tail (length == n_head_vertices) positive_ptr <- graph@p epochs_per_sample <- make_epochs_per_sample(graph@x, n_epochs) tsmessage( "Commencing optimization for ", n_epochs, " epochs, with ", length(positive_head), " positive edges", pluralize("thread", n_sgd_threads, " using") ) method <- tolower(method) if (method == "leopold") { # Use the linear model 2 log ai = -m log(localr) + c ai <- exp(0.5 * ((-log(localr) * rad_coeff[2]) + rad_coeff[1])) # Prevent too-small/large aj min_ai <- min(sqrt(a * 10^(-2 * dens_scale)), 0.1) ai[ai < min_ai] <- min_ai max_ai <- sqrt(a * 10^(2 * dens_scale)) ai[ai > max_ai] <- max_ai method <- "leopold2" } method_args <- switch(method, umap = list(a = a, b = b, gamma = gamma, approx_pow = approx_pow), tumap = list(gamma = gamma), leopold2 = list(ai = ai, aj = aj, b = b, ndim = ndim), list() ) full_opt_args <- get_opt_args(opt_args, alpha) embedding <- optimize_layout_r( head_embedding = embedding, tail_embedding = train_embedding, positive_head = positive_head, positive_tail = positive_tail, positive_ptr = positive_ptr, n_epochs = n_epochs, n_head_vertices = n_head_vertices, n_tail_vertices = n_tail_vertices, epochs_per_sample = epochs_per_sample, method = tolower(method), method_args = method_args, initial_alpha = alpha / 4.0, opt_args = full_opt_args, negative_sample_rate = negative_sample_rate, rng_type = rng_type, batch = batch, n_threads = n_sgd_threads, grain_size = grain_size, move_other = FALSE, verbose = verbose, epoch_callback = epoch_callback ) } embedding <- t(embedding) tsmessage("Finished") if (!is.null(Xnames)) { row.names(embedding) <- Xnames } if (length(ret_extra) > 0) { res <- list(embedding = embedding) for (name in ret_extra) { if (name == "fgraph") { if (batch) { # #129: we transposed graph in the batch=TRUE case (#118) but need to # transpose back for export graph <- Matrix::t(graph) } res$fgraph <- graph } if (name == "sigma") { res$sigma <- sigma res$rho <- rho } if (name == "localr" && !is.null(localr)) { res$localr <- localr } if (ret_nn && !is.null(export_nns)) { res$nn <- export_nns } } } else { res <- 
embedding } res } init_new_embedding <- function(train_embedding, nn_idx, n_test_vertices, graph, weighted = TRUE, n_threads = NULL, grain_size = 1, verbose = FALSE) { if (is.null(n_threads)) { n_threads <- default_num_threads() } avtype <- ifelse(weighted, "weighted ", "") tsmessage( "Initializing by ", avtype, "average of neighbor coordinates", pluralize("thread", n_threads, " using") ) nn_weights <- NULL if (weighted) { nn_weights <- graph } init_transform_parallel( train_embedding = train_embedding, nn_index = nn_idx, n_test_vertices = n_test_vertices, nn_weights = nn_weights, n_threads = n_threads, grain_size = grain_size ) } # Pure R implementation of (weighted) average. Superseded by C++ implementations init_transform <- function(train_embedding, nn_index, weights = NULL) { nr <- nrow(nn_index) nc <- ncol(train_embedding) embedding <- matrix(nrow = nr, ncol = nc) if (is.null(weights)) { for (i in 1:nr) { nbr_embedding <- train_embedding[nn_index[i, ], ] embedding[i, ] <- apply(nbr_embedding, 2, mean) } } else { for (i in 1:nr) { nbr_embedding <- train_embedding[nn_index[i, ], ] nbr_weights <- weights[nn_index[i, ], i] embedding[i, ] <- apply( nbr_embedding, 2, function(x) { stats::weighted.mean(x, nbr_weights) } ) } } embedding } apply_scaling <- function(X, scale_info, verbose = FALSE) { if (!is.null(scale_info[["scaled:range:min"]])) { tsmessage("Applying training data range scaling") X <- X - scale_info[["scaled:range:min"]] X <- X / scale_info[["scaled:range:max"]] } else if (!is.null(scale_info[["scaled:maxabs"]])) { tsmessage("Applying training data max-abs scaling") X <- scale(X, center = scale_info[["scaled:center"]], scale = FALSE) X <- X / scale_info[["scaled:maxabs"]] } else if (!is.null(scale_info[["scaled:colrange:min"]])) { tsmessage("Applying training data column range scaling") X <- sweep(X, 2, scale_info[["scaled:colrange:min"]]) X <- sweep(X, 2, scale_info[["scaled:colrange:max"]], `/`) } else { tsmessage("Applying training data column filtering/scaling") X <- X[, scale_info[["scaled:nzvcols"]]] X <- scale(X, center = scale_info[["scaled:center"]], scale = scale_info[["scaled:scale"]] ) } X } # Apply a previously calculated set of PCA rotations apply_pca <- function(X, pca_res, verbose = FALSE) { tsmessage("Applying PCA reducing to ", ncol(pca_res$rotation), " dimensions") if (!is.null(pca_res$center)) { X <- sweep(X, 2, pca_res$center) } X %*% pca_res$rotation } uwot/R/umap2.R0000644000176200001440000013363214735021136012632 0ustar liggesusers#' Dimensionality Reduction with UMAP #' #' Carry out dimensionality reduction of a dataset using the Uniform Manifold #' Approximation and Projection (UMAP) method (McInnes et al., 2018). #' #' This function behaves like \code{\link{umap}} except with some updated #' defaults that make it behave more like the Python implementation and which #' cannot be added to \code{\link{umap}} without breaking backwards #' compatibility. In addition: #' #' \itemize{ #' \item if \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} is #' installed, it will be used in preference to Annoy if a compatible metric #' is requested. #' \item if RcppHNSW is not present, but #' \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is #' installed, it will be used in preference to Annoy if a compatible metric #' is requested. #' \item if \code{batch = TRUE} then the default \code{n_sgd_threads} is set #' to the same value as \code{n_threads}.
#' \item if the input data \code{X} is a sparse matrix, it is interpreted #' similarly to a dense matrix or dataframe, and not as a distance matrix. #' This requires the \code{rnndescent} package to be installed. #' } #' #' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}}, #' \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}. #' Matrix and data frames should contain one observation per row. Data frames #' will have any non-numeric columns removed, although factor columns will be #' used if explicitly included via \code{metric} (see the help for #' \code{metric} for details). Sparse matrices must be in the \code{dgCMatrix} #' format, and you must also install #' \href{https://cran.r-project.org/package=rnndescent}{rnndescent} #' and set \code{nn_method = "nndescent"}. #' \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is #' passed to \code{nn_method}, and \code{init} is not \code{"spca"} or #' \code{"pca"}. #' @param n_neighbors The size of local neighborhood (in terms of number of #' neighboring sample points) used for manifold approximation. Larger values #' result in more global views of the manifold, while smaller values result in #' more local data being preserved. In general values should be in the range #' \code{2} to \code{100}. #' @param n_components The dimension of the space to embed into. This defaults #' to \code{2} to provide easy visualization, but can reasonably be set to any #' integer value in the range \code{2} to \code{100}. #' @param metric Type of distance metric to use to find nearest neighbors. For #' \code{nn_method = "annoy"} this can be one of: #' \itemize{ #' \item \code{"euclidean"} (the default) #' \item \code{"cosine"} #' \item \code{"manhattan"} #' \item \code{"hamming"} #' \item \code{"correlation"} (a distance based on the Pearson correlation) #' \item \code{"categorical"} (see below) #' } #' For \code{nn_method = "hnsw"} this can be one of: #' \itemize{ #' \item \code{"euclidean"} #' \item \code{"cosine"} #' \item \code{"correlation"} #' } #' If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is #' installed and \code{nn_method = "nndescent"} is specified then many more #' metrics are available, including: #' \itemize{ #' \item \code{"braycurtis"} #' \item \code{"canberra"} #' \item \code{"chebyshev"} #' \item \code{"dice"} #' \item \code{"hamming"} #' \item \code{"hellinger"} #' \item \code{"jaccard"} #' \item \code{"jensenshannon"} #' \item \code{"kulsinski"} #' \item \code{"rogerstanimoto"} #' \item \code{"russellrao"} #' \item \code{"sokalmichener"} #' \item \code{"sokalsneath"} #' \item \code{"spearmanr"} #' \item \code{"symmetrickl"} #' \item \code{"tsss"} #' \item \code{"yule"} #' } #' For more details see the package documentation of \code{rnndescent}. #' For \code{nn_method = "fnn"}, the distance metric is always "euclidean". #' #' If \code{X} is a data frame or matrix, then multiple metrics can be #' specified, by passing a list to this argument, where the name of each item in #' the list is one of the metric names above. The value of each list item should #' be a vector giving the names or integer ids of the columns to be included in #' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}. #' #' Each metric calculation results in a separate fuzzy simplicial set, which are #' intersected together to produce the final set. Metric names can be repeated.
#' Because non-numeric columns are removed from the data frame, it is safer to #' use column names than integer ids. #' #' Factor columns can also be used by specifying the metric name #' \code{"categorical"}. Factor columns are treated differently from numeric #' columns and although multiple factor columns can be specified in a vector, #' each factor column specified is processed individually. If you specify #' a non-factor column, it will be coerced to a factor. #' #' For a given data block, you may override the \code{pca} and \code{pca_center} #' arguments for that block, by providing a list with one unnamed item #' containing the column names or ids, and then any of the \code{pca} or #' \code{pca_center} overrides as named items, e.g. \code{metric = #' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This #' exists to allow mixed binary and real-valued data to be included and to have #' PCA applied to both, but with centering applied only to the real-valued data #' (it is typical not to apply centering to binary data before PCA is applied). #' @param n_epochs Number of epochs to use during the optimization of the #' embedded coordinates. By default, this value is set to \code{500} for #' datasets containing 10,000 vertices or less, and \code{200} otherwise. #' If \code{n_epochs = 0}, then coordinates determined by \code{"init"} will #' be returned. #' @param scale Scaling to apply to \code{X} if it is a data frame or matrix: #' \itemize{ #' \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling. #' \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to #' zero mean and variance 1. #' \item{\code{"maxabs"}} Center each column to mean 0, then divide each #' element by the maximum absolute value over the entire matrix. #' \item{\code{"range"}} Range scale the entire matrix, so the smallest #' element is 0 and the largest is 1. #' \item{\code{"colrange"}} Scale each column in the range (0,1). #' } #' For UMAP, the default is \code{"none"}. #' @param learning_rate Initial learning rate used in optimization of the #' coordinates. #' @param init Type of initialization for the coordinates. Options are: #' \itemize{ #' \item \code{"spectral"} Spectral embedding using the normalized Laplacian #' of the fuzzy 1-skeleton, with Gaussian noise added. #' \item \code{"normlaplacian"}. Spectral embedding using the normalized #' Laplacian of the fuzzy 1-skeleton, without noise. #' \item \code{"random"}. Coordinates assigned using a uniform random #' distribution between -10 and 10. #' \item \code{"lvrandom"}. Coordinates assigned using a Gaussian #' distribution with standard deviation 1e-4, as used in LargeVis #' (Tang et al., 2016) and t-SNE. #' \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap #' (Belkin and Niyogi, 2002). #' \item \code{"pca"}. The first two principal components from PCA of #' \code{X} if \code{X} is a data frame, and from a 2-dimensional classical #' MDS if \code{X} is of class \code{"dist"}. #' \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled #' so the standard deviation is 1e-4, to give a distribution similar to that #' used in t-SNE. This is an alias for \code{init = "pca", init_sdev = #' 1e-4}. #' \item \code{"agspectral"} An "approximate global" modification of #' \code{"spectral"} which sets all edges in the graph to a value of 1, and then #' sets a random number of edges (\code{negative_sample_rate} edges per #' vertex) to 0.1, to approximate the effect of non-local affinities.
#' \item A matrix of initial coordinates. #' } #' For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"}, #' \code{"laplacian"}, \code{"agspectral"}), if more than one connected #' component is identified, no spectral initialization is attempted. Instead #' a PCA-based initialization is used. If \code{verbose = TRUE} the #' number of connected components are logged to the console. The existence of #' multiple connected components implies that a global view of the data cannot #' be attained with this initialization. Increasing the value of #' \code{n_neighbors} may help. #' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized #' coordinates (including any user-supplied matrix) to this standard #' deviation. By default (\code{init_sdev = "range"}), each column of the #' initial coordinates is range scaled between 0-10. Scaling the input may #' help if the unscaled versions result in initial coordinates with large #' inter-point distances or outliers. This usually results in small gradients #' during optimization and very little progress being made to the layout. #' Shrinking the initial embedding by rescaling can help under these #' circumstances. Scaling the result of \code{init = "pca"} is usually #' recommended, and \code{init = "spca"} is an alias for \code{init = "pca", #' init_sdev = 1e-4}, but for the spectral initializations the scaled versions #' usually aren't necessary unless you are using a large value of #' \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). #' @param spread The effective scale of embedded points. In combination with #' \code{min_dist}, this determines how clustered/clumped the embedded points #' are. #' @param min_dist The effective minimum distance between embedded points. #' Smaller values will result in a more clustered/clumped embedding where #' nearby points on the manifold are drawn closer together, while larger #' values will result on a more even dispersal of points. The value should be #' set relative to the \code{spread} value, which determines the scale at #' which embedded points will be spread out. #' @param set_op_mix_ratio Interpolate between (fuzzy) union and intersection as #' the set operation used to combine local fuzzy simplicial sets to obtain a #' global fuzzy simplicial sets. Both fuzzy set operations use the product #' t-norm. The value of this parameter should be between \code{0.0} and #' \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while #' \code{0.0} will use a pure fuzzy intersection. #' @param local_connectivity The local connectivity required -- i.e. the number #' of nearest neighbors that should be assumed to be connected at a local #' level. The higher this value the more connected the manifold becomes #' locally. In practice this should be not more than the local intrinsic #' dimension of the manifold. #' @param bandwidth The effective bandwidth of the kernel if we view the #' algorithm as similar to Laplacian Eigenmaps. Larger values induce more #' connectivity and a more global view of the data, smaller values concentrate #' more locally. #' @param repulsion_strength Weighting applied to negative samples in low #' dimensional embedding optimization. Values higher than one will result in #' greater weight being given to negative samples. #' @param negative_sample_rate The number of negative edge/1-simplex samples to #' use per positive edge/1-simplex sample in optimizing the low dimensional #' embedding.
#' @param a More specific parameters controlling the embedding. If \code{NULL} #' these values are set automatically as determined by \code{min_dist} and #' \code{spread}. #' @param b More specific parameters controlling the embedding. If \code{NULL} #' these values are set automatically as determined by \code{min_dist} and #' \code{spread}. #' @param nn_method Method for finding nearest neighbors. Options are: #' \itemize{ #' \item \code{"fnn"}. Use exact nearest neighbors via the #' \href{https://cran.r-project.org/package=FNN}{FNN} package. #' \item \code{"annoy"} Use approximate nearest neighbors via the #' \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package. #' \item \code{"hnsw"} Use approximate nearest neighbors with the #' Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin, #' 2018) via the #' \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package. #' \code{RcppHNSW} is not a dependency of this package: this option is #' only available if you have installed \code{RcppHNSW} yourself. Also, #' HNSW only supports the following arguments for \code{metric} and #' \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and #' \code{"correlation"}. #' \item \code{"nndescent"} Use approximate nearest neighbors with the #' Nearest Neighbor Descent method (Dong et al., 2011) via the #' \href{https://cran.r-project.org/package=rnndescent}{rnndescent} #' package. \code{rnndescent} is not a dependency of this package: this #' option is only available if you have installed \code{rnndescent} #' yourself. #' } #' By default, if \code{X} has fewer than 4,096 vertices, the exact nearest #' neighbors are found. Otherwise, approximate nearest neighbors are used. #' You may also pass pre-calculated nearest neighbor data to this argument. It #' must be one of two formats, either a list consisting of two elements: #' \itemize{ #' \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix #' containing the integer indexes of the nearest neighbors in \code{X}. Each #' vertex is considered to be its own nearest neighbor, i.e. #' \code{idx[, 1] == 1:n_vertices}. #' \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix #' containing the distances of the nearest neighbors. #' } #' or a sparse distance matrix of type \code{dgCMatrix}, with dimensions #' \code{n_vertices x n_vertices}. Distances should be arranged by column, #' i.e. a non-zero entry in row \code{j} of the \code{i}th column indicates #' that the \code{j}th observation in \code{X} is a nearest neighbor of the #' \code{i}th observation with the distance given by the value of that #' element. #' The \code{n_neighbors} parameter is ignored when using precomputed #' nearest neighbor data. If using the sparse distance matrix input, each #' column can contain a different number of neighbors. #' @param n_trees Number of trees to build when constructing the nearest #' neighbor index. The more trees specified, the larger the index, but the #' better the results. With \code{search_k}, determines the accuracy of the #' Annoy nearest neighbor search. Only used if the \code{nn_method} is #' \code{"annoy"}. Sensible values are between \code{10} and \code{100}. #' @param search_k Number of nodes to search during the neighbor retrieval. The #' larger k, the more accurate the results, but the longer the search takes. #' With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor #' search. Only used if the \code{nn_method} is \code{"annoy"}.
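#' As an illustrative sketch (values here are plausible but not tuned
#' recommendations; \code{X} is assumed to be a numeric matrix), accuracy can
#' be traded for search time by increasing \code{n_trees} and \code{search_k}
#' together:
#' \preformatted{
#' # defaults are n_trees = 50 and search_k = 2 * n_neighbors * n_trees
#' emb <- umap2(X, nn_method = "annoy", n_trees = 100,
#'              search_k = 4 * 15 * 100)
#' }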
#' @param nn_args A list containing additional arguments to pass to the nearest #' neighbor method. For \code{nn_method = "annoy"}, you can specify #' \code{"n_trees"} and \code{"search_k"}, and these will override the #' \code{n_trees} and \code{search_k} parameters. #' For \code{nn_method = "hnsw"}, you may specify the following arguments: #' \itemize{ #' \item \code{M} The maximum number of neighbors to keep for each vertex. #' Reasonable values are \code{2} to \code{100}. Higher values give better #' recall at the cost of more memory. Default value is \code{16}. #' \item \code{ef_construction} A positive integer specifying the size of #' the dynamic list used during index construction. A higher value will #' provide better results at the cost of a longer time to build the index. #' Default is \code{200}. #' \item \code{ef} A positive integer specifying the size of the dynamic #' list used during search. This cannot be smaller than \code{n_neighbors} #' and cannot be higher than the number of items in the index. Default is #' \code{10}. #' } #' For \code{nn_method = "nndescent"}, you may specify the following #' arguments: #' \itemize{ #' \item \code{n_trees} The number of trees to use in a random projection #' forest to initialize the search. A larger number will give more accurate #' results at the cost of a longer computation time. The default of #' \code{NULL} means that the number is chosen based on the number of #' observations in \code{X}. #' \item \code{max_candidates} The number of potential neighbors to explore #' per iteration. By default, this is set to \code{n_neighbors} or \code{60}, #' whichever is smaller. A larger number will give more accurate results at #' the cost of a longer computation time. #' \item \code{n_iters} The number of iterations to run the search. A larger #' number will give more accurate results at the cost of a longer computation #' time. By default, this will be chosen based on the number of observations #' in \code{X}. You may also need to modify the convergence criterion #' \code{delta}. #' \item \code{delta} The minimum relative change in the neighbor graph #' allowed before early stopping. Should be a value between 0 and 1. The #' smaller the value, the less progress between iterations is needed for the #' search to continue, so it may run for longer. The default value of #' \code{0.001} means that at least 0.1% of the neighbor graph must be updated #' at each iteration. #' \item \code{init} How to initialize the nearest neighbor descent. By #' default this is set to \code{"tree"} and uses a random projection forest. #' If you set this to \code{"rand"}, then a random selection is used. Usually #' this is less accurate than using RP trees, but for high-dimensional cases, #' there may be little difference in the quality of the initialization and #' random initialization will be a lot faster. If you set this to #' \code{"rand"}, then the \code{n_trees} parameter is ignored. #' \item \code{pruning_degree_multiplier} The maximum number of edges per node #' to retain in the search graph, relative to \code{n_neighbors}. A larger #' value will give more accurate results at the cost of a longer computation #' time. Default is \code{1.5}. This parameter only affects neighbor search #' when transforming new data with \code{\link{umap_transform}}. #' \item \code{epsilon} Controls the degree of the back-tracking when #' traversing the search graph. Setting this to \code{0.0} will do a greedy #' search with no back-tracking.
A larger value will give more accurate #' results at the cost of a longer computation time. Default is \code{0.1}. #' This parameter only affects neighbor search when transforming new data with #' \code{\link{umap_transform}}. #' \item \code{max_search_fraction} Specifies the maximum fraction of the #' search graph to traverse. By default, this is set to \code{1.0}, so the #' entire graph (i.e. all items in \code{X}) may be visited. You may want to #' set this to a smaller value if you have a very large dataset (in #' conjunction with \code{epsilon}) to avoid an inefficient exhaustive search #' of the data in \code{X}. This parameter only affects neighbor search when #' transforming new data with \code{\link{umap_transform}}. #' } #' @param approx_pow If \code{TRUE}, use an approximation to the power function #' in the UMAP gradient, from #' \url{https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/}. #' Ignored if \code{dens_scale} is non-\code{NULL}. #' @param y Optional target data for supervised dimension reduction. Can be a #' vector, matrix or data frame. Use the \code{target_metric} parameter to #' specify the metrics to use, using the same syntax as \code{metric}. Usually #' either a single numeric or factor column is used, but more complex formats #' are possible. The following types are allowed: #' \itemize{ #' \item Factor columns with the same length as \code{X}. \code{NA} is #' allowed for any observation with an unknown level, in which case #' UMAP operates as a form of semi-supervised learning. Each column is #' treated separately. #' \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the #' parameter \code{target_n_neighbors} to set the number of neighbors used #' with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors, #' numeric columns are grouped into one block unless \code{target_metric} #' specifies otherwise. For example, if you wish columns \code{a} and #' \code{b} to be treated separately, specify #' \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise, #' the data will be effectively treated as a matrix with two columns. #' \item Nearest neighbor data, consisting of a list of two matrices, #' \code{idx} and \code{dist}. These represent the precalculated nearest #' neighbor indices and distances, respectively. This #' is the same format as that expected for precalculated data in #' \code{nn_method}. This format assumes that the underlying data was a #' numeric vector. Any user-supplied value of the \code{target_n_neighbors} #' parameter is ignored in this case, because the number of columns in #' the matrices is used for the value. Multiple nearest neighbor data using #' different metrics can be supplied by passing a list of these lists. #' } #' Unlike \code{X}, all factor columns included in \code{y} are automatically #' used. #' @param target_n_neighbors Number of nearest neighbors to use to construct the #' target simplicial set. Default value is \code{n_neighbors}. Applies only if #' \code{y} is non-\code{NULL} and \code{numeric}. #' @param target_metric The metric used to measure distance for \code{y} if #' using supervised dimension reduction. Used only if \code{y} is numeric. #' @param target_weight Weighting factor between data topology and target #' topology. A value of 0.0 weights entirely on data, a value of 1.0 weights #' entirely on target. The default of 0.5 balances the weighting equally #' between data and target. Only applies if \code{y} is non-\code{NULL}.
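#' For example (a minimal sketch of supervised and semi-supervised use, based
#' on the \code{iris} dataset):
#' \preformatted{
#' # supervised: a factor target steers the embedding towards the labels
#' emb <- umap2(iris[, 1:4], y = iris$Species, target_weight = 0.5)
#'
#' # semi-supervised: mark unknown labels as NA
#' y_partial <- iris$Species
#' y_partial[sample(length(y_partial), 75)] <- NA
#' emb <- umap2(iris[, 1:4], y = y_partial)
#' }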
#' @param pca If set to a positive integer value, reduce data to this number of #' columns using PCA. Not applied if the distance \code{metric} is #' \code{"hamming"}, or if the dimensions of the data are not larger than the #' number specified (i.e. the number of rows and columns must be larger than the #' value of this parameter). If you have > 100 columns in a data frame or #' matrix, reducing the number of columns in this way may substantially #' increase the performance of the nearest neighbor search at the cost of a #' potential decrease in accuracy. In many t-SNE applications, a value of 50 #' is recommended, although there's no guarantee that this is appropriate for #' all settings. #' @param pca_center If \code{TRUE}, center the columns of \code{X} before #' carrying out PCA. For binary data, it's recommended to set this to #' \code{FALSE}. #' @param pca_method Method to carry out any PCA dimensionality reduction when #' the \code{pca} parameter is specified. Allowed values are: #' \itemize{ #' \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the #' \href{https://cran.r-project.org/package=irlba}{irlba} package. #' \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from #' the \href{https://cran.r-project.org/package=irlba}{irlba} package. #' This is likely to give much faster but potentially less accurate results #' than using \code{"irlba"}. For the purposes of nearest neighbor #' calculation and coordinates initialization, any loss of accuracy doesn't #' seem to matter much. #' \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}} #' from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr} #' package. The SVD methods used in \code{bigstatsr} may be faster on #' systems without access to efficient linear algebra libraries (e.g. #' Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of #' uwot: if you choose to use this package for PCA, you \emph{must} install #' it yourself. #' \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is #' likely to be slow for all but the smallest datasets. #' \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than #' 50% of the full set of singular vectors would be calculated, in which #' case \code{"svd"} is used. #' } #' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill, #' 2014) during optimization. Otherwise, use the faster (but probably less #' statistically good) Tausworthe "taus88" generator. The default is #' \code{TRUE}. This parameter has been superseded by \code{rng_type} -- if #' both are set, \code{rng_type} takes precedence. #' @param rng_type The type of random number generator to use during #' optimization. One of: #' \itemize{ #' \item{\code{"pcg"}}. Use the PCG random number generator (O'Neill, 2014). #' \item{\code{"tausworthe"}}. Use the Tausworthe "taus88" generator. #' \item{\code{"deterministic"}}. Use a deterministic number generator. This #' isn't actually random, but may provide enough variation in the negative #' sampling to give a good embedding and can provide a noticeable speed-up. #' } #' For backwards compatibility, by default this is unset and the choice of #' \code{pcg_rand} is used (making \code{"pcg"} the effective default). #' @param fast_sgd If \code{TRUE}, then the following combination of parameters #' is set: \code{pcg_rand = TRUE}, \code{n_sgd_threads = "auto"} and #' \code{approx_pow = TRUE}. The default is \code{FALSE}.
Setting this to #' \code{TRUE} will speed up the stochastic optimization phase, but give a #' potentially less accurate embedding, one which will not be exactly #' reproducible even with a fixed seed. For visualization, \code{fast_sgd = #' TRUE} will give perfectly good results. For more generic dimensionality #' reduction, it's safer to leave \code{fast_sgd = FALSE}. If \code{fast_sgd = #' TRUE}, then user-supplied values of \code{pcg_rand}, \code{n_sgd_threads}, #' and \code{approx_pow} are ignored. #' @param batch If \code{TRUE}, then embedding coordinates are updated at the #' end of each epoch rather than during the epoch. In batch mode, results are #' reproducible with a fixed random seed even with \code{n_sgd_threads > 1}, #' at the cost of a slightly higher memory use. You may also have to modify #' \code{learning_rate} and increase \code{n_epochs}, so whether this provides #' a speed increase over the single-threaded optimization is likely to be #' dataset and hardware-dependent. #' @param ret_model If \code{TRUE}, then return extra data that can be used to #' add new data to an existing embedding via \code{\link{umap_transform}}. The #' embedded coordinates are returned as the list item \code{embedding}. If #' \code{FALSE}, just return the coordinates. This parameter can be used in #' conjunction with \code{ret_nn} and \code{ret_extra}. Note that some #' settings are incompatible with the production of a UMAP model: external #' neighbor data (passed via a list to \code{nn_method}), and factor columns #' that were included via the \code{metric} parameter. In the latter case, the #' model produced is based only on the numeric data. A transformation using #' new data is possible, but the factor columns in the new data are ignored. #' Note that setting \code{ret_model = TRUE} forces the use of the approximate #' nearest neighbors method. Because small datasets would otherwise use exact #' nearest neighbor calculations, setting \code{ret_model = TRUE} means that #' different results may be returned for small datasets in terms of both the #' returned nearest neighbors (if requested) and the final embedded #' coordinates, compared to \code{ret_model = FALSE}, even if the random #' number seed is fixed. To avoid this, explicitly set #' \code{nn_method = "annoy"} in the \code{ret_model = FALSE} case. #' @param ret_nn If \code{TRUE}, then in addition to the embedding, also return #' nearest neighbor data that can be used as input to \code{nn_method} to #' avoid the overhead of repeatedly calculating the nearest neighbors when #' manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs}, #' \code{init}). See the "Value" section for the names of the list items. If #' \code{FALSE}, just return the coordinates. Note that the nearest neighbors #' could be sensitive to data scaling, so be wary of reusing nearest neighbor #' data if modifying the \code{scale} parameter. This parameter can be used in #' conjunction with \code{ret_model} and \code{ret_extra}. #' @param ret_extra A vector indicating what extra data to return. May contain #' any combination of the following strings: #' \itemize{ #' \item \code{"model"} Same as setting \code{ret_model = TRUE}. #' \item \code{"nn"} Same as setting \code{ret_nn = TRUE}. #' \item \code{"fgraph"} the high dimensional fuzzy graph (i.e. the fuzzy #' simplicial set of the merged local views of the input data).
The graph #' is returned as a sparse symmetric N x N matrix of class #' \link[Matrix]{dgCMatrix-class}, where a non-zero entry (i, j) gives the #' membership strength of the edge connecting vertex i and vertex j. This #' can be considered analogous to the input probability (or similarity or #' affinity) used in t-SNE and LargeVis. Note that the graph is further #' sparsified by removing edges with sufficiently low membership strength #' that they would not be sampled by the probabilistic edge sampling #' employed for optimization and therefore the number of non-zero elements #' in the matrix is dependent on \code{n_epochs}. If you are only #' interested in the fuzzy input graph (e.g. for clustering), setting #' \code{n_epochs = 0} will avoid any further sparsifying. #' Be aware that setting \code{binary_edge_weights = TRUE} will affect this #' graph (all non-zero edge weights will be 1). #' \item \code{"sigma"} the normalization value for each observation in the #' dataset when constructing the smoothed distances to each of its #' neighbors. This gives some sense of the local density of each #' observation in the high dimensional space: higher values of #' \code{sigma} indicate a higher dispersion or lower density. #' \item \code{"localr"} the estimated local radius for each observation, #' the sum of \code{"sigma"} and \code{"rho"} (see the "Value" section). #' } #' @param n_threads Number of threads to use (except during stochastic gradient #' descent). Default is half the number of concurrent threads supported by the #' system. For nearest neighbor search, only applies if #' \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index #' will be temporarily written to disk in the location determined by #' \code{\link[base]{tempfile}}. #' @param n_sgd_threads Number of threads to use during stochastic gradient #' descent. If set to > 1, then be aware that if \code{batch = FALSE}, results #' will \emph{not} be reproducible, even if \code{set.seed} is called with a #' fixed seed before running. Set to \code{"auto"} to use the same value as #' \code{n_threads}. Default is to use only one thread, unless #' \code{batch = TRUE}, in which case \code{"auto"} is used. #' @param grain_size The minimum amount of work to do on each thread. If this #' value is set high enough, then fewer than \code{n_threads} or #' \code{n_sgd_threads} threads will be used for processing, which might give a #' performance improvement if the overhead of thread management and context #' switching outweighs the improvement due to concurrent processing. #' This should be left at the default (\code{1}), in which case work will be #' spread evenly over all the threads specified. #' @param tmpdir Temporary directory to store nearest neighbor indexes during #' nearest neighbor search. Default is \code{\link{tempdir}}. The index is #' only written to disk if \code{n_threads > 1} and #' \code{nn_method = "annoy"}; otherwise, this parameter is ignored. #' @param verbose If \code{TRUE}, log details to the console. #' @param opt_args A list of optimizer parameters, used when #' \code{batch = TRUE}. The default optimization method used is Adam (Kingma #' and Ba, 2014). #' \itemize{ #' \item \code{method} The optimization method to use. Either \code{"adam"} #' or \code{"sgd"} (stochastic gradient descent). Default: \code{"adam"}. #' \item \code{beta1} (Adam only). The weighting parameter for the #' exponential moving average of the first moment estimator. Effectively the #' momentum parameter. Should be a floating point value between 0 and 1.
#' Higher values can smooth oscillatory updates in poorly-conditioned #' situations and may allow for a larger \code{learning_rate} to be #' specified, but too high a value can cause divergence. Default: \code{0.5}. #' \item \code{beta2} (Adam only). The weighting parameter for the #' exponential moving average of the uncentered second moment estimator. #' Should be a floating point value between 0 and 1. Controls the degree of #' adaptivity in the step-size. Higher values put more weight on previous #' time steps. Default: \code{0.9}. #' \item \code{eps} (Adam only). Intended to be a small value to prevent #' division by zero, but in practice can also affect convergence due to its #' interaction with \code{beta2}. Higher values reduce the effect of the #' step-size adaptivity and bring the behavior closer to stochastic gradient #' descent with momentum. Typical values are between 1e-8 and 1e-3. Default: #' \code{1e-7}. #' \item \code{alpha} The initial learning rate. Default: the value of the #' \code{learning_rate} parameter. #' } #' @param epoch_callback A function which will be invoked at the end of every #' epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where: #' \itemize{ #' \item \code{epoch} The current epoch number (between \code{1} and #' \code{n_epochs}). #' \item \code{n_epochs} Number of epochs to use during the optimization of #' the embedded coordinates. #' \item \code{coords} The embedded coordinates as of the end of the current #' epoch, as a matrix with dimensions (N, \code{n_components}). #' } #' @param binary_edge_weights If \code{TRUE} then edge weights in the input #' graph are treated as binary (0/1) rather than real valued. This affects the #' sampling frequency of neighbors and is the strategy used by the PaCMAP #' method (Wang and co-workers, 2021). Practical (Böhm and co-workers, 2020) #' and theoretical (Damrich and Hamprecht, 2021) work suggests this has little #' effect on UMAP's performance. #' @param dens_scale A value between 0 and 1. If > 0 then the output attempts #' to preserve relative local density around each observation. This uses an #' approximation to the densMAP method (Narayan and co-workers, 2021). The #' larger the value of \code{dens_scale}, the greater the range of output #' densities that will be used to map the input densities. This option is #' ignored if using multiple \code{metric} blocks. #' @param seed Integer seed to use to initialize the random number generator #' state. Combined with \code{n_sgd_threads = 1} or \code{batch = TRUE}, this #' should give consistent output across multiple runs on a given installation. #' Setting this value is equivalent to calling \code{\link[base]{set.seed}}, #' but it may be more convenient in some situations than having to call a #' separate function. The default is to not set a seed. If #' \code{ret_model = TRUE}, the seed will be stored in the output model and #' then used to set the seed inside \code{\link{umap_transform}}. #' @return A matrix of optimized coordinates, or: #' \itemize{ #' \item if \code{ret_model = TRUE} (or \code{ret_extra} contains #' \code{"model"}), returns a list containing extra information that can be #' used to add new data to an existing embedding via #' \code{\link{umap_transform}}. In this case, the coordinates are available #' in the list item \code{embedding}. \bold{NOTE}: The contents of #' the \code{model} list should \emph{not} be considered stable or part of #' the public API, and are purposely left undocumented.
#' \item if \code{ret_nn = TRUE} (or \code{ret_extra} contains \code{"nn"}), #' returns the nearest neighbor data as a list called \code{nn}. This #' contains one list for each \code{metric} calculated, itself containing a #' matrix \code{idx} with the integer ids of the neighbors; and a matrix #' \code{dist} with the distances. The \code{nn} list (or a sub-list) can be #' used as input to the \code{nn_method} parameter. #' \item if \code{ret_extra} contains \code{"fgraph"}, returns the high #' dimensional fuzzy graph as a sparse matrix called \code{fgraph}, of type #' \link[Matrix]{dgCMatrix-class}. #' \item if \code{ret_extra} contains \code{"sigma"}, returns a vector of the #' smooth knn distance normalization terms for each observation as #' \code{"sigma"} and a vector \code{"rho"} containing the largest #' distance to the locally connected neighbors of each observation. #' \item if \code{ret_extra} contains \code{"localr"}, returns a vector of #' the estimated local radii, the sum of \code{"sigma"} and \code{"rho"}. #' } #' The returned list contains the combined data from any combination of #' specifying \code{ret_model}, \code{ret_nn} and \code{ret_extra}. #' @examples #' #' iris30 <- iris[c(1:10, 51:60, 101:110), ] #' iris_umap <- umap2(iris30, n_neighbors = 5) #' #' @references #' Belkin, M., & Niyogi, P. (2002). #' Laplacian eigenmaps and spectral techniques for embedding and clustering. #' In \emph{Advances in neural information processing systems} #' (pp. 585-591). #' \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf} #' #' Böhm, J. N., Berens, P., & Kobak, D. (2020). #' A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum. #' \emph{arXiv preprint} \emph{arXiv:2007.08902}. #' \url{https://arxiv.org/abs/2007.08902} #' #' Damrich, S., & Hamprecht, F. A. (2021). #' On UMAP's true loss function. #' \emph{Advances in Neural Information Processing Systems}, \emph{34}. #' \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html} #' #' Dong, W., Moses, C., & Li, K. (2011, March). #' Efficient k-nearest neighbor graph construction for generic similarity measures. #' In \emph{Proceedings of the 20th international conference on World Wide Web} #' (pp. 577-586). #' ACM. #' \doi{10.1145/1963405.1963487}. #' #' Kingma, D. P., & Ba, J. (2014). #' Adam: A method for stochastic optimization. #' \emph{arXiv preprint} \emph{arXiv}:1412.6980. #' \url{https://arxiv.org/abs/1412.6980} #' #' Malkov, Y. A., & Yashunin, D. A. (2018). #' Efficient and robust approximate nearest neighbor search using hierarchical #' navigable small world graphs. #' \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836. #' #' McInnes, L., Healy, J., & Melville, J. (2018). #' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction #' \emph{arXiv preprint} \emph{arXiv}:1802.03426. #' \url{https://arxiv.org/abs/1802.03426} #' #' Narayan, A., Berger, B., & Cho, H. (2021). #' Assessing single-cell transcriptomic variability through density-preserving data visualization. #' \emph{Nature biotechnology}, \emph{39}(6), 765-774. #' \doi{10.1038/s41587-020-00801-7} #' #' O'Neill, M. E. (2014). #' \emph{PCG: A family of simple fast space-efficient statistically good #' algorithms for random number generation} #' (Report No. HMC-CS-2014-0905). Harvey Mudd College. #' #' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). 
#' Visualizing large-scale and high-dimensional data. #' In \emph{Proceedings of the 25th International Conference on World Wide Web} #' (pp. 287-297). #' International World Wide Web Conferences Steering Committee. #' \url{https://arxiv.org/abs/1602.00370} #' #' Van der Maaten, L., & Hinton, G. (2008). #' Visualizing data using t-SNE. #' \emph{Journal of Machine Learning Research}, \emph{9} (2579-2605). #' \url{https://www.jmlr.org/papers/v9/vandermaaten08a.html} #' #' Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021). #' Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization. #' \emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73. #' \url{https://www.jmlr.org/papers/v22/20-1061.html} #' #' @export umap2 <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", n_epochs = NULL, learning_rate = 1, scale = FALSE, init = "spectral", init_sdev = "range", spread = 1, min_dist = 0.1, set_op_mix_ratio = 1.0, local_connectivity = 1.0, bandwidth = 1.0, repulsion_strength = 1.0, negative_sample_rate = 5.0, a = NULL, b = NULL, nn_method = NULL, n_trees = 50, search_k = 2 * n_neighbors * n_trees, approx_pow = FALSE, y = NULL, target_n_neighbors = n_neighbors, target_metric = "euclidean", target_weight = 0.5, pca = NULL, pca_center = TRUE, pcg_rand = TRUE, fast_sgd = FALSE, ret_model = FALSE, ret_nn = FALSE, ret_extra = c(), n_threads = NULL, n_sgd_threads = 0, grain_size = 1, tmpdir = tempdir(), verbose = getOption("verbose", TRUE), batch = TRUE, opt_args = NULL, epoch_callback = NULL, pca_method = NULL, binary_edge_weights = FALSE, dens_scale = NULL, seed = NULL, nn_args = list(), rng_type = NULL) { if (is.null(nn_method)) { if (is_installed("RcppHNSW") && is.character(metric) && is_ok_hnsw_metric(metric) && is_ok_hnsw_metric(target_metric)) { nn_method <- "hnsw" tsmessage("Using HNSW for nearest neighbor search") } } if (is.null(nn_method)) { if (is_installed("rnndescent")) { nn_method <- "nndescent" tsmessage("Using NN-Descent for nearest neighbor search") } } if (is.null(n_threads)) { n_threads <- default_num_threads() } if (batch && is.numeric(n_sgd_threads) && n_sgd_threads == 0) { n_sgd_threads <- n_threads } if (is_sparse_matrix(X)) { if (!methods::is(X, "dgCMatrix")) { stop("sparse X must be a dgCMatrix object") } if (!is.list(nn_method) && !is_sparse_matrix(nn_method)) { if (!is_installed("rnndescent")) { stop( "nearest neighbor search for sparse matrices requires the ", "'rnndescent' package, please install it" ) } if (!is.null(nn_method) && is.character(nn_method) && nn_method != "nndescent") { stop( "nearest neighbor search for sparse matrices only supports ", "the 'nndescent' method" ) } tsmessage("Using nndescent for nearest neighbor search") nn_method <- "nndescent" } } if (is.null(n_epochs)) { n_epochs <- 500 } if (is.numeric(a) && is.numeric(b) && a == 1 && b == 1 && is.null(dens_scale)) { method <- "tumap" } else { method <- "umap" } uwot( X = X, n_neighbors = n_neighbors, n_components = n_components, metric = metric, n_epochs = n_epochs, alpha = learning_rate, scale = scale, init = init, init_sdev = init_sdev, spread = spread, min_dist = min_dist, set_op_mix_ratio = set_op_mix_ratio, local_connectivity = local_connectivity, bandwidth = bandwidth, gamma = repulsion_strength, negative_sample_rate = negative_sample_rate, a = a, b = b, nn_method = nn_method, n_trees = n_trees, search_k = search_k, method = method, approx_pow = approx_pow, n_threads = n_threads, 
n_sgd_threads = n_sgd_threads, grain_size = grain_size, y = y, target_n_neighbors = target_n_neighbors, target_weight = target_weight, target_metric = target_metric, pca = pca, pca_center = pca_center, pca_method = pca_method, pcg_rand = pcg_rand, fast_sgd = fast_sgd, ret_model = ret_model || "model" %in% ret_extra, ret_nn = ret_nn || "nn" %in% ret_extra, ret_fgraph = "fgraph" %in% ret_extra, ret_sigma = "sigma" %in% ret_extra, ret_localr = "localr" %in% ret_extra, batch = batch, opt_args = opt_args, epoch_callback = epoch_callback, binary_edge_weights = binary_edge_weights, tmpdir = tmpdir, verbose = verbose, dens_scale = dens_scale, seed = seed, nn_args = nn_args, sparse_X_is_distance_matrix = FALSE, rng_type = rng_type ) } uwot/R/bigstatsr_init.R0000644000176200001440000000171114730166740014631 0ustar liggesusersbigstatsr_is_installed <- function() { is_installed("bigstatsr") } bigstatsr_scores <- function(X, ncol, center = TRUE, ret_extra = FALSE, ncores = 1, verbose = FALSE) { res <- bigstatsr::big_randomSVD( X = bigstatsr::as_FBM(X), fun.scaling = bigstatsr::big_scale(center = center, scale = FALSE), k = ncol, ncores = ncores ) if (verbose) { totalvar <- sum(apply(X, 2, stats::var)) lambda <- sum((res$d^2) / (nrow(X) - 1)) varex <- lambda / totalvar tsmessage( "PCA: ", ncol, " components explained ", formatC(varex * 100), "% variance" ) } scores <- stats::predict(res) if (ret_extra) { list( scores = scores, rotation = res$v, center = res$center ) } else { scores } } uwot/R/neighbors.R0000644000176200001440000003462414730166740013575 0ustar liggesusersfind_nn <- function(X, k, include_self = TRUE, method = "fnn", metric = "euclidean", n_trees = 50, search_k = 2 * k * n_trees, nn_args = list(), tmpdir = tempdir(), n_threads = NULL, grain_size = 1, ret_index = FALSE, sparse_is_distance = TRUE, verbose = FALSE) { if (is.null(n_threads)) { n_threads <- default_num_threads() } if (inherits(X, "dist")) { res <- dist_nn(X, k, include_self = include_self, verbose = verbose) } else if (sparse_is_distance && is_sparse_matrix(X)) { # sparse distance matrix if (Matrix::isTriangular(X)) { res <- sparse_tri_nn(X, k, include_self = include_self, verbose = verbose) } else { res <- sparse_nn(X, k, include_self = include_self, verbose = verbose) } } else { if (is_sparse_matrix(X) && method != "nndescent") { stop("Sparse matrix input only supported for nndescent method.") } # normal matrix switch(method, "fnn" = { res <- FNN_nn(X, k = k, include_self = include_self) }, "annoy" = { nn_args_names <- names(nn_args) if ("n_trees" %in% nn_args_names) { n_trees <- nn_args$n_trees } if ("search_k" %in% nn_args_names) { search_k <- nn_args$search_k } res <- annoy_nn( X, k = k, metric = metric, n_trees = n_trees, search_k = search_k, tmpdir = tmpdir, n_threads = n_threads, ret_index = ret_index, verbose = verbose ) }, "hnsw" = { nn_args$X <- X nn_args$k <- k nn_args$metric <- metric nn_args$n_threads <- n_threads nn_args$verbose <- verbose nn_args$ret_index <- ret_index res <- do.call(hnsw_nn, nn_args) }, "nndescent" = { res <- nndescent_nn( X, k = k, metric = metric, nn_args = nn_args, n_threads = n_threads, ret_index = ret_index, verbose = verbose ) }, stop("Unknown method: ", method) ) } res } # an nn graph not in a list nn_is_single <- function(nn) { (is.list(nn) && !is.null(nn$idx)) || is_sparse_matrix(nn) } # TRUE if nn is a sparse matrix or an untagged list.
This covers passing in # a single nn graph, sparse distance matrix or list thereof, but excludes a # tagged annoy index or a string like "euclidean" nn_is_precomputed <- function(nn) { (is.list(nn) && is.null(nn$type)) || is_sparse_matrix(nn) } # TRUE if we are using an annoy index nn_is_annoy <- function(ann) { is.list(ann) && !is.null(ann$type) && startsWith(ann$type, "annoy") } nn_is_hnsw <- function(ann) { is.list(ann) && !is.null(ann$type) && startsWith(ann$type, "hnsw") } # n_trees - number of trees to build when constructing the index. The more trees # specified, the larger the index, but the better the results. largeVis uses 10 # trees for datasets with N = 10,000 observations, 20 trees for datasets up to N # = 1,000,000, 50 trees for N up to 5,000,000 and 100 trees otherwise # search_k - the number of nodes to search during the neighbor retrieval. The # larger k, the more accurate results, but the longer the search takes. Default # is k * n_trees. #' @importFrom methods new annoy_nn <- function(X, k = 10, metric = "euclidean", n_trees = 50, search_k = 2 * k * n_trees, tmpdir = tempdir(), n_threads = NULL, grain_size = 1, ret_index = FALSE, verbose = FALSE) { if (is.null(n_threads)) { n_threads <- default_num_threads() } ann <- annoy_build(X, metric = metric, n_trees = n_trees, verbose = verbose ) res <- annoy_search(X, k = k, ann = ann, search_k = search_k, tmpdir = tmpdir, n_threads = n_threads, prep_data = TRUE, grain_size = grain_size, verbose = verbose ) nn_acc <- sum(res$idx == 1:nrow(X)) / nrow(X) tsmessage("Annoy recall = ", formatC(nn_acc * 100.0), "%") res <- list(idx = res$idx, dist = res$dist, recall = nn_acc) if (ret_index) { res$index <- ann } res } annoy_create <- function(metric, ndim) { rcppannoy <- create_ann(metric, ndim) list( ann = rcppannoy, type = "annoyv1", metric = metric, ndim = ndim ) } annoy_build <- function(X, metric = "euclidean", n_trees = 50, verbose = FALSE) { nr <- nrow(X) nc <- ncol(X) annoy <- annoy_create(metric, nc) if (metric == "correlation") { tsmessage("Annoy build: subtracting row means for correlation") X <- sweep(X, 1, rowMeans(X)) } tsmessage( "Building Annoy index with metric = ", metric, ", n_trees = ", n_trees ) ann <- annoy$ann nstars <- 50 if (verbose && nr > nstars) { progress_for( nr, nstars, function(chunk_start, chunk_end) { for (i in chunk_start:chunk_end) { ann$addItem(i - 1, X[i, , drop = FALSE]) } } ) } else { for (i in 1:nr) { ann$addItem(i - 1, X[i, ]) } } # Build index ann$build(n_trees) annoy } # create RcppAnnoy class from metric name with ndim dimensions # Correlation uses AnnoyAngular, input data needs to be centered first create_ann <- function(name, ndim) { ann <- switch(name, cosine = methods::new(RcppAnnoy::AnnoyAngular, ndim), manhattan = methods::new(RcppAnnoy::AnnoyManhattan, ndim), euclidean = methods::new(RcppAnnoy::AnnoyEuclidean, ndim), hamming = methods::new(RcppAnnoy::AnnoyHamming, ndim), correlation = methods::new(RcppAnnoy::AnnoyAngular, ndim), stop("BUG: unknown Annoy metric '", name, "'") ) } # fetch the underlying RcppAnnoy class from inside an index get_rcppannoy <- function(nni) { if (startsWith(class(nni), "Rcpp_Annoy")) { rcppannoy <- nni } else if (nn_is_annoy(nni)) { rcppannoy <- nni$ann } else if (nn_is_hnsw(nni)) { rcppannoy <- nni$ann } else { stop( "BUG: Found an unknown ann implementation of class: '", class(nni), "'" ) } rcppannoy } # Search a pre-built Annoy index for neighbors of X annoy_search <- function(X, k, ann, search_k = 100 * k, prep_data = FALSE, tmpdir = tempdir(), n_threads 
= NULL, grain_size = 1, verbose = FALSE) { # newer NN structures hide impl in a tagged list if (nn_is_annoy(ann)) { lann <- ann ann <- lann$ann if (prep_data && lann$metric == "correlation") { tsmessage("Annoy search: subtracting row means for correlation") X <- sweep(X, 1, rowMeans(X)) } } if (is.null(n_threads)) { n_threads <- default_num_threads() } if (n_threads > 0) { annoy_res <- annoy_search_parallel( X = X, k = k, ann = ann, search_k = search_k, tmpdir = tmpdir, n_threads = n_threads, grain_size = grain_size, verbose = verbose ) res <- list(idx = annoy_res$item + 1, dist = annoy_res$distance) } else { res <- annoy_search_serial( X = X, k = k, ann = ann, search_k = search_k, verbose = verbose ) } # Convert from angular distance to the UMAP/sklearn definition of cosine # distance # Current Annoy README defines cosine distance as sqrt(2 - 2 cos(u,v)) # where cos(u, v) is the cosine of the angle between two unit-scaled vectors # u and v (i.e. the cosine similarity). That expression is known to be # equivalent to the euclidean distance between u and v. # We shall convert back to 1 - cos(u, v) which is the definition of cosine # distance used by UMAP. if (methods::is(ann, "Rcpp_AnnoyAngular")) { res$dist <- 0.5 * res$dist * res$dist } res } annoy_search_serial <- function(X, k, ann, search_k = 100 * k, verbose = FALSE) { tsmessage("Searching Annoy index, search_k = ", search_k) nr <- nrow(X) idx <- matrix(nrow = nr, ncol = k) dist <- matrix(nrow = nr, ncol = k) nstars <- 50 if (verbose && nr > nstars) { progress_for( nr, nstars, function(chunk_start, chunk_end) { for (i in chunk_start:chunk_end) { res <- ann$getNNsByVectorList(X[i, ], k, search_k, TRUE) if (length(res$item) != k) { stop( "search_k/n_trees settings were unable to find ", k, " neighbors for item ", i ) } idx[i, ] <<- res$item dist[i, ] <<- res$distance } } ) } else { for (i in 1:nr) { res <- ann$getNNsByVectorList(X[i, ], k, search_k, TRUE) if (length(res$item) != k) { stop( "search_k/n_trees settings were unable to find ", k, " neighbors for item ", i ) } idx[i, ] <- res$item dist[i, ] <- res$distance } } list(idx = idx + 1, dist = dist) } annoy_search_parallel <- function(X, k, ann, search_k = 100 * k, tmpdir = tempdir(), n_threads = NULL, grain_size = 1, verbose = FALSE) { if (is.null(n_threads)) { n_threads <- default_num_threads() } index_file <- tempfile(tmpdir = tmpdir) tsmessage("Writing NN index file to temp file ", index_file) ann$save(index_file) fsize <- file.size(index_file) tsmessage( "Searching Annoy index using ", pluralize("thread", n_threads), ", search_k = ", search_k ) ann_class <- class(ann) metric <- switch(ann_class, Rcpp_AnnoyAngular = "cosine", Rcpp_AnnoyManhattan = "manhattan", Rcpp_AnnoyEuclidean = "euclidean", Rcpp_AnnoyHamming = "hamming", stop("BUG: unknown Annoy class '", ann_class, "'") ) res <- annoy_search_parallel_cpp(index_file, X, k, search_k, metric = metric, n_threads = n_threads, grain_size = grain_size ) unlink(index_file) if (any(res$item == -1)) { msg <- paste0( "search_k/n_trees settings were unable to find ", k, " neighbors for all items." ) if (fsize > 2147483647) { msg <- paste0( msg, " Index file may have been too large to process.", " Try repeating with n_threads = 0, reducing n_trees,", " or reducing to a smaller dimensionality, e.g. 
pca = 50" ) } stop(msg) } res } FNN_nn <- function(X, k = 10, include_self = TRUE) { if (include_self) { k <- k - 1 } fnn <- FNN::get.knn(X, k) idx <- fnn$nn.index dist <- fnn$nn.dist if (include_self) { idx <- cbind(seq_len(nrow(X)), idx) dist <- cbind(rep(0, nrow(X)), dist) } list(idx = idx, dist = dist) } dist_nn <- function(X, k, include_self = TRUE, verbose = FALSE) { tsmessage("Finding nearest neighbors from distance matrix") X <- as.matrix(X) if (!include_self) { k <- k + 1 } nn_idx <- t(apply(X, 2, order))[, 1:k] nn_dist <- matrix(0, nrow = nrow(X), ncol = k) for (i in seq_len(nrow(nn_idx))) { nn_dist[i, ] <- X[i, nn_idx[i, ]] } if (!include_self) { nn_idx <- nn_idx[, 2:ncol(nn_idx)] nn_dist <- nn_dist[, 2:ncol(nn_dist)] } attr(nn_idx, "dimnames") <- NULL attr(nn_dist, "dimnames") <- NULL list(idx = nn_idx, dist = nn_dist) } sparse_nn <- function(X, k, include_self = TRUE, verbose = FALSE) { tsmessage("Finding nearest neighbors from sparse matrix") if (include_self) { k <- k - 1 } n <- nrow(X) nn_idx <- matrix(0, nrow = n, ncol = k) nn_dist <- matrix(0, nrow = n, ncol = k) for (i in 1:n) { dists <- X[, i] is_nonzero <- dists != 0 dist_nonzero <- dists[is_nonzero] if (length(dist_nonzero) < k) { stop( "Row ", i, " of distance matrix has only ", length(dist_nonzero), " defined distances" ) } k_order <- order(dist_nonzero)[1:k] idx_nonzero <- which(is_nonzero, arr.ind = TRUE) nn_idx[i, ] <- idx_nonzero[k_order] nn_dist[i, ] <- dist_nonzero[k_order] } if (include_self) { nn_idx <- cbind(1:n, nn_idx) nn_dist <- cbind(rep(0, n), nn_dist) } list(idx = nn_idx, dist = nn_dist) } # Extract knn data from sparse lower/upper triangular matrix sparse_tri_nn <- function(X, k, include_self = TRUE, verbose = FALSE) { tsmessage("Finding nearest neighbors from sparse triangular matrix") if (include_self) { k <- k - 1 } n <- nrow(X) nn_idx <- matrix(0, nrow = n, ncol = k) nn_dist <- matrix(0, nrow = n, ncol = k) # this will get the i,j,x values no matter the internal representation Xsumm <- summary(X) for (i in 1:n) { # get indices where $i/j == i idxji <- Xsumm$j == i idxii <- Xsumm$i == i idxi <- idxji | idxii # find non-zero distances dists <- Xsumm$x[idxi] is_nonzero <- dists != 0 dist_nonzero <- dists[is_nonzero] if (length(dist_nonzero) < k) { stop( "Row ", i, " of distance matrix has only ", length(dist_nonzero), " defined distances" ) } # find indices of k-smallest distances k_order <- order(dist_nonzero)[1:k] nn_dist[i, ] <- dist_nonzero[k_order] # get indices into original vector isk <- which(idxi)[k_order] Xis <- Xsumm$i[isk] Xjs <- Xsumm$j[isk] # We don't know if the non-i index is in the i or j column # so do this slightly horrible logical * integer arithmetic # which will add the correct index to 0 nn_idx[i, ] <- ((Xis != i) * Xis) + ((Xjs != i) * Xjs) } if (include_self) { nn_idx <- cbind(1:n, nn_idx) nn_dist <- cbind(rep(0, n), nn_dist) } list(idx = nn_idx, dist = nn_dist) } is_binary_metric <- function(metric) { metric %in% c( "dice", "hamming", "jaccard", "kulsinski", "matching", "rogerstanimoto", "russellrao", "sokalmichener", "sokalsneath", "yule" ) } uwot/R/nn_hnsw.R0000644000176200001440000000535214730166740013263 0ustar liggesusershnsw_nn <- function(X, k = 10, metric = "euclidean", M = 16, ef_construction = 200, ef = 10, n_threads = NULL, ret_index = FALSE, verbose = FALSE) { if (is.null(n_threads)) { n_threads <- default_num_threads() } ann <- hnsw_build(X, metric = metric, M = M, ef_construction = ef_construction, n_threads = n_threads, verbose = verbose ) res <- 
hnsw_search(X, k, ann, ef = ef, n_threads = n_threads, verbose = verbose) # We actually use the L2 HNSW metric so we need to convert here # (also umap_transform must do this) if (metric == "euclidean") { res$dist <- sqrt(res$dist) } res <- list(idx = res$idx, dist = res$dist) if (ret_index) { res$index <- ann } res } hnsw_build <- function(X, metric, M, ef_construction, n_threads, verbose) { hnsw_distance <- metric if (metric == "correlation") { tsmessage("HNSW build: subtracting row means for correlation") X <- sweep(X, 1, rowMeans(X)) hnsw_distance <- "cosine" } # To avoid issues with whether a dedicated Euclidean class exists in RcppHNSW # we will always use L2 and manually process the distances when we are done if (metric == "euclidean") { hnsw_distance <- "l2" } index <- RcppHNSW::hnsw_build( X, distance = hnsw_distance, M = M, ef = ef_construction, verbose = verbose, n_threads = n_threads ) list( ann = index, type = "hnswv1", metric = metric, ndim = ncol(X) ) } # called by hnsw_nn when building a model, and by umap_transform directly hnsw_search <- function(X, k, ann, ef, n_threads = NULL, verbose = FALSE) { if (is.null(n_threads)) { n_threads <- default_num_threads() } if (ann$metric == "correlation") { tsmessage("HNSW search: subtracting row means for correlation") X <- sweep(X, 1, rowMeans(X)) } res <- RcppHNSW::hnsw_search( X = X, k = k, ann = ann$ann, ef = ef, n_threads = n_threads, verbose = verbose ) res } hnsw_load <- function(name, ndim, filename) { class_name <- switch( name, cosine = RcppHNSW::HnswCosine, euclidean = RcppHNSW::HnswL2, correlation = RcppHNSW::HnswCosine, stop("BUG: unknown HNSW metric '", name, "'") ) methods::new(class_name, ndim, filename) } is_ok_hnsw_metric <- function(metric) { hnsw_metrics <- c("euclidean", "cosine", "correlation") metric %in% hnsw_metrics } uwot/R/supervised.R0000644000176200001440000001414414730166740014001 0ustar liggesusers# Combine a fuzzy simplicial set with another fuzzy simplicial set # generated from categorical data using categorical distances. The target # data is assumed to be categorical label data (a vector of labels), # and this will update the fuzzy simplicial set to respect that label data. # TODO: optional category cardinality based weighting of distance # simplicial_set The input fuzzy simplicial set. # target The categorical labels to use in the intersection. # unknown_dist The distance an unknown label (-1) is assumed to be from any point. # far_dist The distance between unmatched labels. # Return The resulting intersected fuzzy simplicial set. categorical_simplicial_set_intersection <- function(simplicial_set, target, unknown_dist = 1.0, far_dist = 5.0, verbose = FALSE) { # Convert to dgTMatrix to get to the j indices simplicial_set <- methods::as(simplicial_set, "TsparseMatrix") simplicial_set@x <- fast_intersection_cpp( simplicial_set@i, simplicial_set@j, simplicial_set@x, target, unknown_dist, far_dist ) # drop0 converts back to dgCMatrix reset_local_connectivity(Matrix::drop0(simplicial_set)) } # Reset the local connectivity requirement -- each data sample should # have complete confidence in at least one 1-simplex in the simplicial set. # We can enforce this by locally rescaling confidences, and then remerging the # different local simplicial sets together.
reset_local_connectivity <- function(simplicial_set, reset_local_metric = FALSE, num_local_metric_neighbors = 15, n_threads = NULL, verbose = FALSE) { # Python UMAP stores graph as CSR uwot uses CSC so need to be careful about # which axis to normalize simplicial_set <- col_max_normalize(simplicial_set) if (reset_local_metric) { if (is.null(n_threads)) { n_threads <- default_num_threads() } tsmessage( "Resetting local metric", pluralize("thread", n_threads, " using") ) metric_res <- reset_local_metrics_parallel(simplicial_set@p, simplicial_set@x, num_local_metric_neighbors = num_local_metric_neighbors, n_threads = n_threads ) simplicial_set@x <- metric_res$values # TODO: at least some failures are very typical and it doesn't seem to # affect results, so not worth reporting this for now. # if (metric_res$n_failures > 0) { # tsmessage(metric_res$n_failures, " local metric reset failures") # } } fuzzy_set_union(simplicial_set) } # Under the assumption of categorical distance for the intersecting # simplicial set perform a fast intersection. # This is not at all fast in R, use fast_intersection_cpp instead fast_intersection <- function(rows, cols, values, target, unknown_dist = 1.0, far_dist = 5.0) { ex_unknown <- exp(-unknown_dist) ex_far <- exp(-far_dist) for (nz in seq_len(length(values))) { i <- rows[nz] j <- cols[nz] if (is.na(target[i]) || is.na(target[j])) { values[nz] <- values[nz] * ex_unknown } else if (target[i] != target[j]) { values[nz] <- values[nz] * ex_far } } values } general_simplicial_set_intersection <- function(left, right, weight) { result <- methods::as(left + right, "TsparseMatrix") result@x <- general_sset_intersection_cpp( left@p, left@i, left@x, right@p, right@i, right@x, result@i, result@j, result@x, weight ) result } # An R translation of the Python function. 
Not very fast, # so use the C++ version instead general_sset_intersection <- function(indptr1, indices1, data1, indptr2, indices2, data2, result_row, result_col, result_val, mix_weight = 0.5) { left_min <- max(min(data1) / 2.0, 1.0e-8) right_min <- max(min(data2) / 2.0, 1.0e-8) for (idx in seq_len(length(result_row))) { i <- result_col[idx] + 1 j <- result_row[idx] left_val <- left_min for (k in (indptr1[i]):(indptr1[i + 1] - 1)) { if (indices1[k + 1] == j) { left_val <- data1[k + 1] } } right_val <- right_min for (k in (indptr2[i]):(indptr2[i + 1] - 1)) { if (indices2[k + 1] == j) { right_val <- data2[k + 1] } } if (left_val > left_min || right_val > right_min) { if (mix_weight < 0.5) { result_val[idx] <- left_val * right_val^(mix_weight / (1.0 - mix_weight)) } else { result_val[idx] <- right_val * left_val^(((1.0 - mix_weight) / mix_weight)) } } } result_val } # Sparse Matrix functions ------------------------------------------------- # normalize each column of a dgCMatrix by its maximum # https://stackoverflow.com/questions/39284774/column-rescaling-for-a-very-large-sparse-matrix-in-r col_max_normalize <- function(X) { X@x <- X@x / rep.int(colMaxs(X), diff(X@p)) X } # normalize each row of a dgCMatrix by its maximum row_max_normalize <- function(X) { Matrix::t(col_max_normalize(Matrix::t(X))) } col_sum_normalize <- function(X) { X@x <- X@x / rep.int(Matrix::colSums(X), diff(X@p)) X } row_sum_normalize <- function(X) { Matrix::t(col_sum_normalize(Matrix::t(X))) } # column maximums of a dgCMatrix colMaxs <- function(X) { ptr <- X@p xs <- X@x vapply( 1:ncol(X), function(i) { if (ptr[i + 1] > ptr[i]) { max(xs[(ptr[i] + 1):ptr[i + 1]]) } else { 0 } }, numeric(1) ) } # row maximums of a dgCMatrix rowMaxs <- function(X) { colMaxs(Matrix::t(X)) } uwot/R/affinity.R0000644000176200001440000002265014733566012013421 0ustar liggesusers# set_op_mix_ratio = between 0 and 1 mixes in fuzzy set intersection # set to 0 for intersection only #' @import Matrix fuzzy_set_union <- function(X, set_op_mix_ratio = 1) { XX <- X * Matrix::t(X) if (set_op_mix_ratio == 0) { Matrix::drop0(XX) } else if (set_op_mix_ratio == 1) { Matrix::drop0(X + Matrix::t(X) - XX) } else { Matrix::drop0( set_op_mix_ratio * (X + Matrix::t(X) - XX) + (1 - set_op_mix_ratio) * XX ) } } # Calculate the (asymmetric) affinity matrix based on the nearest neighborhoods # default target for calibration is the sum of affinities = log2(n_nbrs) # nn distances should be stored column-wise smooth_knn <- function(nn_dist, nn_ptr = NULL, skip_first = TRUE, target = NULL, local_connectivity = 1.0, n_threads = NULL, grain_size = 1, ret_sigma = FALSE, verbose = FALSE) { if (is.null(n_threads)) { n_threads <- default_num_threads() } tsmessage( "Commencing smooth kNN distance calibration", pluralize("thread", n_threads, " using"), appendLF = FALSE ) if (length(target) == 1) { tsmessage(" with target n_neighbors = ", formatC(2^target), time_stamp = FALSE) } else { tsmessage(time_stamp = FALSE) } affinity_matrix_res <- smooth_knn_distances_parallel( nn_dist = nn_dist, nn_ptr = nn_ptr, skip_first = skip_first, target = target, n_iter = 64, local_connectivity = local_connectivity, tol = 1e-5, min_k_dist_scale = 1e-3, n_threads = n_threads, grain_size = grain_size, ret_sigma = ret_sigma ) if (verbose && affinity_matrix_res$n_failures > 0) { tsmessage(affinity_matrix_res$n_failures, " smooth knn distance failures") } affinity_matrix_res } smooth_knn_matrix <- function(nn, target = NULL, local_connectivity = 1.0, bandwidth = 1.0, ret_sigma = FALSE, n_threads = 
NULL, grain_size = 1, verbose = FALSE) { if (is.null(n_threads)) { n_threads <- default_num_threads() } osparse <- NULL if (is_sparse_matrix(nn)) { nn <- Matrix::drop0(nn) osparse <- order_sparse(nn) nn_dist <- osparse$x nn_ptr <- osparse$p n_nbrs <- diff(nn_ptr) if (any(n_nbrs < 1)) { stop("All observations need at least one neighbor") } if (is.null(target)) { # add 1 to n_nbrs to account for implicit self neighbor target <- log2(n_nbrs + 1) * bandwidth } skip_first <- FALSE } else { nnt <- nn_graph_t(nn) n_nbrs <- nrow(nnt$dist) if (is.null(target)) { target <- log2(n_nbrs) * bandwidth } nn_ptr <- n_nbrs nn_dist <- as.vector(nnt$dist) skip_first <- TRUE } affinity_matrix_res <- smooth_knn( nn_dist = nn_dist, nn_ptr = nn_ptr, skip_first = skip_first, target = target, local_connectivity = local_connectivity, ret_sigma = ret_sigma, n_threads = n_threads, grain_size = grain_size, verbose = verbose ) v <- affinity_matrix_res$matrix if (is_sparse_matrix(nn)) { # use j instead of i to transpose it v <- Matrix::sparseMatrix( j = osparse$i, p = osparse$p, x = v, dims = osparse$dims, index1 = FALSE ) Matrix::diag(v) <- 0.0 v <- Matrix::drop0(v) } else { v <- nng_to_sparse(nnt$idx, v, self_nbr = TRUE, by_row = FALSE) } affinity_matrix_res$matrix <- v affinity_matrix_res } # Given nearest neighbor data and a measure of distance compute # the fuzzy simplicial set (here represented as a fuzzy graph in the form of a # sparse matrix) associated to the data. This is done by locally approximating # geodesic distance at each point, creating a fuzzy simplicial set for each such # point, and then combining all the local fuzzy simplicial sets into a global # one via a fuzzy union fuzzy_simplicial_set <- function(nn, target = NULL, set_op_mix_ratio = 1.0, local_connectivity = 1.0, bandwidth = 1.0, ret_sigma = FALSE, n_threads = NULL, grain_size = 1, verbose = FALSE) { affinity_matrix_res <- smooth_knn_matrix( nn = nn, target = target, local_connectivity = local_connectivity, bandwidth = bandwidth, ret_sigma = ret_sigma, n_threads = n_threads, grain_size = grain_size, verbose = verbose ) res <- fuzzy_set_union(affinity_matrix_res$matrix, set_op_mix_ratio = set_op_mix_ratio) if (ret_sigma) { res <- list(matrix = res) res$sigma <- affinity_matrix_res$sigma res$rho <- affinity_matrix_res$rho } res } symmetrize <- function(P) { 0.5 * (P + Matrix::t(P)) } perplexity_similarities <- function(nn, perplexity = NULL, ret_sigma = FALSE, n_threads = NULL, grain_size = 1, kernel = "gauss", verbose = FALSE) { if (is.null(n_threads)) { n_threads <- default_num_threads() } if (is.null(perplexity) && kernel != "knn") { stop("Must provide perplexity") } sigma <- NULL if (kernel == "gauss") { tsmessage( "Commencing calibration for perplexity = ", formatC(perplexity), pluralize("thread", n_threads, " using") ) nnt <- nn_graph_t(nn) n_vertices <- ncol(nnt$dist) affinity_matrix_res <- calc_row_probabilities_parallel( nn_dist = as.vector(nnt$dist), n_vertices = n_vertices, perplexity = perplexity, ret_sigma = ret_sigma, n_threads = n_threads, grain_size = grain_size ) if (verbose && affinity_matrix_res$n_failures > 0) { tsmessage(affinity_matrix_res$n_failures, " perplexity failures") } dint <- NULL if (ret_sigma && !is.null(affinity_matrix_res$sigma)) { # An analytical version of the "soft" correlation dimension estimate of # intrinsic dimensionality from multi-scale SNE by Lee et al (2015). 
# http://jlmelville.github.io/sneer/dimensionality.html d <- nnt$dist p <- affinity_matrix_res$matrix logp <- log(p + .Machine$double.eps) s <- affinity_matrix_res$sigma h <- -colSums(p * logp) lph <- sweep(logp, 2, h, `+`) dhdb <- colSums(d * d * p * lph) dint <- -2 * dhdb / (s * s) } affinity_matrix <- nng_to_sparse(nnt$idx, as.vector(affinity_matrix_res$matrix), self_nbr = TRUE, by_row = FALSE ) if (!is.null(affinity_matrix_res$sigma)) { sigma <- affinity_matrix_res$sigma } } else { # knn kernel tsmessage("Using knn graph for input weights with k = ", ncol(nn$idx)) # Make each row sum to 1, ignoring the self-index # i.e. diagonal will be zero affinity_matrix <- nng_to_sparse(nn$idx, val = 1 / (ncol(nn$idx) - 1)) Matrix::diag(affinity_matrix) <- 0 affinity_matrix <- Matrix::drop0(affinity_matrix) } res <- list(matrix = symmetrize(affinity_matrix)) if (ret_sigma && !is.null(sigma)) { res$sigma <- sigma if (!is.null(dint)) { res$dint <- dint } } res } # Convert the matrix of NN indices to a sparse asymmetric matrix where each # edge has a weight of val (scalar or vector) # return a sparse matrix with dimensions of nrow(nn_idx) x max_nbr_id nn_to_sparse <- function(nn_idxv, n_obs, val = 1, self_nbr = FALSE, max_nbr_id = NULL, by_row = TRUE) { n_nbrs <- length(nn_idxv) / n_obs if (is.null(max_nbr_id)) { max_nbr_id <- ifelse(self_nbr, n_obs, max(nn_idxv)) } if (length(val) == 1) { xs <- rep(val, n_obs * n_nbrs) } else { xs <- val } if (by_row) { is <- rep(1:n_obs, times = n_nbrs) } else { is <- rep(1:n_obs, each = n_nbrs) } dims <- c(n_obs, max_nbr_id) res <- Matrix::sparseMatrix(i = is, j = nn_idxv, x = xs, dims = dims) if (self_nbr) { Matrix::diag(res) <- 0 res <- Matrix::drop0(res) } res } nng_to_sparse <- function(nn_idx, val = 1, self_nbr = FALSE, max_nbr_id = NULL, by_row = TRUE) { if (by_row) { n_obs <- nrow(nn_idx) } else { n_obs <- ncol(nn_idx) } nn_to_sparse(as.vector(nn_idx), n_obs, val = val, self_nbr = self_nbr, max_nbr_id = max_nbr_id, by_row = by_row ) } # transpose the index and distance matrix nn_graph_t <- function(nn_graph) { list(idx = t(nn_graph$idx), dist = t(nn_graph$dist)) } order_sparse <- function(spm) { x <- spm@x i <- spm@i p <- spm@p x_sort <- rep(0, length(x)) i_sort <- rep(0, length(i)) n_vertices <- length(p) - 1 for (v in 1:n_vertices) { p_begin <- p[v] p_end <- p[v + 1] if (p_end - p_begin == 0) { next } pb1 <- p_begin + 1 x_order <- order(x[pb1:p_end]) x_sort[pb1:p_end] <- x[x_order + p_begin] i_sort[pb1:p_end] <- i[x_order + p_begin] } list(i = i_sort, p = p, x = x_sort, order = x_order, dims = spm@Dim) } uwot/R/init.R0000644000176200001440000005514514754234312012556 0ustar liggesusers# Laplacian Eigenmap (Belkin & Niyogi, 2002) # Original formulation solves the generalized eigenvalue problem of the # unnormalized graph Laplacian: L v = lambda D v, where L = D - A # and uses the bottom eigenvectors v that result # (ignoring the constant eigenvector associated with the smallest eigenvalue). # # This is equivalent to using the top eigenvectors from the usual # eigendecomposition of a row-normalized Laplacian P = D^-1 A: P v = lambda' v # so we don't need to depend on an external package for generalized eigenvalues. # Note that while the eigenvectors are the same, the eigenvalues are # different: lambda' = 1 - lambda, but we don't use them with Laplacian # Eigenmaps anyway. # # As we only need to calculate the top ndim + 1 eigenvectors (i.e. normally 3) # it's incredibly wasteful to calculate all of them. 
# A must be symmetric and positive semi-definite, but not necessarily
# normalized in any specific way.
#' @import Matrix
laplacian_eigenmap <- function(A, ndim = 2, verbose = FALSE,
                               force_irlba = FALSE) {
  if (rspectra_is_installed() && !force_irlba) {
    coords <- rspectra_laplacian_eigenmap(A, ndim, verbose = verbose)
  } else {
    coords <- irlba_laplacian_eigenmap(A, ndim, verbose = verbose)
  }
  coords
}

rspectra_laplacian_eigenmap <- function(A, ndim = 2, verbose = FALSE) {
  if (nrow(A) < 3) {
    tsmessage("Graph too small, using random initialization instead")
    return(rand_init(nrow(A), ndim))
  }
  tsmessage("Initializing from Laplacian Eigenmap (via RSpectra)")
  # Equivalent to: D <- diag(colSums(A)); M <- solve(D) %*% A
  # This effectively row-normalizes A: colSums is normally faster than rowSums
  # and because A is symmetric, they're equivalent
  M <- A / colSums(A)
  res <- rspectra_eigs_asym(M, ndim)
  if (is.null(res) || !is.list(res) || !"vectors" %in% names(res) ||
    is.null(res$vectors) || tryCatch(
      is.na(ncol(res$vectors)),
      error = function(e) {
        TRUE
      }
    ) || ncol(res$vectors) < ndim) {
    message(
      "Laplacian Eigenmap failed to converge, ",
      "using random initialization instead"
    )
    n <- nrow(M)
    return(rand_init(n, ndim))
  }
  # return the eigenvectors associated with the smallest eigenvalues
  as.matrix(Re(res$vectors[, 2:(ndim + 1)]))
}

irlba_laplacian_eigenmap <- function(A, ndim = 2, verbose = FALSE) {
  if (nrow(A) < 3) {
    tsmessage("Graph too small, using random initialization instead")
    return(rand_init(nrow(A), ndim))
  }
  tsmessage("Initializing from Laplacian Eigenmap (via irlba)")
  lapA <- form_modified_laplacian(A, ret_d = TRUE)
  res <- irlba_spectral_tsvd(lapA$L, ndim + 1)
  if (is.null(res) || ncol(res$vectors) < ndim || !res$converged) {
    message(
      "Laplacian Eigenmap failed to converge, ",
      "using random initialization instead"
    )
    return(rand_init(nrow(A), ndim))
  }
  res <- lapA$Disqrt * res$vectors[, 2:(ndim + 1), drop = FALSE]
  # re-scale the vectors to length 1
  sweep(res, 2, sqrt(colSums(res * res)), `/`)
}

form_normalized_laplacian <- function(A) {
  # Normalized Laplacian: clear and close to UMAP code, but very slow in R
  # I <- diag(1, nrow = n, ncol = n)
  # D <- diag(1 / sqrt(colSums(A)))
  # L <- I - D %*% A %*% D

  # A lot faster (order of magnitude when n = 1000)
  Dsq <- sqrt(Matrix::colSums(A))
  L <- -Matrix::t(A / Dsq) / Dsq
  Matrix::diag(L) <- 1 + Matrix::diag(L)
  L
}

# The symmetrized graph Laplacian (Lsym) but shifted so that:
# the bottom eigenvectors of Lsym correspond to the top singular vectors of
# this matrix (hence can be used with truncated SVD), and the eigenvalues
# are all positive, so we don't lose sign and hence correct eigenvector
# ordering when using the singular values (lambda = 2 - d)
# effectively we form 2I - Lsym = D^-1/2 W D^-1/2 + I
form_modified_laplacian <- function(A, ret_d = FALSE) {
  Dsq <- sqrt(Matrix::colSums(A))
  L <- Matrix::t(A / Dsq) / Dsq
  Matrix::diag(L) <- 1 + Matrix::diag(L)
  if (ret_d) {
    list(L = L, Disqrt = 1 / Dsq)
  } else {
    L
  }
}

# Return the ndim eigenvectors associated with the ndim largest eigenvalues
sort_eigenvectors <- function(eig_res, ndim, decreasing = TRUE) {
  vec_indices <- rev(order(eig_res$values, decreasing = decreasing)[1:ndim])
  as.matrix(Re(eig_res$vectors[, vec_indices]))
}

normalized_laplacian_init <- function(A, ndim = 2, verbose = FALSE,
                                      force_irlba = FALSE) {
  if (rspectra_is_installed() && !force_irlba) {
    coords <- rspectra_normalized_laplacian_init(A, ndim, verbose = verbose)
  } else {
    coords <- irlba_normalized_laplacian_init(A, ndim, verbose = verbose)
  }
  coords
}
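# A minimal sketch (a hypothetical helper, not part of the package API)
# showing that the fast sparse construction in form_normalized_laplacian
# agrees with the textbook dense formulation L = I - D^-1/2 A D^-1/2 noted
# in the comments above, assuming A is a symmetric affinity matrix with no
# zero-degree vertices.
check_normalized_laplacian <- function(A, tol = 1e-8) {
  Ad <- as.matrix(A)
  Dinvsqrt <- diag(1 / sqrt(colSums(Ad)))
  L_dense <- diag(nrow(Ad)) - Dinvsqrt %*% Ad %*% Dinvsqrt
  # compare against the sparse version defined above
  max(abs(L_dense - as.matrix(form_normalized_laplacian(A)))) < tol
}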
rspectra_normalized_laplacian_init <- function(A, ndim = 2, verbose = FALSE) { if (nrow(A) < 3) { tsmessage("Graph too small, using random initialization instead") return(rand_init(nrow(A), ndim)) } tsmessage("Initializing from normalized Laplacian") L <- form_normalized_laplacian(A) res <- rspectra_eigs_sym(L, ndim, verbose = verbose) if (is.null(res) || !is.list(res) || !"vectors" %in% names(res) || is.null(res$vectors) || tryCatch( is.na(ncol(res$vectors)), error = function(e) { TRUE } ) || ncol(res$vectors) < ndim) { message( "Spectral initialization failed to converge, ", "using random initialization instead" ) n <- nrow(A) return(rand_init(n, ndim)) } sort_eigenvectors(res, ndim) } # maybe this should become an option one day # reminder to me for the next time I experiment: Shift-invert just doesn't work # in all cases: causes MNIST (k = 15) to hang forever even with a guess for # initvec of D^1/2, no matter which shift value (or maxitr or tol value) is # used. But we can form the shifted Laplacian like we do for tsvd approaches # and look for the LM eigenvalues (plus initialize with D^1/2 as a guess) # However the tolerance needs to be lower for similar quality of output # and most initializations are < 1 second anyway, so we don't gain much from # speeding them up. For slower initializations (e.g. `tomoradar`, `mammoth`) # there is a small speed up, but not down to a few seconds rspectra_normalized_laplacian_init_shift_inv <- function(A, ndim = 2, verbose = FALSE) { if (nrow(A) < 3) { tsmessage("Graph too small, using random initialization instead") return(rand_init(nrow(A), ndim)) } tsmessage("Initializing from normalized Laplacian") L <- form_modified_laplacian(A) initvec <- sqrt(Matrix::colSums(A)) res <- rspectra_eigs_shift_sym(L, ndim, verbose = verbose, initvec = initvec, tol = 1e-6 ) norm_initvec <- initvec / sqrt(sum(initvec * initvec)) if (is.null(res) || !is.list(res) || !"vectors" %in% names(res) || is.null(res$vectors) || tryCatch( is.na(ncol(res$vectors)), error = function(e) { TRUE } ) || ncol(res$vectors) < ndim) { message( "Spectral initialization failed to converge, ", "using random initialization instead" ) n <- nrow(A) return(rand_init(n, ndim)) } sort_eigenvectors(res, ndim, decreasing = FALSE) } # Use a normalized Laplacian and use truncated SVD irlba_tsvd_normalized_laplacian_init <- function(A, ndim = 2, verbose = FALSE) { if (nrow(A) < 3) { tsmessage("Graph too small, using random initialization instead") return(rand_init(nrow(A), ndim)) } tsmessage("Initializing from normalized Laplacian") L <- form_modified_laplacian(A) res <- irlba_spectral_tsvd(L, ndim + 1) if (is.null(res) || ncol(res$vectors) < ndim || !res$converged) { message( "Spectral initialization failed to converge, ", "using random initialization instead" ) n <- nrow(A) return(rand_init(n, ndim)) } res$vectors[, 2:(ndim + 1), drop = FALSE] } irlba_spectral_tsvd <- function(L, n, iters = 1000) { irlba_args <- list( A = L, nv = n, nu = 0, maxit = iters ) suppressWarnings(res <- tryCatch( do.call(irlba::irlba, irlba_args), error = function(c) { irlba_args$fastpath <- FALSE do.call(irlba::irlba, irlba_args) } )) list( vectors = res$v, values = 2.0 - res$d, converged = res$iter != iters ) } irlba_eigs_asym <- function(L, ndim) { irlba_args <- list( x = L, n = ndim + 1, symmetric = FALSE, smallest = TRUE, tol = 1e-3, maxit = 1000 ) suppressWarnings(res <- tryCatch( do.call(irlba::partial_eigen, irlba_args), error = function(e) { irlba_args$fastpath <- FALSE tryCatch( do.call(irlba::partial_eigen, 
irlba_args), error = function(e) { NULL } ) } )) if (!is.null(res)) { res$values <- sqrt(res$values) } res } irlba_eigs_sym <- function(L, ndim, smallest = TRUE) { irlba_args <- list( x = L, n = ndim + 1, symmetric = TRUE, smallest = smallest, tol = 1e-3, maxit = 1000 ) suppressWarnings(res <- tryCatch( do.call(irlba::partial_eigen, irlba_args), error = function(e) { irlba_args$fastpath <- FALSE tryCatch( do.call(irlba::partial_eigen, irlba_args), error = function(e) { NULL } ) } )) res } # Use irlba's partial_eigen instead of RSpectra irlba_normalized_laplacian_init <- function(A, ndim = 2, verbose = FALSE) { if (nrow(A) < 3) { tsmessage("Graph too small, using random initialization instead") return(rand_init(nrow(A), ndim)) } tsmessage("Initializing from normalized Laplacian (using irlba)") # Using the normalized Laplacian and looking for smallest eigenvalues does # not work well with irlba's partial_eigen routine, so form the shifted # Laplacian and look for largest eigenvalues L <- form_modified_laplacian(A) res <- irlba_eigs_sym(L, ndim, smallest = FALSE) if (is.null(res) || !is.list(res) || !"vectors" %in% names(res) || is.null(res$vectors) || tryCatch( is.na(ncol(res$vectors)), error = function(e) { TRUE } ) || ncol(res$vectors) < ndim) { message( "Spectral initialization failed to converge, ", "using random initialization instead" ) n <- nrow(A) return(rand_init(n, ndim)) } # shift back the eigenvalues res$values <- 2.0 - res$values sort_eigenvectors(res, ndim) } # Default UMAP initialization # spectral decomposition of the normalized Laplacian + some noise spectral_init <- function(A, ndim = 2, verbose = FALSE, force_irlba = FALSE) { if (nrow(A) < 3) { tsmessage("Graph too small, using random initialization instead") return(rand_init(nrow(A), ndim)) } if (rspectra_is_installed() && !force_irlba) { tsmessage("Initializing from normalized Laplacian + noise (using RSpectra)") coords <- rspectra_normalized_laplacian_init(A, ndim, verbose = FALSE) } else { tsmessage("Initializing from normalized Laplacian + noise (using irlba)") coords <- irlba_tsvd_normalized_laplacian_init(A, ndim, verbose = FALSE) } scale_and_jitter(coords, max_coord = 10.0, sd = 0.0001) } irlba_spectral_init <- function(A, ndim = 2, verbose = FALSE) { if (nrow(A) < 3) { tsmessage("Graph too small, using random initialization instead") return(rand_init(nrow(A), ndim)) } tsmessage("Initializing from normalized Laplacian (using irlba) + noise") coords <- irlba_normalized_laplacian_init(A, ndim, verbose = FALSE) scale_and_jitter(coords, max_coord = 10.0, sd = 0.0001) } # Scales coords so that the largest absolute coordinate is 10.0 then jitters by # adding gaussian noise with mean 0 and standard deviation sd scale_and_jitter <- function(coords, max_coord = 10.0, sd = 0.0001) { expansion <- 10.0 / max(abs(coords)) (coords * expansion) + matrix(stats::rnorm(n = prod(dim(coords)), sd = sd), ncol = ncol(coords) ) } # Return the number of connected components in a graph (represented as a # sparse matrix). 
connected_components <- function(X) { Xt <- Matrix::t(X) connected_components_undirected(nrow(X), Xt@i, Xt@p, X@i, X@p) } # UMAP random initialization: uniform between +10 and -10 along each axis rand_init <- function(n, ndim, verbose = FALSE) { tsmessage("Initializing from uniform random") matrix(stats::runif(n = n * ndim, min = -10, max = 10), ncol = ndim) } # LargeVis random initialization: Gaussian with sd 1e-4 (like t-SNE) rand_init_lv <- function(n, ndim, verbose = FALSE) { tsmessage("Initializing from random Gaussian with sd = 1e-4") matrix(stats::rnorm(ndim * n, sd = 1e-4), n) } # Rescale embedding so that the standard deviation is the specified value. # Default gives initialization like t-SNE, but not random. Large initial # distances lead to small gradients, and hence small updates, so should be # avoided. scale_coords <- function(X, sdev = 1e-4, verbose = FALSE) { if (is.character(sdev) && sdev == "range") { # #99: range scale coordinates like python UMAP does tsmessage("Range-scaling initial input columns to 0-10") return(apply(X, 2, range_scale, max = 10.0)) } if (is.null(sdev)) { return(X) } tsmessage("Scaling init to sdev = ", sdev) scale_factor <- apply(X, 2, stats::sd) scale(X, scale = scale_factor / sdev) } # PCA # Calculates a matrix containing the first ndim columns of the PCA scores. # Returns the score matrix unless ret_extra is TRUE, in which case a list # is returned also containing the eigenvalues pca_init <- function(X, ndim = min(dim(X)), center = TRUE, ret_extra = FALSE, pca_method = "auto", verbose = FALSE) { if (inherits(X, "dist")) { res_mds <- stats::cmdscale(X, x.ret = TRUE, eig = TRUE, k = ndim) if (ret_extra || verbose) { lambda <- res_mds$eig lambda[lambda < 0] <- 0 varex <- sum(lambda[1:ndim]) / sum(lambda) tsmessage( "PCA (using classical MDS): ", ndim, " components explained ", formatC(varex * 100), "% variance" ) } scores <- res_mds$points return(scores) } # irlba warns about using too large a percentage of total singular value # so don't use if dataset is small compared to ndim if (pca_method == "auto") { if (ndim < 0.5 * min(dim(X))) { pca_method <- "irlba" } else { pca_method <- "svd" } } if (pca_method == "bigstatsr") { if (!bigstatsr_is_installed()) { warning( "PCA via bigstatsr requires the 'bigstatsr' package. ", "Please install it. 
Falling back to 'irlba'" ) pca_method <- "irlba" } } tsmessage("Using '", pca_method, "' for PCA") pca_fun <- switch(pca_method, irlba = irlba_scores, svdr = irlba_svdr_scores, svd = svd_scores, bigstatsr = bigstatsr_scores, stop("BUG: unknown svd method '", pca_method, "'") ) do.call(pca_fun, list( X = X, ncol = ndim, center = center, ret_extra = ret_extra, verbose = verbose )) } # Get scores by SVD svd_scores <- function(X, ncol = min(dim(X)), center = TRUE, ret_extra = FALSE, verbose = FALSE) { # need extra data if we want to re-apply PCA to new points in umap_transform rotation <- NULL xcenter <- NULL X <- scale(X, center = center, scale = FALSE) # do SVD on X directly rather than forming covariance matrix s <- svd(X, nu = ncol, nv = ifelse(ret_extra, ncol, 0)) D <- diag(c(s$d[1:ncol]), ncol, ncol) if (verbose || ret_extra) { # calculate eigenvalues of covariance matrix from singular values lambda <- (s$d^2) / (nrow(X) - 1) varex <- sum(lambda[1:ncol]) / sum(lambda) tsmessage( "PCA: ", ncol, " components explained ", formatC(varex * 100), "% variance" ) } scores <- s$u %*% D if (ret_extra) { rotation <- s$v xcenter <- attr(X, "scaled:center") } if (ret_extra) { list( scores = scores, lambda = lambda[1:ncol], rotation = rotation, center = xcenter ) } else { scores } } # Get PCA scores via irlba irlba_scores <- function(X, ncol, center = TRUE, ret_extra = FALSE, verbose = FALSE) { if (is.logical(X)) { tsmessage("Converting logical input to numeric for PCA initialization") # convert logical matrix to numeric X <- X * 1 } res <- irlba::prcomp_irlba(X, n = ncol, retx = TRUE, center = center, scale = FALSE ) report_varex(res, verbose) if (ret_extra) { list(scores = res$x, rotation = res$rotation, center = res$center) } else { res$x } } report_varex <- function(res, verbose = FALSE) { if (verbose) { ncol <- ncol(res$rotation) varex <- sum(res$sdev[1:ncol]^2) / res$totalvar tsmessage( "PCA: ", ncol, " components explained ", formatC(varex * 100), "% variance" ) } } # This function taken from irlba and modified to use irlba::svdr rather # than irlba::irlba prcomp_rsvd <- function(x, n = 3, retx = TRUE, center = TRUE, scale. = FALSE, ...) { a <- names(as.list(match.call())) ans <- list(scale = scale.) if ("tol" %in% a) { warning("The `tol` truncation argument from `prcomp` is not supported by\n`prcomp_rsvd`. If specified, `tol` is passed to the `irlba` function to\ncontrol that algorithm's convergence tolerance. See `?prcomp_irlba` for help.") } if (is.data.frame(x)) { x <- as.matrix(x) } args <- list(x = x, k = n) if (is.logical(center)) { if (center) { args$center <- colMeans(x) } } else { args$center <- center } if (is.logical(scale.)) { if (is.numeric(args$center)) { f <- function(i) { sqrt(sum((x[, i] - args$center[i])^2) / (nrow(x) - 1L)) } scale. <- vapply(seq(ncol(x)), f, pi, USE.NAMES = FALSE) if (ans$scale) { ans$totalvar <- ncol(x) } else { ans$totalvar <- sum(scale.^2) } } else { if (ans$scale) { scale. <- apply(x, 2L, function(v) { sqrt(sum(v^2) / max( 1, length(v) - 1L )) }) f <- function(i) { sqrt(sum((x[, i] / scale.[i])^2) / (nrow(x) - 1L)) } ans$totalvar <- sum(vapply(seq(ncol(x)), f, pi, USE.NAMES = FALSE )^2) } else { f <- function(i) sum(x[, i]^2) / (nrow(x) - 1L) ans$totalvar <- sum(vapply(seq(ncol(x)), f, pi, USE.NAMES = FALSE )) } } if (ans$scale) { args$scale <- scale. } } else { args$scale <- scale. 
f <- function(i) { sqrt(sum((x[, i] / scale.[i])^2) / (nrow(x) - 1L)) } ans$totalvar <- sum(vapply(seq(ncol(x)), f, pi, USE.NAMES = FALSE)) } if (!missing(...)) { args <- c(args, list(...)) } s <- do.call(irlba::svdr, args = args) ans$sdev <- s$d / sqrt(max(1, nrow(x) - 1)) ans$rotation <- s$v colnames(ans$rotation) <- paste("PC", seq(1, ncol(ans$rotation)), sep = "" ) ans$center <- args$center if (retx) { ans <- c(ans, list(x = sweep(s$u, 2, s$d, FUN = `*`))) colnames(ans$x) <- paste("PC", seq(1, ncol(ans$rotation)), sep = "" ) } class(ans) <- c("irlba_prcomp", "prcomp") ans } irlba_svdr_scores <- function(X, ncol, center = TRUE, ret_extra = FALSE, verbose = FALSE) { # 5 iterations is the default for scikit-learn TruncatedSVD res <- prcomp_rsvd( X, n = ncol, retx = TRUE, center = center, scale. = FALSE, it = 5 ) report_varex(res, verbose) if (ret_extra) { list( scores = res$x, rotation = res$rotation, center = res$center ) } else { res$x } } init_is_spectral <- function(init) { res <- pmatch(tolower(init), c( "normlaplacian", "spectral", "laplacian", "inormlaplacian", "ispectral", "agspectral", "irlba_spectral", "irlba_laplacian" )) length(res) > 0 && !is.na(res) } rand_nbr_graph <- function(n_vertices, n_nbrs, val) { nng_to_sparse(rand_nbr_idx(n_vertices, n_nbrs), val = val, max_nbr_id = n_vertices ) } rand_nbr_idx <- function(n_vertices, n_nbrs) { idx <- matrix(nrow = n_vertices, ncol = n_nbrs) nv1 <- n_vertices - 1 for (i in 1:n_vertices) { ids <- sample.int(nv1, n_nbrs) id_sel <- ids >= 1 ids[id_sel] <- ids[id_sel] + 1 idx[i, ] <- ids } idx } # V: the current affinity graph # n_pos: number of neighbors to retain per item # n_neg: number of "negative" (i.e. non-)neighbors per item # pos_affinity: value for the positive affinity (associated with nbrs) # neg_affinity: value for the negative affinity (associated with neg nbrs) approx_affinity_graph <- function(V, n_neg, pos_affinity = 1, neg_affinity = 0.1, verbose = FALSE) { pos_V <- V pos_V@x <- rep(pos_affinity, length(pos_V@x)) pos_V <- 0.5 * (pos_V + Matrix::t(pos_V)) neg_V <- rand_nbr_graph(nrow(pos_V), n_nbrs = n_neg, val = neg_affinity) neg_V <- 0.5 * (neg_V + Matrix::t(neg_V)) # the cleanup below will ensure that where the same value got a pos and neg # affinity it will end up positive graph <- pos_V + neg_V # clamp small values to neg_affinity graph@x[graph@x < pos_affinity] <- neg_affinity # and large values to pos_affinity graph@x <- pmin(graph@x, pos_affinity) Matrix::drop0(graph) } # Initialize using a spectral decomposition of an "approximate global" graph # Uses the same graph as standard UMAP, but with each entry set to 1. A measure # of global structure is added by randomly setting some of the remaining zero # to a smaller value (0.1 in this case). # This routine is inspired by some ideas in # 2-D Embedding of Large and High-dimensional Data with Minimal Memory and Computational Time Requirements # Witold Dzwinel, Rafal Wcislo, Stan Matwin # https://arxiv.org/abs/1902.01108 # # Randomized Near Neighbor Graphs, Giant Components, and Applications in Data Science # George C. 
Linderman, Gal Mishne, Yuval Kluger, Stefan Steinerberger # https://arxiv.org/abs/1711.04712 agspectral_init <- function(V, n_neg_nbrs, pos_affinity = 1, neg_affinity = 0.1, ndim = 2, verbose = FALSE) { graph <- approx_affinity_graph(V, n_neg_nbrs, pos_affinity = pos_affinity, neg_affinity = neg_affinity, verbose = verbose ) spectral_init(graph, ndim = ndim, verbose = verbose) } uwot/R/util.R0000644000176200001440000001714414733727752012601 0ustar liggesusersstime <- function() { format(Sys.time(), "%T") } # message with a time stamp # appears only if called from an environment where a logical verbose = TRUE # OR force = TRUE tsmessage <- function(..., domain = NULL, appendLF = TRUE, force = FALSE, time_stamp = TRUE) { verbose <- get0("verbose", envir = sys.parent()) if (force || (!is.null(verbose) && verbose)) { msg <- "" if (time_stamp) { msg <- paste0(stime(), " ") } message(msg, ..., domain = domain, appendLF = appendLF) utils::flush.console() } } # log vector information summarize <- function(X, msg = "") { summary_X <- summary(X, digits = max(3, getOption("digits") - 3)) tsmessage(msg, ": ", paste(names(summary_X), ":", summary_X, "|", collapse = "" ), force = get0("verbose", envir = sys.parent()) ) } # pluralize("thread", 1) => "1 thread" # pluralize("thread", 2) => "2 threads" pluralize <- function(str, n, prefix = NULL, inc_num = TRUE) { if (n == 0) { return("") } ret <- paste0(str, ifelse(n != 1, "s", "")) if (inc_num) { ret <- paste0(n, " ", ret) } if (!is.null(prefix)) { ret <- paste0(prefix, " ", ret) } ret } # convert data frame to matrix using numeric columns x2m <- function(X) { if (!methods::is(X, "matrix")) { m <- as.matrix(X[, which(vapply(X, is.numeric, logical(1)))]) } else { m <- X } m } # given a metric argument, returns a list containing: # metrics - the input list with any members called "categorical" removed # categoricals - a vector of the categorical ids find_categoricals <- function(metrics) { res <- list( metrics = metrics ) if (is.list(metrics)) { cat_pos <- grep("categorical", names(metrics)) if (length(cat_pos) > 0) { cat_ids <- unlist(metrics[cat_pos]) names(cat_ids) <- NULL res <- list( metrics = metrics[-cat_pos], categoricals = cat_ids ) } } res } # Splits a list into its named and unnamed components: # > lsplit_unnamed(list(1:10, pca_center = FALSE)) # $named # $named$pca_center # [1] FALSE # # # $unnamed # $unnamed[[1]] # [1] 1 2 3 4 5 6 7 8 9 10 lsplit_unnamed <- function(l) { lnames <- names(l) if (is.null(lnames)) { return(list(unnamed = l)) } is_named <- lnames != "" nids <- which(is_named) uids <- which(!is_named) if (length(uids) == 0) { return(list(named = l[nids])) } list( named = l[nids], unnamed = l[uids] ) } # Do work and update a progress bar progress_for <- function(n, nchunks, fun) { message("0% 10 20 30 40 50 60 70 80 90 100%") message("[----|----|----|----|----|----|----|----|----|----|") remaining <- n chunk_end <- 0 for (i in 1:nchunks) { chunk_start <- chunk_end + 1 chunk_end <- chunk_start + round(remaining / (nchunks - i + 1)) - 1 remaining <- remaining - (chunk_end - chunk_start + 1) fun(chunk_start, chunk_end) message("*", appendLF = FALSE) utils::flush.console() } message("|") } checkna <- function(X) { if (!is.null(X) && any(is.na(X))) { stop("Missing values found in 'X'") } } check_graph <- function(graph, expected_rows = NULL, expected_cols = NULL, bipartite = FALSE) { idx <- graph$idx dist <- graph$dist if (!methods::is(idx, "matrix")) { stop("neighbor graph must contain an 'idx' matrix") } if (!methods::is(dist, "matrix")) 
{ stop("neighbor graph must contain a 'dist' matrix") } if (!all(dim(idx) == dim(dist))) { stop("'idx' and 'dist' matrices must have identical dimensions") } # graph may be our only source of input data, in which case no other source # to validate from if (!is.null(expected_rows)) { if (nrow(idx) != expected_rows) { stop("idx matrix has unexpected number of rows") } } if (!is.null(expected_cols) && !is.na(expected_cols)) { if (ncol(idx) != expected_cols) { stop("idx matrix has unexpected number of columns") } } # if looking at neighbors within one graph there can't be more neighbors # than observations if (!bipartite) { if (ncol(idx) > nrow(idx)) { stop("Invalid neighbors: number exceeds number of observations") } if (max(idx) > nrow(idx)) { stop("Invalid neighbors: max index exceeds number of observations") } } } check_sparse_graph <- function(graph, expected_rows = NULL, expected_cols = NULL, bipartite = FALSE) { if (!is.null(expected_rows)) { if (nrow(graph) != expected_rows) { stop("Sparse distance matrix has unexpected number of rows") } } if (!is.null(expected_cols)) { if (ncol(graph) != expected_cols) { stop("Sparse distance matrix has unexpected number of cols") } } if (!bipartite) { if (nrow(graph) != ncol(graph)) { stop("Sparse distance matrix must have same number of rows and cols") } } } check_graph_list <- function(graph_list, expected_rows = NULL, expected_cols = NULL, bipartite = FALSE) { if (nn_is_single(graph_list)) { graph_list <- list(graph_list) } num_nns <- length(graph_list) if (num_nns == 0) { stop("precalculated graph list is empty") } for (i in 1:num_nns) { graph <- graph_list[[i]] if (is.list(graph)) { check_graph(graph, expected_rows, expected_cols, bipartite = bipartite) } else if (is_sparse_matrix(graph)) { check_sparse_graph(graph, expected_rows, expected_cols, bipartite = bipartite ) } else { stop("Unknown neighbor data format") } } num_nns } nn_graph_row_names_list <- function(graph_list) { if (nn_is_single(graph_list)) { graph_list <- list(graph_list) } xnames <- NULL for (i in 1:length(graph_list)) { graph <- graph_list[[i]] if (is.list(graph)) { xnames <- nn_graph_row_names(graph) } else if (is_sparse_matrix(graph)) { xnames <- row.names(graph) } else { stop("Unknown neighbor data format") } if (!is.null(xnames)) { break } } xnames } # from a nn graph (or list) get the first non-NULL row names nn_graph_row_names <- function(graph) { xnames <- NULL if (!is.null(row.names(graph$idx))) { xnames <- row.names(graph$idx) } if (is.null(xnames) && !is.null(row.names(graph$dist))) { xnames <- row.names(graph$dist) } xnames } nn_graph_nbrs_list <- function(graph_list) { if (nn_is_single(graph_list)) { graph_list <- list(graph_list) } sapply(graph_list, nn_graph_nbrs) } # from a nn graph (or list) get the number of neighbors nn_graph_nbrs <- function(graph) { if (is.list(graph)) { ncol(graph$idx) } else if (is_sparse_matrix(graph)) { NA } else { stop("Unknown neighbor data format") } } is_sparse_matrix <- function(m) { methods::is(m, "sparseMatrix") } # Add the (named) values in l2 to l1. 
# Use to override default values in l1 with user-supplied values in l2
lmerge <- function(l1, l2) {
  for (name in names(l2)) {
    l1[[name]] <- l2[[name]]
  }
  l1
}

range_scale <- function(x, min = 0, max = 1) {
  (x - min(x)) / (max(x) - min(x)) * (max - min) + min
}

is_installed <- function(pkgname) {
  requireNamespace(pkgname,
    quietly = TRUE,
    warn.conflicts = FALSE
  )
  isNamespaceLoaded(pkgname)
}

is_win7 <- function() {
  sys_info <- Sys.info()
  sys_info[["sysname"]] == "Windows" &&
    strsplit(sys_info["release"], split = " ")$release[[1]] == "7"
}
uwot/vignettes/0000755000176200001440000000000014757004314013265 5ustar liggesusers
uwot/vignettes/uwot.Rmd0000644000176200001440000003712614756267051014740 0ustar liggesusers
---
title: "uwot"
output:
  rmarkdown::html_vignette:
    fig_width: 4
    fig_height: 4
vignette: >
  %\VignetteIndexEntry{uwot}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

```{r setup}
library(uwot)
library(RSpectra)
```

`uwot` is a package for implementing the UMAP dimensionality reduction method. For more information on UMAP, see the [original paper](https://arxiv.org/abs/1802.03426) and the [Python package](https://github.com/lmcinnes/umap).

We'll use the `iris` dataset in these examples. It's not the ideal dataset because it's neither terribly large nor high-dimensional (with only 4 numeric columns), but you'll get the general idea. The default output dimensionality of UMAP is two dimensions, so it's amenable to visualization, but you can set a larger value with `n_components`. In this vignette we'll stick with two dimensions.

We will need a function to make plotting easier:

```{r plot function}
kabsch <- function(pm, qm) {
  pm_dims <- dim(pm)
  if (!all(dim(qm) == pm_dims)) {
    stop(call. = TRUE, "Point sets must have the same dimensions")
  }
  # The rotation matrix will have (ncol - 1) leading ones in the diagonal
  diag_ones <- rep(1, pm_dims[2] - 1)

  # center the points
  pm <- scale(pm, center = TRUE, scale = FALSE)
  qm <- scale(qm, center = TRUE, scale = FALSE)

  am <- crossprod(pm, qm)

  svd_res <- svd(am)
  # use the sign of the determinant to ensure a right-hand coordinate system
  d <- determinant(tcrossprod(svd_res$v, svd_res$u))$sign
  dm <- diag(c(diag_ones, d))

  # rotation matrix
  um <- svd_res$v %*% tcrossprod(dm, svd_res$u)

  # Rotate and then translate to the original centroid location of qm
  sweep(t(tcrossprod(um, pm)), 2, -attr(qm, "scaled:center"))
}

iris_pca2 <- prcomp(iris[, 1:4])$x[, 1:2]

plot_umap <- function(coords, col = iris$Species, pca = iris_pca2) {
  plot(kabsch(coords, pca), col = col, xlab = "", ylab = "")
}
```

Most of this code is just the [Kabsch algorithm](https://en.wikipedia.org/wiki/Kabsch_algorithm) to align two point sets, which I am going to use to align the results of UMAP over the first two principal components. This is to keep the relative orientation of the output the same across different plots, which makes it a bit easier to see the differences between them. UMAP is a stochastic algorithm, so the output will be different each time you run it, and small changes to the parameters can affect the *absolute* values of the coordinates, although the interpoint differences are usually similar. There's no need to go to such trouble in most circumstances: the output of `umap` is a perfectly useful 2D matrix of coordinates you can pass into a plotting function with no further processing required.

## Basic UMAP

The defaults of the `umap` function should work for most datasets. No scaling of the input data is done, but non-numeric columns are ignored:

```{r basic UMAP}
set.seed(42)
iris_umap <- umap(iris)
plot_umap(iris_umap)
```
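As mentioned above, the return value really is just a plain matrix of coordinates, with one row per observation and (by default) two columns:

```{r umap output}
dim(iris_umap)
```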
### Parameters

`uwot` has accumulated many parameters over time, but most of the time there are only a handful you need to worry about. The most important ones are:

#### `min_dist`

This is a mainly aesthetic parameter, which defines how close points can get in the output space. A smaller value tends to make any clusters in the output more compact. You should experiment with values between 0 and 1, although don't choose exactly zero. The default is 0.01, which seems a bit small for `iris`. Let's crank up `min_dist` to `0.3`:

```{r min_dist 0.3}
set.seed(42)
iris_umap_md03 <- umap(iris, min_dist = 0.3)
plot_umap(iris_umap_md03)
```

This has made the clusters bigger and closer together, so we'll use `min_dist = 0.3` for the other examples with `iris`.

#### `n_neighbors`

This defines the number of items in the dataset that define the neighborhood around each point. Set it too low and you will get a more fragmented layout. Set it too high and you will get something that misses a lot of local structure. Here's a result with 5 neighbors:

```{r 5 neighbors}
set.seed(42)
iris_umap_nbrs5 <- umap(iris, n_neighbors = 5, min_dist = 0.3)
plot_umap(iris_umap_nbrs5)
```

It's not hugely different from the default of 15 neighbors, but the clusters are a bit more broken up. There should be a more pronounced difference going the other way and looking at 100 neighbors:

```{r 100 neighbors}
set.seed(42)
iris_umap_nbrs100 <- umap(iris, n_neighbors = 100, min_dist = 0.3)
plot_umap(iris_umap_nbrs100)
```

Here there is a much more uniform appearance to the results. It's always worth trying a few different values of `n_neighbors`, especially larger ones, although larger values will lead to longer run times. Sometimes small clusters that you think are meaningful may in fact be artifacts of setting `n_neighbors` too small, so starting with a larger value and looking at the effect of reducing `n_neighbors` can help you avoid over-interpreting results.

#### `init`

The default initialization of UMAP is to use spectral initialization, which acts upon the (symmetrized) k-nearest neighbor graph that is determined by your choice of `n_neighbors`. This is usually a good choice, but it involves a very sparse matrix, which can sometimes be a bit *too* sparse; that leads to numerical difficulties which manifest as slow run times or even hanging calculations. If your dataset causes these issues, you can try increasing `n_neighbors`, but I have seen cases where that would be inconvenient in terms of CPU and RAM usage. An alternative is to use the first two principal components of the data, which at least uses the data you provide to give a solid global picture that UMAP can refine. It's not appropriate for every dataset, but in most cases it's a perfectly good alternative.

The only gotcha with it is that, depending on the scaling of your data, the initial coordinates can have large inter-point distances. UMAP will not optimize that well, so such an output should be scaled to a small standard deviation. If you set `init = "spca"`, it will do all that for you, although to be more aligned with the UMAP coordinate initialization, I recommend you also set `init_sdev = "range"`. `init_sdev` can also take a numerical value for the standard deviation. Values from `1e-4` to `10` are reasonable, but I recommend you stick to the default of `"range"`.

```{r spca init}
set.seed(42)
iris_umap_spca <- umap(iris,
  init = "spca", init_sdev = "range",
  min_dist = 0.3
)
plot_umap(iris_umap_spca)
```

This doesn't have a big effect on `iris`, but it's good to know about as an option, and it can also smooth out the effect of changing `n_neighbors` on the initial coordinates with the standard spectral initialization, which can make it easier to see the effect of changing `n_neighbors` on the final result.

Some other `init` options to know about:

* `"random"`: if the worst comes to the worst, you can always fall back to randomly assigning the initial coordinates. You really want to avoid this if you can though, because it will take longer to optimize the coordinates to the same quality, so you will need to increase `n_epochs` to compensate. Even if you do that, it's *much* more likely that you will end up in a minimum that is less desirable than one based on a good initialization. This will make interpreting the results harder, as you are more likely to end up with different clusters being split or mixed with each other.
* If you have some coordinates you like from another method, you can pass them in as a matrix. But remember you will probably want to scale them with `init_sdev` (see the sketch after this list).
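As a minimal sketch of passing coordinates in directly, here is the PCA result we computed earlier used as the initial layout (any matrix with one row per observation and `n_components` columns would work the same way):

```{r matrix init, eval = FALSE}
set.seed(42)
# iris_pca2 was computed in the plotting chunk above
iris_umap_from_pca <- umap(iris,
  init = iris_pca2, init_sdev = "range",
  min_dist = 0.3
)
plot_umap(iris_umap_from_pca)
```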
#### `dens_scale`

The `dens_scale` parameter varies from 0 to 1 and controls how much of the relative density of the input data is preserved in the output.

```{r UMAP with density scaling}
set.seed(42)
iris_umapds <- umap(iris, min_dist = 0.3, dens_scale = 0.5)
plot_umap(iris_umapds)
```

This has shrunk the black cluster on the left of the plot (those are of species `setosa`), which reflects that the `setosa` points are less spread out in the input data than the other two species. For more on `dens_scale` please read its dedicated [article](https://jlmelville.github.io/uwot/articles/leopold.html).

## Embedding New Data

Once you have an embedding, you can use it to embed new data, although you need to remember to ask for a "model" to be returned. Instead of just the coordinates, you will now get back a list which contains all the extra parameters you will need for transforming new data. The coordinates are still available in the `$embedding` component.

Let's try building a UMAP with just the `setosa` and `versicolor` iris species:

```{r create a UMAP model}
set.seed(42)
iris_train <- iris[iris$Species %in% c("setosa", "versicolor"), ]
iris_train_umap <- umap(iris_train, min_dist = 0.3, ret_model = TRUE)
plot(
  iris_train_umap$embedding,
  col = iris_train$Species,
  xlab = "", ylab = "",
  main = "UMAP setosa + versicolor"
)
```

Next, you can use `umap_transform` to embed the new points:

```{r embed new coordinates}
iris_test <- iris[iris$Species == "virginica", ]
set.seed(42)
iris_test_umap <- umap_transform(iris_test, iris_train_umap)
plot(
  rbind(iris_train_umap$embedding, iris_test_umap),
  col = iris$Species,
  xlab = "", ylab = "",
  main = "UMAP transform virginica"
)
```

The green points in the top-right show the embedded data. Note that the original (black and red) clusters do not get optimized any further. While we haven't perfectly reproduced the full UMAP, the `virginica` points are located in more or less the right place, close to the `versicolor` items. Just like with any machine learning method, you must be careful with how you choose your training set.
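A model like this contains an Annoy index held as an external pointer, so don't expect a plain `saveRDS`/`readRDS` round trip to preserve it. As a minimal sketch of persisting a model for use in a later session (this assumes the `save_uwot` and `load_uwot` helpers exported by recent versions of `uwot`):

```{r save and load the model, eval = FALSE}
# save_uwot bundles the model and its Annoy index into a single file
model_file <- tempfile("iris_umap_model")
save_uwot(iris_train_umap, file = model_file)

# later, perhaps in a new R session: reload and transform as before
iris_train_umap2 <- load_uwot(model_file)
iris_test_umap2 <- umap_transform(iris_test, iris_train_umap2)
```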
## Supported Distances

For small datasets (N < 4096) and Euclidean distance, exact nearest neighbors are found using the [FNN](https://cran.r-project.org/package=FNN) package. Otherwise, approximate nearest neighbors are found using [RcppAnnoy](https://cran.r-project.org/package=RcppAnnoy). The supported distance metrics (set by the `metric` parameter) are:

* Euclidean
* Cosine
* Pearson Correlation (`correlation`)
* Manhattan
* Hamming

Exactly what constitutes the cosine distance can differ between packages. `uwot` tries to follow how the Python version of UMAP defines it, which is 1 minus the cosine similarity. This differs slightly from how Annoy defines its angular distance, so be aware that `uwot` internally converts the Annoy version of the distance. Also be aware that the Pearson correlation distance is the cosine distance applied to row-centered vectors.

If you need other metrics, and can generate the nearest neighbor info externally, you can pass the data directly to `uwot` via the `nn_method` parameter.

Please note that the Hamming support is a lot slower than the other metrics. I do not recommend using it if you have more than a few hundred features, and even then expect it to take several minutes during the index building phase in situations where the Euclidean metric would take only a few seconds.

## Multi-threading Support

Parallelization can be used for the nearest neighbor index search, the smooth knn/perplexity calibration, and the optimization, which is the same approach that [LargeVis](https://github.com/lferry007/LargeVis) takes. You can (and should) adjust the number of threads via the `n_threads` parameter, which controls the nearest neighbor search and smooth knn calibration, and the `n_sgd_threads` parameter, which controls the number of threads used during optimization. For `n_threads`, the default is the number of available cores. For `n_sgd_threads` the default is `0`, which ensures reproducibility of results with a fixed seed.
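As a minimal sketch of what this looks like in practice (the thread counts here are arbitrary):

```{r threading, eval = FALSE}
# parallel neighbor search and calibration; single-threaded (and therefore
# reproducible) optimization
set.seed(42)
iris_umap_mt <- umap(iris, n_threads = 2, n_sgd_threads = 0)

# multi-threaded optimization is faster, but no longer reproducible even
# with a fixed seed
iris_umap_mt2 <- umap(iris, n_threads = 2, n_sgd_threads = 2)
```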
## Python Comparison

For the datasets I've tried it with, the results look at least reminiscent of those obtained using the [official Python implementation](https://github.com/lmcinnes/umap). Below are results for the 70,000 MNIST digits (downloaded using the [snedata](https://github.com/jlmelville/snedata) package). First is the result of using the official Python UMAP implementation (via the [reticulate](https://cran.r-project.org/package=reticulate) package). Under that is the result of using `uwot`.

```{r, echo=FALSE, out.width="75%", fig.cap="MNIST UMAP (Python)"}
knitr::include_graphics("mnist-py.png")
```

```{r, echo=FALSE, out.width="75%", fig.cap="MNIST UMAP (R)"}
knitr::include_graphics("mnist-r.png")
```

The project documentation contains some more [examples](https://jlmelville.github.io/uwot/articles/umap-examples.html), and a [comparison with Python](https://jlmelville.github.io/uwot/articles/pycompare.html).

## Limitations and Other Issues

### Nearest Neighbor Calculation

`uwot` leans heavily on the [Annoy](https://github.com/spotify/annoy) library for approximate nearest neighbor search. As a result, compared to the Python version of UMAP, `uwot` has much more limited support for different distance measurements, and no support for sparse matrix data input.

However, `uwot` *does* let you pass in nearest neighbor data. So if you have access to other nearest neighbor methods, you can generate data that can be used with `uwot`. See the [Nearest Neighbor Data Format](https://jlmelville.github.io/uwot/articles/nearest-neighbors-format.html) article. Or if you can calculate a distance matrix for your data, you can pass it in as a `dist` object. For larger distance matrices, you can pass in a `sparseMatrix` (from the [Matrix](https://cran.r-project.org/package=Matrix) package).

Experience with [COIL-100](https://cave.cs.columbia.edu/repository/COIL-100), which has 49,152 features, suggests that Annoy will *definitely* struggle with datasets of this dimensionality. Even 3000 dimensions can cause problems, although this is not a difficulty specific to Annoy. Reducing the data to an intermediate dimensionality (e.g. 100) with PCA can help; use e.g. `pca = 100` to do this. This can also be slow on platforms without good linear algebra support, and you should assure yourself that 100 principal components won't be throwing away excessive amounts of information.

### Spectral Initialization

The spectral initialization default for `umap` (and the Laplacian Eigenmap initialization, `init = "laplacian"`) can sometimes run into problems. If it fails to converge it will fall back to random initialization, but on occasion I've seen it take an extremely long time (a couple of hours) to converge. Recent changes have hopefully reduced the chance of this happening, but if initialization is taking more than a few minutes, I suggest stopping the calculation and using the scaled PCA (`init = "spca"`) instead.

## Supporting Libraries

All credit to the following packages which do a lot of the hard work:

* Coordinate initialization uses [RSpectra](https://cran.r-project.org/package=RSpectra) to do the eigendecomposition of the normalized Laplacian.
* The optional PCA initialization and initial dimensionality reduction uses [irlba](https://cran.r-project.org/package=irlba).
* The smooth k-nearest neighbor distance and stochastic gradient descent optimization routines are written in C++ (using [Rcpp](https://cran.r-project.org/package=Rcpp)), aping the Python code as closely as possible.
* Some of the multi-threading code is based on [RcppParallel](https://github.com/RcppCore/RcppParallel).

uwot/vignettes/mnist-py.png0000644000176200001440000021121414730166740015557 0ustar liggesusers
[binary PNG image data omitted]
B"M wolO 0l5+6bᬫ/И&k.}ĊA(LC1β\Xp1M%*vapX&)3D=L2B,aJcqԏYp-$gK /Nxe5:@@N3iz CT VF +_cF% §X!%)UBq R4(iILף9L14aDc7 dit gDk7vS ivؙ͑|Z\қ"*moelw3KޑBֶϭ8CaM}.әdk21?1@c 4^nD'$;9 Gbu:d3f7Yjr8WĊ4hX'Xb 3^gq|pIA!rrʐ&/gSjl 7J(j֒&tʘɅX+g&#Utc_c RXKM5Nik{KS08@DE͒79M&Ll~+OY3S>A$G-l %yv|֋YBiB|!mX=_cy NLj^_ 3^`'̑Gidz1 78G&BY}byTN4Pq˘ XY?b`*nwORP1 `Ҝ@=E|'PX% :Y'YTzg/)И/3rb?OtpX%esu*Klba!Yb$ΓROhX8.q-.M58:j4gH}y8LBPxoƥPT3f^EM.HذX 6L*iZc2 fMGQ<۟24OP\xf06,+Q=ɪJȃQi/z%<)mu 8^UtQw<'AxgBѕ]rFj9$:8KTti s^uqH H9NNatXO切 sZ =c[ˑ&P&y$Et:8"M4LY T0sĻVhEǸl:rJ8ѝw )(FhO,cf좞<^1!5L@ݬ`Է)<_qa 3s ӗF2<PF"<;RC*6Nk ɋ4ZDYIo:J!y58FT dp4K0<臏Blϖ\IBmzn&wr+kԛOML]b=.n`Rgk 8 xUKy7wqSG  $X:,,Q`zTOG[&]5c5vpQt[>C~y$$oGRP1r]^-sWʘ3S X5::י9J+q4pV6RӋl0#z5HOSx 3Z.&7M!0I"K".KgNǛ`6z T!-lPOHWi[C{Nݳ}I !'1t$XΓc 닔 A6YPz(>$q`;%*rqo; w<}uSϱ}t.ǘ%]z0jb/Q#]J=O@u_; ءץDNTd)Kf?[3}PI !E)+KMl6{Yʧ?79lW=[mx]0$keKɹf?<%&d>MRP(*ڍ@@K+ZBb (c/1D 4B*k1%J?C}Emh(^3 gHiz:$0 fFDeqX1Oف"y28KB0C,6>t9U1<i6sl4nBpte[o,thŲnM3ws|i ayޚ+ԦI~->$BJ Oa 6$P[tpp=Moy?Djf^4fIvpX ϓ¢zD+K+@Bmz]UMl MdoykL$_H_P)tJPз E##$ \ZdCG<6 p/S._ KlH{K:%h^Y zQ=6 4C JϓyOO8 KIh(%Z l v-Ij5:9uKljtv!uڟ-Wþ`;i[\*n$)(x`T0˄Am ֎f&2.eK&76Opdr?(&EVƋl@CGүQOaxTg,Y Sd~c̑zw%Z >,%64 Ǚ\@["xj L` pujݛDH !!e;;;Tq`cɢj4r uÊDѨ&9OK/ d3DM#$ӋrS1QcXB:ybhQ.S3ha7Ca5k*_g4kO3{3˸JHGY#Oc'>q&% 0ҧ &?MRPϠ9#k&KKqA!2ZI#r^H[ ‹#(3YmiέC?-]:d_g sw$_qV!$)lc>O]. v)1MrKC_G#}w ]!I8ʘ(tP rIݙaV`Q-UdTXf0dsY&u=&6?#^IN`[YbaX?0ԃTF`h`_c!G'!jZ8yw^=˸C?I4k~KTbdhxiv$>b $McPBb8ϕ)MTBOl3IA!CI4 *jm*A Jg`U. W'ܪ:ѐ u Raq"(: 1M4UB >9@ϓ}vPOӁC/GfWk fЉUvX+Lh.4i2D9Sɩ-刟x>:5 [Sghy&]NNJ$s9cΓIV%tKTζ@Y=.=Ƽ+O0cm?AΒMԭ5ƀ3p׫~Mєf*scZv$R:=7n6;QƳU_jFbXo{)&5&x.FCx@­y4U:J jt5z 4iY$3ǀ$X2LV<$JZe piPq{4lr'kAzC.uh kDՊt'Cуk*xslYdc-qBBlFeҥJsĽ%v=߰Iq$< IDAT;8/OSHa$26 5:B N?h. W# R?$Y# r?,ZL17'P'f}5rFވ clZOA\''xDp ncC.PJJ#d$%6ȅo/Q9^A VOhK-<6!m>NOJ% ЃCէka'1 e XV8w\`L 5OF/''dފU ^.gk3b«/F1kFPeHS#hy~&]W_{/$!WR1ENCOr,I !!YmshdiI(8@hf/W*l1fuhX}ra/ɚ`B/D;B?k\_2#^, NI~WJymGP"(_+dFbtc9J'IċC;-Yb^iv-['r(B IA!#$9݈ jR:[`gތc Vl3^{1kh>;8ZwC)ILeB/쥸9rW餫M㤶V:oįY*'Fƽ3qI+.xg691|bq-7I|wBK!)(xksg.VW_\aXi1fE&T(4 1按/QQ"AH˦Ÿei[NrS*!3lfc${XNAs3Oy'3Go{ ,jFpv>v~$?"E˥%)75Cj>N֨:t{6l=X}\-ᷖd4YdvN5浹 3{ &=t> q&ÒZglq)͇2B̵`~,I !Ej$N`T1f Fk5c`hܨ:_w~ljcƼ tzL"D( h {ktfH{PS8G1I$> 2R&+۲Q8WPo⇜Q001/B<?L$ZIjh)]+.kZFc:Ĵ\09mЩ;}CVP6M*i5%(YysϳYボ:$t+yT?bS#\Ǭd5%ԍ`2ҟavΟrйa9I"e.xD })/Q?Uy}:\L;,p~Dp5>,%Ce$qlHER쑏0PgK֓ao,Yb%*2'/P2u<?a&7С@cjZc<I3kAsA>$(: |rR'V]ءiFd2܆`(ǻÆĠA RIjK`?[P;4/OfL!e"gS<|at܉83 "@Mfئk.2KޑB!?3q E{%m{CS9U#&l2cZ`nhk #o^\U-c81`sl!$~eac:M0И^#pxF !~Hi.(QOgM@V$fBAiWy>L }`h(0N[ qN:h2SYƬyG15LF]A(zO]⫐TrC c䛓JW0{(s0R[A,آ+'qBksf^- ϓvKދ%*Y)p, R^>׎vj# !]i5j)}1)/c=FL<^0]:Y!@V'$8@Laڠ@G(DsD(,ESEP4BY9~coW,P4::_IA!ģ+θk Q((~Z.dUlkԪ  [c~Uу eOն7f1FĿ& 1 âah(N-MkF(Uu +UzFD=څ_eB[N;^lWm bʮ1ifyL3Z5]KY>iBV6;sK33ef |y2S'VidfT:f#WD#cKmB:_/xg{ ^Bt-ȼʇvYiGt?Ik4b r*WLȺ˗e/$R1fNiDt>>`ͯm߱nRݓ !j͚F9U]COb@R iP(~+1kI-烍RiTd.D tpT3g}m6ᕈ;9a46*1vIY%:19ʼn}Za^wkΟ !RUVzD(z'ƘN6y*S<>P*U)@ˆZ&@-wj@;"X<(~K &BZޥ CC"2{?9MWze8T-"A;KL < &Lvh>ι25=c4#ЏiѩD/=pS߁|նVs͚2us8V%ujd|ti"ے//6{ko7W~{ԫ Ȝ jň؟$ڋ99Ʉ~#6}BT 449]5 j`ssEJmZAc##LZ``Vb/P2BEŤZu3 LwhNTwY=ZU[ ja%#ȸjkS f ~6R:JlIX.u&ԍD!42ʈBsU-HVtkK*i=ѪmV:8fuڥ"s0Ģe3ct ﮷u_^h?8S0 F@$d\=PGcK=m]bBSHToX|6Rn:KN=b~] Ȩ$6l! "ڶx+^QE OZ5lMmQUiᏢEϷ>$=Th`1o+r1~g+3S "e-`Q}K&z0PR#?bɱ Xq>4P_[6L4|` RF0dȒյVbn.W;APk DBRAR(`Db"DaӣۧoqPywjr8̎M"̒ U!-p٫cl+&{Ƙjnlb8^ھijXذ|k TmjWLh[WDS9SM:I'gR|u$f宽/˗H#?guyHdŇNӴy~B2miPPY2Q_86 w?1hzMbCa6:6u?1 z ca-̪Cw)  pM`b PX}Tfk_ eqSS{kae-jt'#O4~q wQUyw %tN?ڳ5|‘sUB'&.nl^pvbe_<<-*= B#7bP)X1)X2uÙͷ.@+lV- Ֆy]T>G",+ Mbdl}%)*0?jAUT=Pw|#gaO؁|wa 8נd0C ? 
O%ԼFuRw6{/N8o\a\LG)4H9H ~I `zeR)VWI$F9uHoRr܇R.GFpUEUY/P_l`YT|u|k7ϧu.'V9ʘeqfϭ=, 2OOpqw0裯m#?7-&hNFCqrs;E&J Znd,B^G*gA#4h㗟VۚgJe6}%>%F69N/**?9P07h9î]X ,,0?OdֈٷzNòH& hUתU.]׿& 6{ufb qtByh\20RmV(6eht$lρu-oMREۊV(WE8LKʙ3dҨTJXԒIt eui>x ŋ'wZe 1~%iơ-o)]ڰh[XkKVfZdNQlv8ȪGY/NTY12xR)|'ETۢ,8S'7IF`qiqf*WgC50ĕͽq0f[jC xF ,Qݏ-n")?{LLLL8Z_'T*D@b}`ǎP{HdM=Ma OmƥKRj]y3OOvBږ1>IX]Jn*g0IAqotN8@ Q#kS5B~mo٣T6Les͵0)}I{83Oay_LJ(:(T*e5>FA0X`_2̗݅acAjOM*K%UIC%L}EfDFd&C&hu]:J!2~z߫ -Tv}oogQ#(mh;2ud/ 7nP|T*T*OS'n"i VI&{r`ϽTs|kOQDh`i@B"c|$k*M:a/>zo`u]uqe>;;&CeQ, ֤/g4kܻealncj0qpI]OT>!IMdr)X,6ט4d:=J6!ٕdZ=J-lgJc&R}>L쵹YL0dm/DzF\;#>Hq%b"N\lR àף\ƶ1/=z= tNZ~Dfuzd!cXc oy)Myɚ7(sѱavpH??Dh40 #o}?/9ʅzBhbUIQ 2۹AH? }򟏏'EK&%Fˢy %UBH>x@:@.7&p$_}x {`>: $lx 4i@G&" $>x9fR0\γbpn)>!6[:E)8%^Zny=.e72׊DTTlOkM/Je9?׊5Frl4=cHB2aΞ.-d2&cL<o(29DZBa";,z\7ސKrq'Kui "9I d`6m[kE A:Ku CȀEXu؆0ϚD`"_נ U0*J<?;r,_7 =+*TEYxNWd6OO؂t HbP:JpB?I8?0>F!3N.܅ ,`#Y]rxB!arzNL@"p>ϲIoG2+5pZp煨kb9bT!eIY?W m|DZ.G=2# ܽK$B{LMHVbzn=Q_cfPqxH8+Lirv{$XuYΝ{ $dEPD J&~^ۛ4=wl%8E Mr QFsEDj Yy؆dJ-+Ƶt@>>N߅:)0t{9-Rz) hҨet1ٞZ[بg\-pOCo`ZÒZFCh[7J,2>\.ȫi]pb7"R2Lۈ>ewxv8RC>OM(ę3ܻG>_r0N"%ܾM $ẘ&"*&`Ih 2wbRxWm-s,~?wOBÛQ:۷%We{]ff.y yH$d9OX'bp7Wh0icJOhZ*?z&"8GP09)xLǍp`vAl8 U.XEr:)OMR"0',q:ĕaZ/j-{w\:Ѻ݀LLhY"ݾU5Z\Jj. Jđ̀Y%oH?$7)1a~_PngfؠZF9wwa0t8yrt]KnmP`{d)/Iƞ[,7th;$av玑paKneJRMEdRmӨ k56v UNV~{=0$A0aˠr` d\@<=3, U{8_A{ðt*6Nk2G3b#9V@o0!QQ F@e\qj5$${m^x2#…D@JQPXwUcDƉ&)?zH??9qSTÉxM&',lE.C(DH.G~@:nQ3914td2hCC"1#q@:M8oyj"<Jܾ;5*:ح1EAsd\ nE C@PE0!q gP&0>}CeΔD?nKCԹFUuO ak;UX~|D\/1d CHP(j/0بZjiWfu]na.Fkc1ݏ\! dLA3aiTIftL:&ޠR(8 T/OMHU套UB!֨TDsi4ٳ x76FႱ")4mb²FH$0M4=)5]kDrDKL%$,NJ5 $,iZ93>QIݰޞsZu=rH0ѶBi6IcƱO} y-lsR&6BzN ]?ݥ,4w4ý0ʩ޹M@^k7Y^&auAѣ5&Mה+v0t)ז_IiX:qhZWk'JL-h%yM0 "7NIcU78If]vkȁC{M- ~ kJ$=ܪ6P׈$hZߗJz#Pс_ִHb<m45baQj=:gtX'\y@˙Q^tD_hl,L@p*i *LB5<.h3fP,h6y#C`Xt4z/x¨8!9 UG{?٤hLGg1,"ML"yFIQa DBltF/=_/}L%폾``uW$)BN(%xYzG(2(Xz 'QE8}MM?$۠fw֊tSgR!GZudelY I²4b(8tb .@IPsHA|Ifdu |ʓ 9SS]uK01fFbJMr:PlqoDYd^Y@9~ܧ7leU؇> )!%I1 K@Ŭ)A 8琗0{N2! |v82.uhڎMo b-70cI?ݾ Q~&>}K1*6.AO%cu? "v/ ss,.r ߏ< 07or$e{@尻dc]wOF= WHgxQͯTRajJh$'U %*MLʗؘx:\ M֋l Ju,s<("&Jr+zK W }\"a.xq24DadtX>|`&h[a8=@ % p ⌌C 1o:d+̍p(vTՙjyG|1&]j{+PUAK1,*]MR1d6cBB'nKurM՛lQɓH;Q*I]IMZT9*㑬p@W!Gp\4ctZz|wv֙|LT3 >M&e΂A^xbi H&yQHHrD"'v9dvl)\{Dbpx:-u]fi]Wej763C,.f88t{&1 22TDDZ:1.beYd!EFit?*>xlqh}0 $t!EӐA v讜v֊dq!*?)@Rg>j8E!C9ا7`ZE툐=Z(@f1BDv]!QC758s&mfΰy: &ݠc2aX,VjN"+( .5E.,86}Bs$x`zRݠZ2p.?k%sz}4hYdns- mMLLi4m33QV(^-4PY='8)tXQ_GQz=mnMK* 0fn ȍLG ?'HTUkofeξȣ5} i]c0DrCxj"$5!ѪnЬh A`8f%d`9NWۤ(bg8ROceee+c\C^ܷa82B0H>O6vڭ--a,.rcA8<:YW"I$xa@yPKqكWw[D)!f1G:Zw8Ao@S1G^fSVH,xSҮ0sT*%w7~b0*b(PA=kPj,C& p ,H!/h>ORӺ .?keq)F"ǘP"{m G"|;U|"qg@a qOA`]4X}NMďlO}zɰGQ4r_c^aunl1:vZK"rM&$+zH5b).|b)AB ҧ5.>o*.s | Q*8Cl8YNaJR,/;lm!YT"^Tbnh&S+n7IŅ,/& 2GC2ɩSrWܸW-5;yY;%\6{ mvd'Gs@2PjQh1حUܣq{h=*b2_/c,5X0_AGP  &4 :$ aL ҴK+>;O"%!+615ci/r g!Iݰ<5Ks9WdB hqBS^7|ũWK6ʖ\ 8hK]*D a @8ν5rzuHOc6QT^&bmAmab.jhzd+g{Nn1st g.#5vk|{k|f@IJukvvXo*LN)VVv1[ZӼ! 0H(ﺲ,b~l,h>u_4dfݪyj=(l.Dx{Ga:.c(hDB۠7zöYN8ML~4_}2[ga,B xAd! 1.&x.j5k؅oƟ 3\5\ަ*Uq_ VpLx }$5۩cBFMÑ,7^`HdM_׶$wl@b4Ƀ%GiMԴ5{6 /}fSZY<5IAQZ̝^amQAhVc)hT8q?H;5`Zz@N+I8T:RO!o+#_ĄT($ܺ (J#[YqO<=lxt::mL߱1gi=Z-fGq$"c2O3>N6K FB{uvw? Dp2I2a(5&!Wj1F 2"jTt*- Gxlg%k+N0B<kpj`Z d=Y "`x||4 f|Tb:"=uČ_ Pw(YvEnU\Ƭu Asg; պƲuyDN'p`/ݦz fEDFy"wF#"8!8 @D c1#$ ^f ,-vT|w!Ї^ $ { Ad@aꐁ?S}HI6'j?S>JOwUW ZC@n)g@cAL~}>HK&C 2%h{)CGO/HY xGTWHLP+p%&&R. 
PcvsQ>akYgb(}a$P,'/1N,>PTS(xўM#wGyF$R,ܺŅ Zx%?zf ,~?g25Ha0;(LL r&HY*lM{כ's9E'9R1)h٘]h9hQB}N$pU=4L5xXˤlQDfcJ&3)?}XJPq-Xh0p5 \@ &uÇhYyDAI&Y\Fh7oR.Z`&'f[ElH矧e{Fo@sQU.\U|>ϣGD| LL`OenP8}P%T"!fc}LZAg1xL۷OeAΤbE푹3̻ۑ0N-t'tqԲ~潣'pS pkr $^5x xV|A+ԉh"?uN>hKUM t]LUmS3D&tf}>)/wkԻPY;rM4Y?P]h IKJc}a `9}wGkl;dי鲵BI(Pءf$3hf2=w]/}7nxp x?V8RAVX=)#K]}lE܆#A",S ɜ -rm2vdRX=Lsł&ҡ |p&=`̃$kIGUEhTмgk#ꥐ%(ҵZ^ia6YyK2}8 `4Q-s_/#4  4b˗V ty,bT*t:7赑ehYg}W^15E$2Jh%o˕+(@s*B6/,}#Gȉ8s/WzA`LA_I5MY&C("O7li P(!RW$5mT7q<  S]}L tpuhfPw!s! gd`d[0 ]y>!zdm16YkA)`pvYhj-[vO[oe1© K.Ҩ?3!:l64@3κv)HZ6t.*uEUs4*>.QإY#=KEHJa &n[KY%ew`vRyz?FT(h,, o} -[>ѫ+LM7zLMqc\.X:nJ&Ho}>2Lyo a\Ii9H1@Eɱ,kQ=m,wu2gY4C?ԡmj @Tc`6p.@Ma vcOB;쵉 .0ƖIR悠iؠq_/Ⱥ=tw@HOyl ]@fI* = ;5cr0ƅC`e?^? ]!`vJz9OsWxNPZN}r@iq=aѬѨ$&L4'vYUe*&r”oygk"l25@@ցfMG8J U îP6d^5`%&^ц.0#7=&׊ipYE6JXdv8USRhƌϭ5j |w0}ر:ChLjQ44`fpz\;Ki_)z}ч R |{6-n/0DPwE!Q>9t:#n`!"Eı[VD,o&E ƜP^Jԫbsc,^`~?dak,E&LNJ8SS986ۤgI]A QX'f뺕S͡ ~DM&C˅ Jϻ gW"Iq,o#Xau%?*ҴlHH0{J ``k U$ 8C޻p.GJc5+0bVyY &)aƽ(>9>%AvEx8A^'-=p(}_J>74~/K1vo.q6zսLߍ-j}*=r~A*{G>2}hN"KHJX\"j**C"Pz Ej'5j8ݫBUc>IԠYc@zFU+170[#\{Ecj^='b`GV< YyO#&PLk(Pܗ_I/# gwn9 lYY~#67i$ΜX7pld,L0MpO+t5IY}OxU Z\cAT QB{ZM¯pkr в)<"=vQ(g3_) :\5A=q ѶQfaNĦIDx"W*_""iikek`;& Nw. "{TY?S7J SEG4To__ammoMi9Dt53hHm!c8:.+’]A50yA6Y䥷8/.`W&ʡ 8ޟq.\P$@Q7Y[㥗PU]0E?9Y?8'NeOO~B*PXtNEPM7Q|zL#Z&/дu 4jwO+&7 d#AitT9  _1$ŒTZ35!sydPuhP8 Cw<#2 Ч'T`5T{KN=*@fFJ8Q *֢d=%~#ik}\yfchJ1^zb I )v׾nߢץ~P5 *d2h @pۻ dG>j. i#!\em ePp<3. 3wtT(07e.߸OनTll27QDC^!_aa Z.SnU'WgbDhj,9"_'c5^Hέg>??S+4` H Y.ulu?JR(EHejr{gBViټ'k-PٳNaI$b8h^Z k*^8[Q$VNu*3+V+Fm bɌ:6$Un47XVΊ)@VH_\o!Ρ J8<˼F(;L֎זXZb}zEAvi4܋f!G 藴_amS)o [gΐ˱6w7_gvTsSXFEhNp (.o:Ci~9q$^" 0ܮ#llXM2@\+ =8ଇvĒUDj@ @ bD *%H&yU{fuJs̸ տWt2h6K(֑{{n{?k8x<8ׯcYZ&.c;V򘞚/C贩 *&f&a36k2\H)M1wq5fblc*H#_gFNs~gG  C?I$6>&<ueF CY6| m63O>,z 8/f=FMFc> аA\AQs1GP w _`4xK[erf#y.a12#Z x{YKw-!%j}(8KNk~Glq2sςr_){ ˑN=H?sNL?85. 4QMzC2Ũd`04M:6 Ck}Nt2stۮShߢ@R>nR~z G׋NjeF5>)>^;[XV4*v^@&xBk4Ѓ{i4*1 A<ˏ~ĹsD1; ƛoɰM4JTx_MַX]%"M dw8NŸD?-O5MX%Ν{ bd6΃t#J-6kUu6kX6.u=:J a{,+NS$w\eQ?M"&\LRx܄-t11MIXq. -|0D?Y x~2'"չVcP4фa\}ALBZD^V]QmUU1yr^W{$LSA]YfSY@l $M)3-WhODB?%buJ%|>r`ud'$]|>,FY\$byd}w?`nέ&U^^s3"aj E!Vw}#ad$c>iK:}~|dzur&Php4NIM Wk3Q1Q%%1j#c 2ARn{Ձ,0@:D֪K Cy1mr"vm~τ\eNB346%qHy,*d4:1`OH xgoȝGdtT|fԼ~EZz+d/2+/Vj}{S) #Y { ,yj%<11q˴86w,> Au F9<ϿLJ̙9S,i<_|CN&X^?pOUy-HzN|MV_Z%E^|5` -H^U套H&y5Qet|s:}xWvis[Wwwwi'31&o`‡XEj&91 =I&*KC,W%ԻuUR-q}.˿u("=i$a(͍tU3 @9a`˂ O$`gHG: +=/GHE|s:tѽX;ӏ3w:"f(afG ըh9 azʢzEÿccQߣQEiX65XS̟&A0fHf>$9"K̰n&clŌ9{~knKTlk׸zRvd-uMepwsv~E]E^wc]+NXXш~^\v蔗?r]4rpJD4f@Dm~N)66/ȻY!>MS3 E!@CIDI8(W ې!p mXb̾v%Z#sP G5Y)~Xg7cXwQgP (Br!̃ AH@5&.!,>GCѷ!ځwE/|(se. -)'ۗpOn_h>;7>/iT JSV%ʢ.1DV] hTiܙ86,8Y/z%Y5(.=mM{Gr& c/u9N7 Vi6()M3;irnP3c_oL@0p6@".Wی}xg<9e˽^:gH*|*wv9'|sI^F3!娙{B5^oyb"mb  _&Et9aE5Gab, *Y?6+t*(03 -HC@b*իk1H|hTvA 0} .ޣp\4gu FQX˴*ZV^d@ձF(ЃQE9"{%2t=oz௏̭T) 0a04Tx0Q`ErwQUEmRraۜ (by0==jh?p(>ss\D>G<>1.H.J {R :MKH^$.@LRiHj<@6BD)p*3sd#$xġC(TM`D5؏3 I ?>HuCS߀6{dm`pd7ͣ%˨+$ϼY";꾡ZDf+_j#]FFHy.v| S~^!_tmV^L'q,><A0LQ7u3^/B/0l!21q tx@1¤bk! KޭfZ{HB5Z97H%)MkdX/1gA"IBQ<^$NDZ,?% fF 9N{?͡ - =tN|9GwYYm"9;G2)nďUpiz73A,\ߦjc|YT0/mq4jyIg19E^/]NЪfim_& jùoqD1$hܾeb\gC*OF)TrG@_yIݿ#vRH-cr97w00*E͘}*1Mי}X6Ać3d '٬1cs}o%P.PhP3 Ptm]t 5=mnEX/c7 mP3) Azpo" iXh?PBc!*q(11 ȫP֍D[e'UGiPBEuV9/q:NiUHj 8ܾ)yj()V#"PQ!n"ɨ:>GϔY'8T_+"~i/ma1󻯡ԶR 󟦉iR.{j< :ܥa Z=6kCUKR!vd¬mY.s_C Zl2%EŊ''OgkPSw^~s P sqt1):~ٛ€Ҁ[?| o~زzy9  ]1N}B[;iǡ >D -~}NO+nla-hNFCb?qQ( xIiPD7qb4"(c&f8s Cnj H&)J;MuwIgXWLGg\-EY4trذ]Vg`wr==)r lYV6w@CYFr t٨$rSHɆVIKaؤk]Mu$/> z9T8]?~Is<.)qS$T Wtm6(ttN.f$:FXjgȔ>!Pvñ ARSFՉ8+Cj? 
*{e Ӫ35σKHuU :(^(.BpNU(tMY.U1K.9-C:=`ʽ2Ee$v@DǢNkxUf(DɝS!T STx08 ~^udFա E*}*&gV{۔[h<ΐ\RH:6L.%8z֩˕W!5Y6%uXF yrm,S_d6%!O.U\Mx'$H6sGiJPTb]]59X]U@Mϸ%)d2F-'uH`!B22vG{I޷{J$<: f/:'f<˅ *xWgT]dڭ٧ޡՙ)~31!b: 12a &Z,ͣX6_HQ|n=d Pr.2}N>*"B6t@eQ[j_"GO!O"+=s"<~-dY7*0o˒%Ő⢢YQ^/jǺyY :4:{4u~:YNR+>ڨҳ4*X&C;|Z"Ҵ,":-#))6q8C*&]@4,Nb9)\fS_CU2- pD6˳ړɸ-i|4"̓& ^SpuKbZe5^8 T%M6Gn%y8Dah@v/(5B(s(\1jpb;xOswt;UQӳDT 4* ;u>&\@Ptt A$w9*+ DU𐯔d|#H؇" $~%ΐE]²9E3iS_'#E=c;'cYӲ{' *56M2Dy~}P35>)B% sD9LO坴?mnrȯ(1@`Yҳlf]:-R Qͺgd{1$ FWVuPEV'z]%~q5cnAer#zW\lM¯PnqdlCc[J[j6@u*&7+Dkr ?G_VN`AS8(*w;~bD%Iw^'2 u^ a (AބgЧiŏ)2|b⧋I(}^_g^FQ_(FljMtIVK863E+%EV0rˑN51)1}[k^ւ|Ռ3:Zf۲eh0Jgc:w8c{ˉHT=YEQ 2Asw]!݈.nW%! W`R#C.4!"Ċ}FmObH'˽2*?{H/٘AH`rWIkR.'3WMJ{IDM Pآ^]>ϦYg_ADCifX9TCc!e _lmMa܎__B[HbJ7 0ĺPm{%[>"6 CF i9NBi;+QsS96"Y={*w86Md4| 4?l#IH8TCӻ ' *=+enP%~X2q4pl wWdBLEN*H{zGߗp#26 Xm*?ė=?vOeÚY:tX}^Sdd"L~qIozevDD(Xu6IFOOռ5BCO_)DPWٸ,];x:If05,e!*xbͣ\ޡlC@Afjps߾֘Q-XmPGА%2kO~\n~T,LRG[0ecBgRcy)o㜄fbfQ+#=WZiknms`j h\eѤԧf\@H_;2:w+c܊0s}?kiEIN\$mV[`oRgr<Mt&񭣜˩)ԭ o#l8@B5=bz沬٨0* eqSOS]2ޅ#LP?^ۍIA89)y u7[>^z>uƖ+xlbxct B3X;/gi"9 v7hUY\C7ȉWH#>Aś/Эs ((}a{Tvd+ɠK2˹XXogNdhb<W_ާ:EiAP s.~H!N(= h"qvXοx=i{@G, wkT,XCuKgXpP*|@{?z0|H,&=nѮVf| 4kTGR#\>8n}W7Ӷ1VwK/t7HIfmlvKsOsɯt.t0|U6 +t_D.KuTvIDOeXNqD]?¿k/CA?Afax@uǯs-E&;-YLЛ.os X`xU.1Wm%DzXyK+hYg/NQf--V{nP?4h jqwhK{'|t*~U!h{}R%YܹJdG{/l^}~v#.e%v% 4auIrJR\w),K i_z܏NO>Q!^⠣ Y)]ݰtsBqƌ?C/Q_rKwUdCsuq; \Pīq8Ǵ~AUߞI+gklQTE&R%9u793 8WQ WpLTl[03qQp.&G4~D:u d8uuW֕ux֯Y>{de[m%dy3yǗ=Fu7.Tx[!jOs[qYsG3k?~s|_NgA9I9ɋ, IDATwH,e0U̒Z'tE-48me=#ym-ͬ*Zܬ;]Z *.c+k-bKwrx"PT|Ls'^/Yjqb~?"&m*.r@.WHqK#oSq6u_"3uQH /`RD]""qw֑P) x'R~YpWպ\#8za{@EMIghX;buC;N2n >*8|cc}g??͏Vė6|D=?JCDC>/4'5^ؽǽfDmnx=6"ZF'ɩU(Ȕh%*>KB-͇JfNO{!/ock i %ҋ FwveDm"WQ -}d>5dt'e$xNYaqk-g<]N`>*x҇Ww~W3BEnl`Ly ӯ Na"w)%GlSWRpFXYK<1^4Ku\r(w(/ͤ+ -xZ(](2B }VPl ֳ(l}y!1GPp7.:MZF;~WZw^{T̽\cO/Dz6uq#5ѬR\D9V|:yP/q_|;~oGQAA>ED<<2ĕ7/ ᄎu7,_ٻr EJjݰ{CnrQAQW\M֋CŢˡK,9OzP6{ݚm|qw5:yz`4 hg_!G-揕UI;+%V] 3:R ko0g'KL8V&S`<8v-1kߡ3bebplnw):cW 蟱⿿σ z}ݹ}#*;jm#̹ͥ.- :rC=Y)ŕ7N.vw "kNy`W5~E~&cbJ9O̒p\2G~ ~S~n`8¯ @0o/H Wadla~1}3 owssp7xw!;s#;}5ɕ eJ4:NK9'4*ȱf|c2$0KQ\5Yʿ*;aY=14@{l?mP0t$cZ`6bg Z\1,%"ayV^enoF_TtL݃Y_ǹ`=@O\M$P Xn @J!P}?Ny*.IEG þ_$RTBZ/kI4"d\ dziĈuZ5%x Z2$ҝ+| 9?b0;e˻ܔu/w-.?Q=gJ_3qmo*qxF$B.7lIAM0xDΣ0| !#:>e&.*RωCM1a@=N= n/eU3ůOϏ FD|"E?Ŭ {9IFq;̹+dfKXza7>C;LƸO*e.-(M0J:][S(% Xn7:rĿ1lӦAE&.7Tŕ1_EKSjX]Nc˛9N9ʙŢ-I28{84:kf%_Mm'FBlS.7VيhYt^p?;Q@(Ci.P'CI:Ae2zL=@xu QB2UE)6F:1񪌫)S싄C%`]5OEWp4S RO4M]|#Ka[r̥A{6" y,w^3IXK*34I숱lsF4+ /Q̞Uu`RʢzVVP꺘@;3|<@g"*m"b]ER-Cvg c;"d m"5 6=rsEC%RO,%azE~C4UpÚgѩXl_EV pTyX%"u+NyWhU;ˇKz\2S,0bq2Y ܣ\D&A!$6X[AG"~C鿈ýspl!0=a_^2,ad"Ɛ(8wF*`=#|ޱ[qlƷ_C~{s1JF<<"TyuG ٥?Qӿ}~%ֲRʴL&^biOl }]/FEY;4t,K]dj fEf̸̦`o/lZJRMoɮD%&c!!{ FxVVb{p&A _(R :vX C"]7Q)3F<$"4ZF?ңݥFv/1Gt$G6c15MNLCðxM;4Tbnvd "U`~@Rm.yvfpUC7^u+ oMa°luBi]x{-r)(2: - M'>R ƯPR*A%d 뭲@h7dGE.ؗQv4'RO-e) Zl4({XuI ɻt Ȼ)5CXP+e;U**}f"IddS@B)9O.E`a)oG*AQC[|[A;/Lː[zENa>_Eb~ g9C&qn"Z̪PPu.BL 81nuԽX|a6!%ˆOH#":Ӂ/Y*}{l^ |+KM~b^Etq76ro[6eUv4MO%!mrm]+./XBT""~/"x(PtvH0ʨV,WrsD_Ñgz>$aTũC䚂z'j-p&o8PQUbj$%0hS0bCzr-_èވA\aʴ}~ʃxQbeQ=@[ED_+`$ @"Hb9 eC`L$Rm;i}>`fIE9?j8@[J4q&#`DCA%OWǬ2q{&LLNb%j^7~9&r9Xp&]6vdXnè51@B>&@BԪrFrBN|וe: 9hU xu$p2saD%̚ uB@ ;<ENb$0t}iVhkhgg=hW#D*iQ-2/ҪRQ'L&LJ,Y[ht0vVay_ns=KʳDcos{l7xC `TEbVrG#+Ħk:˜\.[x~Xx~J˩"Nu%F1װ n"G8Ѕ$h!ƙe 킮_`:~95;o m {Mh#gQG;_'ζ3EDn"xX4 xЫr:4\YJ5uOz%ٷSG $mz~R$d~>|qM.0,yk*k[c)ᑉvFpM&cě8*JpD8 !(im,KL`[a,BFv 1dbr@.G8d0w`Db\qqtW1ÀǓQt54zg<8d|ls1eGȒI.}roΊ'z0 z1L$&FCPZ^P׭z8T3dwP_2U>QG1N3|woMhl!O"ؗg 53]ZюN+ "8`WGc.Rm$y̘0Z&hG!REc.ܐC$)%~6iuQd>@VPm˩ŧJ9<+M-55/oré`g.9-I.dT7A$Oq$:_X<ً G~ |&t_%ae|wu [#w "PL<"AH=͖4X`$tі:t~!NiDKkT;xy#'L5ٗQ|<&W IFIѺo-R`õtijq_F:_;2(np$zMzu078@?;kX 
ZPOCoaNo#Dq !HP@?$ wc&x 1)GB=-hٌr? FD<.rMcz/  ͨN𶥂(PRѷlȏMS%g1Gkq8!Drڞdb1 Nvq8Σ{k`qvn,tşKנܠb`}=cY,z(U/GE5\G4#x7UDBN!=(>-(}|=nsW b :q]L ¹ BֈߍY/ "ˉí!>I2&iBǝF{1tNY6IX$!a<*"V@Â4f_NԄQ+e18"Kw{o;\cy7%H#">I*CH#<6vlĊ 1!@Rg51a@C4P!ؿ>⃈T0"=FitI}RUlpʽ5Nẖ '*=wt9}3xcܣUQӓY,L4er?>V9='޿XJ3{@CDE5Q >~ n 5o"gOt]rEq벐PAdu@PmfZf$Y9]R)U{ؗQ0M%?ɋ`D'O)rt*;25t5^1\Ntj K򲧰 wEtCr CEkTVR'8͍-%-hi<ܤ2iؚiMŊ{ԎGG/wvv x لOќm1*8 j1Jv7t)F9¤-CUD0F:!d!%xk"l_vɓ s,hăD*PhS`^M :!v,է}7wm~PPq+if܈؛\QYz?um{ߎK-˨2hRmҬBZ:8;[ٿcy nJ|xb0͈MWΣ" MbIG]c ÆWgм&RA0U-}>̤ .ye uዄ#cmL\8z O{r,ӵ)eh/|* D|aT0"g:l9iɿqWFH z\Y)ynJ$ j+\Uģq#9m1] + _Q[$%.Dq%`@o&{/ٙ3֬"v y ˊXo iJ"耯0@Tظ r #[IAײG}e $FD_oúeѵ3d u@`DC!G@`[7E7Vr7$c%MOQ-T)qAfB4ZBAkL%=)Ņ-. G s<${Gis39ӎӄėqj[-vp K?Q*)&> 1I!GZ0NX~h]jRlB{ Q|D"GYF1hxM ŭo3XdOUحˆWFָ-3: fq~dxq]5c=?*z{ /.cd\cCӿJy6kfZ"&o, F`'F耊$*$ˤCJEsH)%5Cǹn qmNliDă?ѱ`1_6~ -Тv/we?*C3 "DA]nR9ڡJ:4@RTKX~cg qt]BJ /?wxD\`u5vA'~7G}PiD]=4;dBd3D"[ĎvL&>2D4G5~ 0jx}kM~O}=_*r]CzQdM!=&UNx%^LmPiS/qlruz*jU~Q|R1#Xn&;pr3^]= i YJN{6fvǞh!gM]G琳ֳ 6|9`"(x-y B8BH#'mx< .I6~% LB*Hkq Yӿ3Wd2ӿxf?U$,E"ZݷN[l,:'Pe(Ԥr)RZ1mIÒY ֦6_1Ua*AFDzD^+g)];G$2&M )bKuC%/K8B4 DBQ+"ȄH&B wCh_|ͪfU*ñ׈ZYWa2|{?_)u\˭1k+Z%\97x*Maӻlp]֎8VRdrk;u,grnߺݎx$A _Ѻċ챹'``}LO @8"JQ'qa_ A"VB;_EPp䑒]>+P VD)t]d)MBX.RjD/.FDO>G}]nwi$ztD$5vTH5Tnw%!"[g$cYҜ*gTyX_C)#x;4A. _bc_Gю#nxL< gP3YFD*v.4Դf^VVHFEt{$D$}cY]]rqbb%Ud:{ \PhRX`2_d!s|8cf"kech'81.x75Lm@,enp9$SH,kl\9NY$h%˺XVlǷxm8 OlXl{&/v}1}3Mf,3Icg::L;$NlYWK)"Y*V.T'lRy%JSv^yS.zAeC#pks)`,z>i'O@5|⼠ |dIEjs/hdܢ8vc469JLk# XG8]fC{EsUVaR~Qݓ"mm{DݲE~~ij@7L)%`"{c'x&?qke.,qQдO3b$Nw[wq-QAn"PZң=d;ȑ2>̅˼Yg%&*3c8#e]b'|D ǮKkuVrV4=YhSbUv.NZgH벥.N kcu(F"cWMK;G)!YeIJ{ma.prSM1ק,Pѹ3Yx=?⟂pwGKeb]P- orⷞ]z[؅ w~mA$Izo2k>/~* ^H/孍0΃w$=}닯| A}<A`h46|B$iuV{a,}_?D( ۞MAAA؉AͼD ^YnE"An:ίz_!nÄM Q^ njwpDP?|ۋUKn9WF_Fne96lQh9fڣOٻ4wٳ)(ZLC墕;.p/ӳ\;=fFg, {wu)({=;)%,2ۢna[" ݔ2+J'vaۭ>HAAs"2vcIR.Gˊ1kW;[Ib~gOv RP4o<vN*]ilVxz _"Eݨ_'SnD ž|7΋x%VNȪCb։K)ԑy M *ZP~G{HAA[vC7ٮ+.lL̋9O@ݻj/uξwlJ5ihș^SQm@p؉Ɛ%@ͧ86%d#\H|*c։CY%|B6^\NUg'l{aov)({{gƴd:61c:)CΆR]jk&"w潊 hm}k&ΪH 9J}q1׽;Gjp"s9 Ytf-ΨLەĝi:փ֠)o?N1hȿóȘx*۽"rJX֯%SJz Q,K87Nx_([Cz݈vOB:.ӌ uR%!qROg4y1" 7DK 1!gr/Tn&rX{ħ؃(f}2a|fl8Zz"RP\W4{,>oܯR Yi”ʸ]; ӍXO4MM# 0#p7iB-L%˖KrR,~]弚<.q]qF RP2 c)~FTl[PRTDf."Lg٭,z;1y5>$̔!0M2!RmRtenZZCGYyLO3vFR ^޻jobF1U\U c?B~~;$^D 'g'Ms2mf-P8#BjBWBOCU APz* 'tل-J y3)nPHe'a }u4[??gfgIaVafR?)= +4/}3Fޜ:=Tݲeg9던oLU!@vТ!1eWz+j?THqB.Y)rΟӖF2Nmc 73~M #3~ZZ$fmF&ʰ~s1/RJ}M>D c'?Kh[@"RPn I a;q`H#(┾`Te/XJNHYQ(d%4 =>"FzN>7}ޖḓ^׍o:P,ġOueS4{l{:z>jrҽ9 1W ,c7$ʘLFa|E.2L 䳚a!f*:3WMlhG$LFBSpLmNHqCVX-8&} [xD!)7- M~ϐ)WBfo 66ZViMz A\Ț3JEp `r(as#daD$\rAg<$Ԙ1TzSOZ>h K1BIg>Sڷ򌻴aAC~∰zz2[rsRtVff,{*KrW\v0,A(p?FqF%,"6YLSf+|3S;[7G#b U˛@:tUHqS|4b \ iK2& c[<8Ո W94"ԙש2VL'R{FD!K c) ci ~"ݏc%bsmIe&Ø9r/h(C֙rm"9iP:10"P)%ƥk*U 0"rX%2f}%xPu"2|ȣ1BNƌX>PL,4_jW4u6 ƶ1,OhL 41*0}K`S1znz,7ZFbYsEdaT1'jlǨLbdBO)##Or و="e ( .q!ìDB/ƑPbu+C+d>?78%<.;I3فb e8"Lx]#c+ =lL{?T[>eaT(.׬*I>Q}7e,kIґT9ԞF€Fr(1X;HAAHv*b =GڭFn4,+1!mh&P&K q<󶅕<~*cY>-Od f;BtNZMƬ\qRLH2qI]vߡXl5,rRPе1A@lN͚5IV"'Z)d3vfL2^֛CGw{‡!RP>y@ҖXMh8j$v+ )^!o@:lcR)chT}u#2VBF?dcD2m006"ـ3kTR7a[e:ŋ|RיO#G,F43ah{ĜkwlzG)NJT[zd3Њr`*k*w‡!RP>zɍr<kg.WD6C"6:L"g1 d)X(,kL\U#;yw]V+E홝[ Ss6R܈1zL3+&k9wW.R&p#s5@JLoD3r:JEfAob.W HGDC^l>axL7΋U{iЭS>ʳgQ-bG?)Mag D &CAF/`|Y?hMzG)(rL:OpnLIh1m =f\SҌXL=msȥ:7tDob@OIk1!ok,%FQ+Jl:# \0))vu1MDn$#ţi^h府>N&@Q =_D ‡.$b-P" & N{d2 qL/g#VCe UP|OzZ aH[R-N; әId)Ô>8;3& "cuid0rsd&๼)\(d2 `ktѿұ*u(g 4z!Ic>I*>WJ N9QwJ^DS<=>i :A(p>@s>+J vºK(lSPHhJAhzB[*z,ԚG)0SHU&=#nl+l4Ѩ`:cPr`^g6fͦͤudaܐYe?dk%vQFor DMȧSdcm(ƭ0 e/ĊN |Hfsڋ?2`C*c8:1!5 ˠܣ13J>)G\9?MqJ?ʈ(UU=T-ݪu *H סqJI  Dc"՘Mz9~|]b'>YQ&Wct/5KZ 8~dpYǹIa⿧R7)(ްr t@E‌pH$cbZ)QqFxBeYqA$K":ÄL-BL[cRB ͦ:S1ap,2"x BBGPRB+v ݧ3# ƄJiЭk Y\Ο oz&~5K+ q),c ud& 
qo0J]p{/k~^araQz|'ƑPRfHR ܚi ),Hh)ߑ.3֙iuC #F SsxJ>$Q!˧z$ %әInt6kjw`,~ l&̠C2if`V+ &ʒ\(+i6 f!vSgf ;摄 `=f!X)$4@Ɛ41N@CŶ#dBgmDɝU_"cq\e\e24:/l8Q`ȫ+1M.Hzk^58xhY`;RTGNw8"ݟ2Ysm|htܹ>k!)}^frk; sHAAH2B6!-Apl_@C'ot4ԘAriV [Xt.+TNe9#cR1d je@Jx\`(hqЃ-957%DV@c|]2t(F#+3vNζ$51F"Pq.bg%]:"1 Vl= O3Z1[1-̈@!ZT/ӒKh'#Zg꒪y$&2- } `F6ulDLWjzI{wΰg}g@E!vk&ud*2Ne"A{ŠD`z)(pė L8yF@!"8Y er2i6nȋ;ZO|Gmz6dzq~rSp8W2"L;.=!OJ7cztBdl.ud5ydz=C% aeOqhʳF4&7/nc' V#b)VYYV)LxT3թ3*a /tvȘ6 Yc:i3L(Y>MƔD&fШFt )N:CPqڕ_ R;Ŭ:hNЉ&ib[DroKأ3oUXP_c&eDHAAT&tqx?۹He3@D]gVes2Q_I,Ș)^ŴlzT[m3 \1BL3b%PpRPY28$ Jbr]z%֛~kkGfA[a$Ȕ"c&ΊvF :^⽨YTGSD Q`QՉGp6O 9>kpbӔɪLL3̪T*. 1+c#ޒPVAQ%r1yVB$Y)m?Pf"-j JБʃ=YJAjlYda3O㬱ox\[Ih }Y.qVe"d}.\.IJ(wPM)".&RP ;SFHәKqT&U&b e>A`uiYl)((6٫z{UXj)s:=핥eusdiu2 Ktuy$S}nO؉en]{c-K/(AO@WGI8VE?4s oZ$iMs(gKb:t?)(˭Zx-z2}90&=KG/2զ82P [cTrsԿK2x b$1سL=Uí3XF/X(B!Sƨ`cM9Ybb" ܱaQbR{/n3]yBo4@okK:EItiSeJP"J44XSt/vwg'O Rdm@۷"Y!n3?M8zkQ$ތ_)[ns0)b1z?aS8Bfs N>KzA1+t/sEo=+Rp)(e*qVYǯ(Fq6/a0\Ǭ$лq!mnIg9X>$fDAUT A!& 匝f&$-s05Uo8jzǖra ׉\L1a5%"p)(}v Y87q7HB9KDV3e&ţQv%™D?L^.^뗕}F {[Gz%o㷈==\1.\}޵x쾅OE )^4(%CZسY 7}m*?b9?ahqy<d5ՄBzKl_$1JPuU1#oo| G&IGຒHCU^%%7y拘U:cB+YUnwR5{᯾2_$G#휋;[cM%NqOl˨Q5YQ^1mcUZd>ģi˻j(U~Tӎ5{]Q;mߒFn"Ipp묽@[Vl%)KtErTrXN_SZUrsTlxl_crL&]KJCsJ @4 $^ԟHS';M&a9}g$X5 GQ-EEڷPؗB ~/G_ 7 JAv[gN')4F Sp/qGد7rV{@{&T#h@(c5$՝wM^ _;Ǘwb y:70*[)m(+,I8[5qLP>E(y>cTi%;= lf÷M9(cg(iXp˿O+$A5c↥61'G`bϠ),P{lB5`MlسZZQ 'Eghx$ƂpiŜƳ(W' goMzWh9 G1ʝ+ժdfn^=Nyw>ҽB)n3<$%iˊe[v86Kjg](vk-Ev׋ٗ.6 6PC!ˋ&FĮD؊(EQ|9.03IDATcIJ:9~t~z3v'tU""XV@ :[TioѿlWӸun "'kAM8C}қo 2PCg{9nIDLP绅ͨ['Jp Rsټgΰ<YBOvʻI: x|t-݁3Ic׋_~cL6<""w%:ɧH!uU .n[7RT]&43~Q{pvatdh m_PDɍ<5S5“)7J$]X6$"Afo(%"8:ۈpk5[& ARI}Ab q{-[)]fQId,* .(D_3[#b"W z:' Hg/ pD}`AD:N+nh& #߲?6Addn|'ǭXQc m }Ӄ. Xu2_]q>_:@.2G+Vz˶dwټFy2G4V񚔯VɵX89 lhSPDg??EJ9 a҇kHe R9۝Ҙ+S5{X)~$?)N8]"ŮM4RPDdN&q5ienznB V6x@;F] Ip_~HM4b* l0e20,ҭyYb;LL0vp+kgCT>ē4VS9+ bӓ.} 迀F̣5"2نۤ*Eeϱ1/]!E|m W]'IJ;q+>;]{s.7I2"2RI$gY]^nĞ @0I0[Ŏ$.B g}1{^aMS]xK^]II"9uQb7 n"վlQȞ&?f4*oEɱy}wʈ"2Y"G3}%9Bl'کlwvDh;M؃zu󖵍oSmR{Md2K:u K}PqX L=F^QFMb;RY`ckJqWr&N:94]I$Ed赋K8"^}͛|_}'n` K@'ח$g D&)9dPD&S ps& MQ{h;^YlD|GojL<5xvf k'.ph.4l|{^V ;F8t e"2"9>d¯۝2&,:Hd;kaQ+`prt6)]&#*Kԗ!4/("Co|Ѹ];DIq3X6+/P;t;x xM* ˸uZk$t~"T(Fm.͛ RPDFAj?#EA ҮZQZe$g`DCeE{+eZ2pQ>Lˬ@ϣ^{qCp/5s1Wgc&͛l{VϦxױc RPDFJHioƎP| #  TZ,}P_!eSlv˔G[NpAsiDTDFInh|ZU]phWL:H0 N>LxM45Yy!u˦k Z20 d*E@fHؓ6W @0E™AYy)Ed{7/xM| *QN{ ԰c a:=wuY7ロ RPDF?ڿ}?t7xM$עYkLnez/(Y ܆^%j,H٘럋zRlǜX ?=+z4u#K4e5RLJAYcp^Qp [DoPz#?u4۝ % rgxQ]EҏLENޠAf6+nbWSQْ?bKlHJAev/3!8F:&-bfnt#7hI)Y,e՟\hDTDF_$G=OaD=)|yeUtݳzx=3b$R V:@0q%m|z prz0@6N:0(EdKp4_ Z?kuGpr4|F~Aci^PDar_!"<נt«ԗڕNmgVYyF:;4B )EdkHfUVkRY\6JPl&"ɑ9e57rm4 XvZWVo,/_K,yA٢Y):{菍hSӪEBkdAUF}A٢Iҏf9V&w:J*4l2 1u|2 YMV^(ٮK,Ã. 
RPDH}?k2v8}$)sce3zޠ|UVDS@,,[vqekN> yv..wsO.|1ƭWoӇɪ=R_PD>G;`iDAP ' RPDD̥s)ED\JA1RPDD̥s)ED\JA1RPDD̥s)ED\JA1RPDDt(7"ё{9;s?3@7~FhDTDD̥s)ED\JA1RPDD̥s)ED\JA1hjSP_PDD̥s)ED\JA1RPDD̥s)ED\JA1RPDD̥s)ED\JA1RgN]ScgRq^}nGZe42JAC\85{ǮN\93zNi}@߹ݸ.@y|{هLO'?>yչ3G=>ίEve2 nuNN;zf#":\|'RkÈ?d[gzKO׎}TՋGog0(5RkÈ?dz>w us\{p`<(jDVɋ2lC72ZfՏm8g05r?#Ҋ[ceչ;WE.F1asr?#Њ[ާ_p}&?o|#sW/5]^?Z)J+z1d|||sٷIZm42 3sg8=|WvyN|8Loqj-("_tDDDC}A1RPDD̥s)ED\JA1RPDD̥s)ED\JA1RPDD̥s)ED\JA1RPDD̥s)ED\JA1RPDD̥s)ED\JA1RPDD̥s)ED\JA1RPDD̥g|cguԅJd zAAD> |'=3{W=GgNN;yqg>?RPdAeB)(2Rnߙ3WN+E>JA3G5(yQ2^.V"©Ӝ?ɹ!!GS N䏟gΟ܉m F)(2";q+b?s$O@9(P ?8}vrP5"ֲPN~|kąST""b.""b.K)(""R ""b.K)(""R ""b.K)(""R ""b.K)(""R ""b.K)(""R ""b.aIENDB`uwot/vignettes/mnist-r.png0000644000176200001440000016445114730166740015402 0ustar liggesusersPNG  IHDRWU|G pHYsod IDATx}w7bU,>fw=ۖc.ox߼R?gUf9z{~x껽ӛ|t}]?B [w'OV!fm}L'*F qVob{v)3Dv_c~p߭{gǓ>g#b^zՇ_Ď=Lӛ,{[OMLx>ܙ3?}|3o%NSO}w3IA!>?rx72.];yr{Cp._1}Z[mbԩsl&ft{BC;WOO|{]|.!IA!6ݚ©}[!qR%;Kͽy[=u =ڝ O^6jѹ3S'6Νy{VqO][9xkҙޱ ۥkgV z4խV0>;6SyM?枧NlfƵB|jBUϜ}V;;N ywm>w/~Nuc'}juݷO5n ᦛ_qtysg%Žv Bo'Oo2!B1$B .IA!KB .B .IA!KRP!B1$B .IA!KRP!B1$B .IA!KRP!B1$B .IA!KRP!B1$B .IA!KRP!B1$B .IA!KRP!B1$B .IA!K|v<N7A!ĝzyJAvSBvJQ!KRP!B1$B .IA!KRP!B1$B .IA!KRP!B1$B .IA!KRP!> 9gQ醈b!C.`}dH tBlWYX^&B9NoI !gش`M2@r;2Ft[7$b,"m=l`F[{&aF MB핧wvU,6.,g(+T$5NB ?KBrTg_rb 6fMmenڦs.pM8S3 J$b,"Pء3LӣBbSW.P شsT1f5L{Ǚ'oI !6v9/gbnkwZ{TݥGm`Z8@ ,4- 0 .Y5/&8K7E@iКbMbDe{r:5W΁=e@\OtߑZP!ѭoU-z7X{]?</P%_䊊үMlA:@ h IA!Fyj1i5pV1J|ESCJnN5Ahqz#TM1qJUNP]ժUorv>$beLZJխT4a.!b9"Y ]vVGS!"Wt /Re"+UBԟBIK/~z]2Ɔ[&luV,c %WǗP'=F8kDG*Xb 0$c]Rot#Ǵs}^ ~|&΢Q(FXIk^鱁 h/4RUhfaS{oE& ޯBjAqkwٙ<42~/KA8 0fw7]!xq4VI2`Ӟ 9A5Ga"V6IJF ?!IAqה ǵ?8s..X~$ a]r8 O0 bmV("u , obȾk;ϞA~pLyZ[ye}` %=#CHy_og5;F:6>]Gx?e Bˤ?VGs.b1⻈˔c(  _ LjQǤ%)IH @\?y?kg[YH Of1 WvP#gv=)P>1V]ZS~qtg?Q '8\he \gy:op!F'1;eW!)([fF#@?x]h;dj/^I9(>G xm_M+N §9!>o_c6ZS*QbKlT(e)x#h{"nQ==qh@äx0G$]rao0<9`٢DAkAs~˹iޖҜ֭1b4tO6Gbg\ 5qG ? TAMi:/aM!&([s missTX/q=6E27H/%?Z&7XO2I6Tk[_i6(Ij#ȃ5*a9cy@ V<,6Z%Lx0%~|89* U~|ejuQbE~2-Qh< 娦1U /-Ih!_J8Au&m@C&^溋ۡCoe?p=LRP}(e!t{8mP$2/vBQlhZޣav; *F@2O20_0cUtZZ5Kaۥ phٸƣL\%RaÌSj+qA8oҍ >CWc }D6:q:K&J$I|t{yMr՝nwt6hcOO$^Xvy+rT?g6}CMZ*Q'HR ojUxv-]΁/0>BlB`*79t-K4[%6b B+TBIt'ޥ]ǼL.O&ŋ|IjA-2O]B>rWC@ x1VygُΟjV $'L=n9_1q22e K| Å)+x 8 Pnxtt;Vy7:Hv w C|XH@}FC+E " Gfb䯒}d3<հ'jJi_` Dj4 m:cijD_;1eXܟh&F^GnfΒw_?^xkueüKǗIh+ W󲳚G_J<.NMFcAP1NƪKD0unwqΗElg#ﭭP9ssVX}<1&Y+Ktu8)"' jˏ/ !Ry #QHgNvγL #h {av̙=Ͻ-Oٹ_].wj}"֎_bFf_'`:\M. ➑0G^^X6`^) *zKF}MM+/ih):K\wpi EE*m"EJeY3fsM5Oq?C=hE3qQIDgWNׂ&yT)[߿Ɜ?>ߙ7/ywfOof+ejM)PڟQ4p);͛ϺAtw+)]V vF)%ʊuzQYѥD޷>I:ӤרR(Qnڙդ{ 函yÜ< $=,;!WvxtZ.ͫbC;u.2#.`U/y˓,0@b<: vpr&7*bCzqk|8UYK[8c%JDvq"٦]_ZP8vw ̂buj3K2nӮ`ϳ^/+ ,XԔ 7YGfT]\Ge(a*Bf +!60/fNz˳ZPXi#gmz\vc3n>$ ?c$ Zߤn(%l1c֫Dh, (3֟bB64 X\ %ذ)>VgB0ߏyՁdοQX }dX.m6YZ6,ǾGYjk-0 jL4|GTqZ_a\*!l g{RLm~?>aXٿpۛ3F]m_e*"hhQ\?>^kPV n`樆P0NnVkwKa8Yo@q-sCJŧLSvKo>3vԂ@?RRWyVh*:n,R1zq5z!bw ; u蟛~"Sd)9YmaLd(= =o!{Wjtvie Xmw{d'UJ+ .)mz0/5x>8@x^'PqK6$!u@1V BͲvokw˾%ak/lD(}Di2\7k*U=b}Y^J8g—Jt(N$@y$N+9Be!Fh;&]8IJoxyl&V$C(DUf23!}!$6<.颩MB*$SY{*Yg;-'5wָGUxp% ?~`ZF|] \?삪mk6Zyv%*4@)̇(?}vrpJٷ@lHth \x2MM`V-휵f6vmnP ߠq@ؿJ; %J\Xw"]na!jiN^)kC3kjZWEٓj)'We,Nu:ӡ` xٚy7+T@y?Uw!b t)YDKBTG0l~~ ʬ؛aC&AUj$>ϿmeXG~ 9X5(M%0 qjugqg #3khtl+LmB "dau4lMQc1[j &|7ozCX`"+.4i $!`ccQC*BEw&TAF>Bpa#6sWX۟14[ NKUQb3yCRPll\$CaFn px:meb:KUF}vCHO0&h. \_ h4 s`"Tκtb&ZL# upѺ$C3Z7K[9<Н+sZX - hw% kMͧz@=~KX=<%䋢lJOהl-N4/9K1KͱXܦ 6)dd!gA_ӡGVk-a"UI {, =ĨZ:ɤaFp )(Wrx`N Ӵg-?^).S,K%7ʶjNpmЃ*$ P(0Ab H $aFJwHY#>ZV'_Xϗ^ػ&c^=QFNd&U]JQUYyU16/`@uҘ75mh/E4+Vp"tA υ(1(&*3D_j[LNG`e?m:m:$1M=h,R!6DIF~-$6FA~t iPcoPdBájl+-pP=,6uX4ۤ 6eUI?) 
0<mkb?QEeRRB 8$">zl7Oorȑ,!Vs$OFg/uhW+5:J͢r8BP"½W3~- Q.lo+gהH(R7C?mD2(G[TU#Ẹ:FFm{H"ޟVӃ4$,} E6L1c@Ŧd2#ц=xgxkl <{3\Es'׏3ChMpaB @ נHpXy5e~@M΋N4)\MD4_@.EBm^C(n\g*;6°Nݼkt?SJk7έSc=؟T14`,.RzMEOfخ&n" cVs?T6v|p9R][,g/8® ,5h`8X ޛa&D Fv["WR[3D2Vbsg~Џ%)(WT#tHNb_٦bT؛!~Qv==K ٓ+& l80^??vA!6@!Yrq:aPhA2D-@hϭ'4KDӥY2][N$k{"bLk_bu䭚zF&/ߺW|W5Zx ,:$0Afzo*C$_Jˤ5$J2XYWX-mPD@R`4®[2Ο]J/y8Ak0'IT5a"Y-q՛Ca7BIbSSE&_LowӁ%v_ U6Q4۬THjLZu-ݳҝ`$DaQa_ JwKpAok[`BFP=ABZ;? Y&<9 ?nFlHBkuh7wiLD)/f u<(^w*MPľu6&I]%JRS*6o,* i_ ܨ{B6J 3I;3+Pa`P LC0ix1&iBvi&5=C:'9um]6} =amѤNLGHA1FC;wK8˔Wo8nc5x]׌bcZj4G+c!5IR'1$b{k̮c:MHzC)0fBi\k!.Q_21V*,Za")GL8 0m aΜWh` _< #v0 V)X4]JdR({CZ Hաi_U?΅b.62~v}Lƒ`M9lX8i Vf`kǘ"}K;9ľ/_ ɼ%)(;mlyUFBWPuCC B;ECNj)uE( !І:<qH@PYEI~NloTP0[[|{*ZM]Qcod>V|q^)OMsmFHPjcFO71Ԋ$[|OܕנB Q-!C"\9xR0{-O_ݤhXB!]:DF3 p3d1 q.Q0{ fѴ*B+子  (pw^ iĂQPkxLT>Q(xj6LuQrR**ݜ۷qZԻ}GҤes$`Ԓ12bh~4SNY7qdPO~l(5 nؐX@K)DJI?1IK2H 9?I+ n`#ch?֙3i;7ݣe) ZS;[^EJd l~VeOQ _:!![ӱuYɟ$UGO r|vK SaZ6֎u3į\\eA<e!L,c6FOh w!%l B = U1_CG@TfV8CVnݦ:MD ( n4_M0;kޤ(GS V(mwP?=8654{{@1Qw;O.LgDtiuE8xL,p33.7n{oJaZ"&`Eb+G.#Kl72ORpmRP8#_l%|W(E %ѕ0v`i@!H}dSqH57vϖ[x:^ *A猦6sP&MӃk~-:gQet,\zxK>+1gL& %(,e_&l"<0Lffm2ډ/0h4xU_ӧ_bh(6V;euc\,׮"4B2"Qh!Kt|V$h}6l7XxRo!la(l{jy1`/"?ϼHtlt\D_/Toxj=9jVq+0֕M.ؾg'Fd\̟#^1JBጲJMP ) I& ࿔W[B13``8{Ǖ+˜:5z%4$iΟ1khxt ev!14xL:Ν=% L#fFv ( {!L}Bg%zz/#ɱdapq8[5Bzw$Qo7]2ImB׎2#&էGTݐ}"p?*[iB-LT_op"6˛oS-zeZX.aYy#2S!f1MbTg4&dC.nwP$Ne\6|8BCHqew\g7Yu|:?!7w*6dMa>!@ `-B L;;PY č d4Fo'-08 U4ٴ{Xm '\}aaQӸlt2uCDS]l= =_hH{2Q_Bupդ벾NpH09tG23DhTJh~l8B6K$B.[?rLoB uS#gJn;u'N*tQ[u1;šy`0b,Ղz:Pj'V F3u>WI|CO4688"5lٷtI8"-(Јh/(p¿ \JB* Q.8l"&AIB"KH1s&.ydO7pYR)t]4y@v>mBhN$jC}0ШJ_Ge*QS9!qQ hҐ$Ұ}BHIfc"& S 26dE81&$u!=/7oe Fش8D>DhםVSp2ʹeh6ʈuY{USFl u7lϖв{x؟pEz_'%\&R)Νcj~ϥh1MΎ t(i4rñX_WLOMF]dC rTS2mԺ([M]6!! Slߺ{vK|sZzyԧ|x > " +X: 15u@ 9i&&oҦlLMmU}cS~osfD hs+3c,ئkfj7bQ;\ $|PLb/GG Wr*i9L|]g4BQ$T^m IDATBVVc2-r9~mZ-.]1/(0WI3 dqK)ylX 8GY yvrLGfG}W`dH&78a` C!!>7'al=Z#JkV wd{]AډF4^b  (ضtByߖ8#wcDC{,5+-g8MLTQ*Dj.!67mQ)^CvszT؆@6$xw9k #:4t3:Df_| umѠZVdLJH!H&1 yTwe~˗V4( yVFBшRY\$O01%4|ZBD ] Ժ(tTC@@C)r6;ߛ>LЄ44`PXm|u *{k9OȞ@Kf xmBx ai[$I ]X+H넴@eHT;$P)F!&*q@M8$Ţx;7KI4ʭ[7oRho$fymȽsst[7r8h A?EbY",xC޾ptOf4+h7OxQal.ЇڽаQ[p BЁI@q8!s&4Lζ;"I[|/8~VDi`̘̇:5ŇȆe(Qlzޜ<<&*%(qƉ EQ$~:Qcub8{52 !1C5&ߗaG/YCXp>BiM&.s*Vڱn4g݀W'1hO0Q/A4kkss^9Ci!wb(Pjr HqTp fv[e޹wȹW_ cb@dhVfc䛸 `oXH*nP.h bp[4zX.gW*A Ѷk0 96X,D6%21d ^@qoQ\T@:[b0co">?QSCM"pGyЇ.,taV`6[ZDñ>ΐ6_r5]9Rr"`LPJZHU i̚,7I t۔px{RgLL&W+ 8vkp]=!ɚ30; #XosJ;ramT#GoJMWX\u׹|9>/.+-C)Jmǧ0uJm}Ʋ|_/2ivD9>bU->̴w=aLa |ҰQ>:QB;)bj 1ވ.;(8iT:|?>` ь 1C& JvE6]mbИ%-*L c.07G2^FTU9(#,xZiT$@HkCgmf]qo\Dt헩dXg5mٳ]ߙZc]ӳU$uq0{!ggJ8;j*L@R}KTztGG} 46GνJ%0!P 8}Fɴ[XW_6b<cTvS$*V>P#Qr{%A`JIt9;P(r'i ~Ɂ8W?aN[olR#aL9SQıc5lT=΀mu?I8L*/5ne[ ŲhJ5m| (a kǘ٬ՠW8y)գlV{tBF|ZяEz6L0ﻻD ܡrظL d w  'hAnP6f *0U 1ɉ_*Ɖ5 gb(仾gxNT4=Qn3Y >&*s)8wnw`dNsM Q O5S<3Pm*2'S.ǧ8 ;XQLc6cvm>O!sS9ފP i5C*07bpЃ(x@܏U6AqaA&=7>ΐ;b.=jdyڪMYJu S ??}k̿*:T`_‰-R+}B`7Po>?o|Pp ӪWVpu9~d7w n kEM͑?<:~Fz( io̢>O0_?έ"kNSRh1p$p,T@WYjЬS_! ?W )AU5fr!6z1* A;LRyç0Sgۧe bp2ﮨP!Xn.CpGɊ|&0n|o7QQYiA2&uD-e he8$gyEΜ FqiI \L*)p] u^eQ3Ӄ7G|i&x?%ycO6HDm wVE@SȆfq(,׹p@5h"`gCQ mr|1GHP{L>C"i?Yt}PAp<PIj am(HZtjsw Mv^g5RvزH :!xa;6)&",kS,i2@AWջ~qo!A115wC *oI.a$u;}1sKk-.[[>"ϓJ.Skp G:ğ¨M.;aJm*a&~j9,4(ܔ$z_Ѿ >iƒg_uaNjHT ϣPc}jr,(|`?O,LORܸANs y|%5~Q)edۦ?g{. 
0*Rp{cu G5 iذT8fl;G4aN}9w'MX@*ěDq8ܴ)P`Ej F ذ nSę N7N#t=L#OFl(:і=yJ']v.Un;mX#-66Jsc#MC ~Cߛ*ݮײg {iG%ARhp Ǐn8>M*E.L$岰,'!|]s| Uv0uʜHTRm#>v06a ]uC jr#1iқ=mI\,O>df-6/Pp4#_H 6W;W $\H<$&INq+WXXO|\]67z[Zw }Dh%m',I=FOF4Iy)^~..>L[?8@ap}%PaK<a ^̤LL(&qw=gF8d0,Qk -# ԯڱ&zC悘 BO#7T@s#0i,rFbedzr9LgSfmMez=RSfx׉2#8ztόuz=uuu^pW~tnZVڡL#\^֤ӑPMc7鎂Fi7 (bn UQ:+ bcЦmja:X:p8ND&TMyBKq1e!̍w ``*h ID4-FoIjvl 0cӣ⤿cAR'RaDύoS31;C8<~J6JMߧ1vvr9:^x7?Ϝ:,oBw{f&D:r:T*̝GV>oƛ}JJˏ2dWpvxU%;F2 IQu^X=}-Yci{dQ P@@.IG),! iXMO+0ßޕj=6NH%K,![]Yto̅mܭ;1\/bec0`fi7׃զvki2x o5יylr]qvvY8 . \41ͽGrF͵ +\xg !|RT,|6%zв gJ uH /:UC4vaNe yׇbNƩ8%;",CRjpn:mQT! e*6LAS~ʕ/Xl;V8[MNLZn' ~=x;7͞,>A]ZNQN|B>O;Z|ss<4\7ǎx?$!PUYxrN⭷M,FHrO>uJ]xWN”Y^1ҴQi$h' S8gRۅMk R3Ormkt-[9D-!cR|XL6Sb$J RS]QMi$0<k(/ٔEi,}LT8stt  mJCuz=LTͺl2Zf{`iPU F7$h9v ]gsYR)rywܤZe:\m{joop;\#}X2`dJm|Dخ+3ƙi  HJf p4?q%0L%>$$F\=c<1_Qop*fЎmlZ!+5_T8!ep &T oDH#-ش\j/++dz; MЮ۪8~7hl3b8-ږW)QR kkln")tkb1u 2J5nfvQU^EX\Ĉ#Ý;,,p<{Xo}tzoHq?wf?md,35+9s IDAT;.J6>CeЂD y0oԯ:'b:]KNAEpEX#&+e}~sc[5>,&*Ed2:d|!.p*f?y3~)7k2r|~ z 5J%iXn"!>`9so7/`#oc_orxdm^b”JDO3 TJzu0fةA]'%qX^F9u\;w2Cm@ċ/ D(f26fwY_HNq. aR1|!@f:<|6Ā86oA%6hsWǁ'E3D!' Ga4K)Ϛ S4\>(S+pGOc(*\/A&÷EB CܾM2i>n~&bjˣ3@Q wT~ ]whl h27n|KK8U }L^H{`֕eBW *,!Au=Ӿ~")ԉhRsz8n6IhW jW1,1}LTqB!7xkj\h+WIJp]]?lfZɻiY5ivQ \L4J.ǩS,.hm3;= lr /m㺨*/4  7/@!-,7ϛTJd$q<ճSAePc-(̗&H1=wnöhSgJ]/Щ(X,~G'D4 ss?\IgD"DD80[[dLO͇@SaCЩT~hò$-ey++,-q4~?>ǎy*eʷ[z4f7[ $%o֔.4}DOO?FAb~B#+k n`kExAff7ΜÂ1{=$'m۬w8԰?,͎B7"@ᎰXOGa1Q_ /ҙlL&dx/q]~vvh6 T'1/x##ﻮS<$2%{% fv ·`;a3xvl:LxMnvW}d}EfdƕM6IQR7/SVdfDdy4Tu 6ښ;+[yd6z ^u `q[q8٦+X]tnt_ґTuͅD>],i[>wn2Cڐܝ6~px,\tힿ%E1aQOCȯ:U[~ *Wklz_ɩ m=Qu1 4pxT9k|aff8x0?{hqg;D SO V`-5Jk$| %!MIR1kR)ZQ 6dA?: ={{]ܴQc&WwXq#HC!_lcUa?O>ɓ_9=G,N$o;JW*cbvsH$lTJШ r5q00 }!l6?cQT)f+V*!Qb/Z4 JH8()5$=2.S7ߟIbl;lW?K8ц#01Vu/,-I*G,-1m@*m3b,-ut]FE"XLY.r+c +siP:ͫ=.K:# \6IC3C:Xr~*UnFCནc&y?] :Sfb4 Yr{b1J:,ArU oX#&&{lX uV%- U"(cGU3UqaQ"\mG")_:8E2/ 9-syx*FR/GjQV Gb]v ~{rH4x{44@Hbݻ̝IѢAKmO6&! `KTi-DaYlG{1EƆ5c}kZ17G@.Z{)I%w)lW%`MA[-7|io=Ay@o@qplvtxXnnf4oO(Xi ߎB~xe(PQҦ0BDq1 E?k; P]ܾB;#Y/0L(EYu1>rE@7)5FW{Xfk]akr_ӼZ1vڴپD=A:EUFQyzqRnDdA\22n ݓw8+%Owsw? fr"xD^gT$íT*< %ZFTZ<%f(!RY=Vbg=Ϣd28}k_?KU`/@-9vC=ѤTGuFmV0Êa+059Pc5E=DAEBRT2c}/ڭ*EP >{MfM4iRs\xjI(& * 'UɪvD 0*!=Ya"Lo[vLy.òj}.+z<ЋXnjPQZ$/FR:f.p8~"uM*:0zҨ!BE"3y ףLei {p|F.e"&)o4P wҌ̫r}S9ZW&ΊtX$i^1ɿ2q/l.M'B(֥% MGniٿYrbOD+c&|mRr>:dI+hj4~KȗplW kS-}ֈ>ER}V\k%Dț5ʠ'f~ݼGeޤqGhE^HCB:{ u9 ٩]2Rգma{򔚘Uk7lalw2j<%gE+7y᛼< xi?Wc3棠̦ p]C%A@`۩)Ҥ2o ahQ?Z.cDe1mMڮTxn&J_'b"!bp_ Wkjche;!PG'g I˥2H;Se]YZ__Q_yXcsKv4Ӵ&Y Rq)FI Gzv k[j2qUW(owڄOk+Xnj-pr. 
nt=?'уY騁`?R1n` b *@"0)}bwb1^|RDzz:1<ԐVƣ_ ?x-H&)#0^29)\jKJV.CYyMD##xg׾Ojx.Fl^gOi!Hd4H\8" g.()6KڲR->G459ߍZC@$g9"7(mUp̘{ .,q!B P-SA>6JX D,Pcw@E: Ci|o&#zqB703Je"H`-0k"0f)3݆ ҘPl7Œvfxy{Hkf$\IqShaUEjZrXv4gxPTH_ hyog -Lo2!1[WnQ+j~LD6ѡDPlN3?wWN^2_D{HljxRWTp,~30Û'P+D| LyH;h~XnjW pXjFquB&N4`0ɡ_5mVx o#_ ,6V-E]BbN`(a_NC0,WHqE-u4e?n׼䫀$ V$)vU:Wo"XC6I1bOя"})0Gpsao\,Z`BOD%-޸/%`k [9!CG~K (zx"4Vs,ppjhaL$l])ن .$SG9x ܋4% N ߧU3V1c 5J+4Ϯ Ҥ:12!By&ONp`}KlҦbЫUPgrLFEܢ5|(.fsb;jԦfPi20?0!!RD .L2;-FaWw JD8.NWQ(ニeT\2T B7oBjU \2W^덚1f臨C"Cq&,]=zEy#O115jDk>,yb`aR9\d㏐01܉{y c3St3Y.[rNLl8AD1i]BFY%~ftQ7sMC3٠&#IF$Wm;NML/ IDATVCo{Kd(tF{NA5QV\mpt6^ oc1R`E9D$n5<.ה~Y}?52`.\};k:\r2u}Y9" 'QTSȒsvdέtAOfl!ӵ3a##ȃʴ z$3,XJvZxKm%R9RYiOeXnjnbq̆ID01[v[T:lѠ|jYc@/ok }FmS%S{ҪRiAi' .$|_zm;KZ23Es P#1¾IHCdu׌:Iug9[aU9~~g.c֯]q/,Pސr Hض4ut: ;,';I~FcXmĒ; Qg`B&4K4'sh:3yHỴ_"8)kXnjTX6v3 \C0vW;4~82>Η"(NDG4,{x>Ah' !pRFeTٞ4 kR 6sAq_)(I ]uJrbb/a'h (y7,$nձV"/PrՁ.XF@vs CN' bhtaB䘧pb Kӌ1tbhwFoCc PmLEsDa253`8psуc3.]^>;zZ@n D"ʎnxts Af㻬DSGbQ Ly|||A<2Akg'S#NQ@cD*Hۃm%4$:$.Nb~֖vEE5ԃh.!B2mzoG?w57r!M꼟wL^`R +M#;ɱغPݒ%掐.pU/$ MsX-mdAc~&;sF#6ڪC6?0opKq,ޟecM\J 0jrͦ $tcr6V&5vlN۬fb32g >@UJkl1R:ށ6I{Es(y H& Ku_uhlBI /? p{E;ڃMF]^{o"!G%0(9;a=V]RYTC1fpbh7zѸAHXP¶m3MϦ!?YCK gHaE|:5Qބf LNL;wqh:42 L$(p s#ft9 GFnW?xvH75^(y5}@9'~)>[cF 4kd9B>Sv[ ]q"6rS{QmHB+68`IzW"J!t Ba~Jv?Vl_FhwxE05OvCyG<*o0 z_c"\0$նB=˳WWnOճQGy k$ث[JzJU4,]$"lw8OXnjeJ??C^\;4|\Ku }V NuanhoqICc@?2l;5iBiA^{j2[  'gBx5Ry! B|B*7qߢ񟐌qP| p~ۊSYuQ|JZCT#nj1w-A4390YdGٙsQTv`/1Y2SCaTx M~oQ/w +9'XƳx?Zzǝ_?go{=f',: J$e(iIk^ [X0 E(60QuʛUx,/ Fmmǭ<2 Xh( /aCn֤ث70hdnuXFԌm6j%J׏LDHV3Gt^x5Z5l ˳@VR-L"gRGfY4P? N~xSO\AId9x 33}C8{I&\2'ոCO%0* ♍Sqg6`c9 f@ʚ"jvt`kӟ$±|eK<[<ﮱ@״(8WXs BZ6f<9G(R6e*W^>&1\d}k la(B&vK&^LuJ*k|''ͽi?i\FFv~*&hi,]mx8hkFeh0%@?: h ر;(s@r\^Ӎ sW*9w{U 41ȇ;e|N a 8V4DKQ%aS ]j/p1!?O~Xs"&G"P$/\|I1+} 4&& {b2 "B _ VÜ- ^.mf}GSϿx.g*'BFqĘ1?'87SI4vɒ\?r̆1c[%cEfTP0("zI?mOO|%H:w[tg}ѡ[B}E 3N0DN ś%p{HR %,Y.Jݪu 6,Os bsZDʬxϠsva3 SS^}{"\H!=1rtpSjew7i^#ȍk m63 @O )Rae?[h!7`)+ a>`۲wH9d"N[hC_٠%A9c (QMһȂ*2J1x q%jI=m*:ɯDsw6(ζeU{_ǜ^I]|9qޱ'Nr/4]j!L,-wn0uz5mU&IdY^u[Gs/S7$: (d3hqN}^H+W eO|^0V1c Q^/Sv>ڔJ&[ܰ.M٣(<<UC><gx.$oJ90;:ja=W3QIr& gZΎ>RD0D-n#  P&7, 8,^ygXm᧑WZ:,^@E1wۧg|[c unǤj1z˯"Gij 3z!eʛG<Պf(*ɂ*s}W{ȝį`u8xN 2Up̘Kч}ګ= 3D|(-Zv$sEfgXKU96#s.5Ewb*% l$PcCâ#0?gӣ2V.zPeF) QxetR Il Qo_8[M$AI`# F>1j#q1K{5(Z-h<|[Q!Gs/$(-9|?#2mdrLRPT6΀8C!qr̝Qf*5j%90Up̘Cx_buMm!3O/3҅VCcͪMO#΀Βm}Uv,6$$p>Q!#%ONO`l3\& = (N c|^ %C6wBx搜)g۬4I|4l-K g ˗&3 ^n:$M^-(EEѮ׬2pZX4[ha*[d,ޏLUp̘J?Ԅstu߰䮏'!D&9aBAďgXУZpVUPRzۥ)/2zq+ȷ=BL I-Dy"-?Wߨ/`<oeCB 7!He`e]`)` !E*kOj%D8ګBTBTwK8(ZxZy\*T>MQƍ3S /0yۦ8Ce ?jkP6J'QÜzg.v*8f]#M?vXLGCzUJW*;6(Sa1ͤL9mWvS#ˬ69N i|K*@ l4;lE_ ' [m!xBFHŷ6ZFK(~@H q1g( T+a['_*@慦 kҡU0Q4 da[hH %J/2=;Wp|d,O"+(:'!򟽺1w[s_@ PC -1mVwQG0/tikp6 e +_q?I6<"no'1R릚$HlZ)E oQzo)Lc~{i E%C8Wj 3`Dyla[BPW"i/Dp!Xnj봨s; l͒AbL<(pT6倾M Ħ5D+uJ~4Ǎ(3IE+DrLj]2JL`!Ǒc w B2 |DlT4pP $)Ba޽uwksЍ RdIF˩d&6dƻUffvݭd<[̔I&ΎUeYe` _s˖|E6~>/X?{D;տ ÍMr\*em~ٮfmA. o^ ^f˜`|bLi]KWu~={֦azKSG$?8=)B= Htָ2Di^અa`C3s)9j5 s嗷Y ʽ<_HZ;X8w o8?#Ղ<^_!O`'=)ɘ< ~QRNH! )NߥR:ӪRo4;USOѲU. 
,).mQ,α |*CL?U92lg&:-οYr4}t~ )tk Aq*dqE;Sirsܻ תFEEVX?ͯ~oexem>o>~f5 IoIQ&A$AE=O5B͜Rj$5{`̒'~tTzksY/IkQuo/h),gWs4m4{Y gZe%m&(&M*BktX6' A$`5II0cL,#m)W^rOQkjogՑKuv .Ц^f:KIfWY|gp<6b<`͓yps׉:DG2WD$f̈*D.uh6ZmĻ6QDڀ橲S0Zuήvf5=զ)ˡ]UHQsp?mr6"v@meI(u׋u_7yNS{ E")(2 Ƨ92͑/̍z%,s>CΧoS|X7z&։J,W8Ӥ0 da;;Zdك?8"_u4qqϹ.gWjꠗH{ Z⾅99Klu 1 e' :vۺ5gD1%_U(._mܭ 76c9{ʵ3%ض٫+blE8Z5t|v0>I1w{^aFJKG!cSzƧ(a6:sg!͟4YJ(XCVZn7K ;Mq($g:QMloUnJ2`!ܢ^fլb{E$Eo.a%MbDu 5QL&S0Rmcu<0e^F.{+%/S!Wڠԕ& 6q)z5 q䐑]cThVQU|{o}^]G)M37qX[0ƇO>vccw-BxqYn:]w)%8S;lƊdĴkbDt1 O{gV;hB/>rЯ_ KUZ((6E5ة裨{½?:Mm4395Hjv8N &\8EzhELPft֗)0-v:}<479lw+No1ZZzufW8nߛg94G:44[d$l▙H!dL25&zh[sdW1vHڠ d8&Q-R?p=M&qϐ}pxjcb WJAƘ:ēXݝꅱbFӨp.=oLG5~666]z˾DSdl[7,iG Vf M}w1-: *+走390*_"~c~7$ERP.8Gs#y1>\Iuhoxz>GL)BM@sղAYt<5`hcu A^FMOnJLUQ(ś׸e3Z&[lM BB)~ >Q.& hn?Rвl|z7ZCRP"Gq jt'6}JnbpUWPs8Po")K=m٣`'*"$*T=kje\dasvvLj-RO>9zaU2FʼnlM~* zjtxcL>ͯg6M|dzp:6HM:<;\[%}/;?&EqES7CtiQ=Vm.0 0>^nI sr\cF~-{ݱl/"_9Ju fQŭUK&g aڬ/3:]q`~yARP/{9doʠ}~F>@r.&WD.Z DzneL`6 jŀI>j]"ݡKcQޭ\#ecewnz6c7]q€0[k]fVn03s,f_SYN")(Mwsy7MCE% QKě:qڎj Bt? y()-ܷHKÕ{"re` mmT-Mp@2mZ&#cC&"Ng9MRPG<@D FLZNpA'C= )"Als xI|2 Mwmo58wG$Զu|Ioc&Y6q؍㏢ZPԋf8x6( KRPq.I2& aT(yf-jF/1QL̙8+2qi{۴eۗbtd;*^kvog.mqAts4|LJi/ *:tϒJP $ݝ;5Ey}$W(tP2QԐ'C}r6N>N?鴝'>_ȩWbLbLbA{P"<:!&*(ID t5;t 0 M;URi0%Ln$3xL*kRT$I݆[DQbw_gA;{{ʌB>.B?q1v+?Ea`kcyS~>K-(e {]H8G4,zd \u*TݲcGWP^mil߽Ѝw;B| >a ƚEu ֠YB(]=y#S=Ԃ?9I !>0?t^|cUAG'i.x$!>?jUjD!jKKRPۛ9i:yلT\CiMTb`1gQ\{UPvu JoIJR0Ml>MH!(nϽϽz}?} \KV8RUCj&3:m#![B{q!ƍqRz-e>'ܪs^/[v仟;z$_IAqSyi;Vs9&lj i逫!'fɡ댸Qey*b1FDS#o2e2ݎϦ9p z2<-)(`f)xys|1D*>n>!|r;u,<.1hgbz+&cZ*JʴTY{tZ!nW *M{sϺ9;3b.)Ͻ_?O~/W{ß7O@e?WЂt@ Ԍh݈U0)TRBJd&lB)yJc5C)}θ1]-R#]Np;̎'<{O?ޟ1gs_|Til <03m+>+^7xYaWAW0C1 u`q\.r$cE0gP&v#Dbh ?*UK`AJͼɼAYE*Ng=@Gg"'I=>79LPiZ]bl;)G3r 1ӕ/ ý >\ Vuyc-m3dz*vbXf-fwwixAֻmSP }.- 8a r <]nPh5F|'XhfPNuRqZMH]>=[<>wyӠ\]K{7c}bȸ^!`&S &uZR f47p*t<ΪdLfRt[ƨAYIqXib{1 11KT ZuLF}FBMC)f"}HDS!:mqK & }wy`zOǭSX곢bi9>+^JRG80$tcvm/2#ĝOjA12BvKX !8\`2LX~BWk(ir?'ibF!S)1DC`rU,T-Cvl8*44L*!&ؠ@ߛ, Xd"1lhPH,B :@lFk}Φ(JF%3$ĝLRP  S#hYPcE#g,M C5{S="C)ll'}·RRMq> :uz\Y8 z!nIA1 & &U>*R7YfBB !; 8<ߏF߸4|6T21݀0BZƠag|u<&z{ɸ:'|>-.T#z`G>K:[/]W"d .K x\htb } mq&ۡ AYa[&ZP}mcs_="6A b =m2۠cCC!MHt7Fo^ߪ{ֈn1n՘^B>`aE V7TGRPx_HLd&35:ЅL>!YVcT[jb"(ML҆,{?)vWn`@rw>+^@E qH ,N:S5qc*-ZjЩ[֖e . 
uwot/src/hardware_concurrency.cpp
#include <thread>

#include <Rcpp.h>

// [[Rcpp::export]]
unsigned int hardware_concurrency() {
  return std::thread::hardware_concurrency();
}
uwot/src/r_uwot.cpp0000644000176200001440000003376514733730066014105 0ustar liggesusers// UWOT -- An R package for dimensionality reduction using UMAP
//
// Copyright (C) 2018 James Melville
//
// This file is part of UWOT
//
// UWOT is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// UWOT is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with UWOT. If not, see <http://www.gnu.org/licenses/>.
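//
// r_uwot.cpp is the bridge between the R API and the header-only optimizer
// under inst/include/uwot. optimize_layout_r() (Rcpp-exported at the bottom
// of this file) selects a gradient ("umap", "tumap", "largevis", "leopold",
// "leopold2"); UmapFactory then expands the remaining runtime options
// (move_other, batch, rng_type, threading) into template parameters via the
// create_impl() overloads, so the inner optimization loop is specialized at
// compile time rather than branching per edge.
//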
#include <Rcpp.h>

#include "uwot/coords.h"
#include "uwot/epoch.h"
#include "uwot/gradient.h"
#include "uwot/optimize.h"
#include "uwot/sampler.h"

#include "rng.h"
#include "rparallel.h"
#include "rprogress.h"

using namespace Rcpp;

template <typename T>
auto lget(List list, const std::string &name, T default_value) -> T {
  auto key = name.c_str();
  if (!list.containsElementNamed(key)) {
    return default_value;
  } else {
    return list[key];
  }
}

// Template class specialization to handle different rng/batch combinations
template <bool DoBatch = true> struct BatchRngFactory {
  using PcgFactoryType = batch_pcg_factory;
  using TauFactoryType = batch_tau_factory;
  using DeterministicFactoryType = deterministic_factory;
};
template <> struct BatchRngFactory<false> {
  using PcgFactoryType = pcg_factory;
  using TauFactoryType = tau_factory;
  using DeterministicFactoryType = deterministic_factory;
};

struct UmapFactory {
  bool move_other;
  const std::string &rng_type;
  std::vector<float> &head_embedding;
  std::vector<float> &tail_embedding;
  const std::vector<unsigned int> &positive_head;
  const std::vector<unsigned int> &positive_tail;
  const std::vector<unsigned int> &positive_ptr;
  unsigned int n_epochs;
  unsigned int n_head_vertices;
  unsigned int n_tail_vertices;
  const std::vector<float> &epochs_per_sample;
  float initial_alpha;
  List opt_args;
  float negative_sample_rate;
  bool batch;
  std::size_t n_threads;
  std::size_t grain_size;
  uwot::EpochCallback *epoch_callback;
  bool verbose;

  UmapFactory(bool move_other, const std::string &rng_type,
              std::vector<float> &head_embedding,
              std::vector<float> &tail_embedding,
              const std::vector<unsigned int> &positive_head,
              const std::vector<unsigned int> &positive_tail,
              const std::vector<unsigned int> &positive_ptr,
              unsigned int n_epochs, unsigned int n_head_vertices,
              unsigned int n_tail_vertices,
              const std::vector<float> &epochs_per_sample, float initial_alpha,
              List opt_args, float negative_sample_rate, bool batch,
              std::size_t n_threads, std::size_t grain_size,
              uwot::EpochCallback *epoch_callback, bool verbose)
      : move_other(move_other), rng_type(rng_type),
        head_embedding(head_embedding), tail_embedding(tail_embedding),
        positive_head(positive_head), positive_tail(positive_tail),
        positive_ptr(positive_ptr), n_epochs(n_epochs),
        n_head_vertices(n_head_vertices), n_tail_vertices(n_tail_vertices),
        epochs_per_sample(epochs_per_sample), initial_alpha(initial_alpha),
        opt_args(opt_args), negative_sample_rate(negative_sample_rate),
        batch(batch), n_threads(n_threads), grain_size(grain_size),
        epoch_callback(epoch_callback), verbose(verbose) {}

  template <typename Gradient> void create(const Gradient &gradient) {
    if (move_other) {
      create_impl<true>(gradient, rng_type, batch);
    } else {
      create_impl<false>(gradient, rng_type, batch);
    }
  }

  template <bool DoMove, typename Gradient>
  void create_impl(const Gradient &gradient, const std::string &rng_type,
                   bool batch) {
    if (batch) {
      create_impl<BatchRngFactory<true>, DoMove>(gradient, rng_type, batch);
    } else {
      create_impl<BatchRngFactory<false>, DoMove>(gradient, rng_type, batch);
    }
  }

  template <typename BatchRngFactory, bool DoMove, typename Gradient>
  void create_impl(const Gradient &gradient, const std::string &rng_type,
                   bool batch) {
    if (rng_type == "pcg") {
      create_impl<typename BatchRngFactory::PcgFactoryType, DoMove>(gradient,
                                                                    batch);
    } else if (rng_type == "tausworthe") {
      create_impl<typename BatchRngFactory::TauFactoryType, DoMove>(gradient,
                                                                    batch);
    } else if (rng_type == "deterministic") {
      create_impl<typename BatchRngFactory::DeterministicFactoryType, DoMove>(
          gradient, batch);
    } else {
      stop("Invalid rng type: ", rng_type);
    }
  }

  std::unique_ptr<uwot::Optimizer> create_optimizer(List opt_args) {
    std::string method = lget(opt_args, "method", "adam");
    if (method == "adam") {
      float alpha = lget(opt_args, "alpha", 1.0);
      float beta1 = lget(opt_args, "beta1", 0.9);
      float beta2 = lget(opt_args, "beta2", 0.999);
      float eps = lget(opt_args, "eps", 1e-7);
      if (verbose) {
        Rcerr << "Optimizing with Adam" << " alpha = " << alpha
              << " beta1 = " << beta1 << " beta2 = " << beta2
              << " eps = " << eps << std::endl;
      }
      return std::make_unique<uwot::Adam>(alpha, beta1, beta2, eps,
                                          head_embedding.size());
    } else if (method == "sgd") {
      float alpha = lget(opt_args, "alpha", 1.0);
      if (verbose) {
        Rcerr << "Optimizing with SGD" << " alpha = " << alpha << std::endl;
      }
      return std::make_unique<uwot::Sgd>(alpha);
    } else {
      stop("Unknown optimization method: " + method);
    }
  }

  template <typename RandFactory, bool DoMove, typename Gradient>
  void create_impl(const Gradient &gradient, bool batch) {
    uwot::Sampler sampler(epochs_per_sample, negative_sample_rate);
    const std::size_t ndim = head_embedding.size() / n_head_vertices;

    if (batch) {
      std::string opt_name = opt_args["method"];
      auto opt = create_optimizer(opt_args);
      uwot::BatchUpdate<DoMove> update(head_embedding, tail_embedding,
                                       std::move(opt), epoch_callback);
      uwot::NodeWorker<Gradient, decltype(update), RandFactory> worker(
          gradient, update, positive_head, positive_tail, positive_ptr,
          sampler, ndim, n_tail_vertices);
      create_impl(worker, gradient);
    } else {
      uwot::InPlaceUpdate<DoMove> update(head_embedding, tail_embedding,
                                         initial_alpha, epoch_callback);
      uwot::EdgeWorker<Gradient, decltype(update), RandFactory> worker(
          gradient, update, positive_head, positive_tail, sampler, ndim,
          n_tail_vertices, n_threads);
      create_impl(worker, gradient);
    }
  }

  template <typename Worker, typename Gradient>
  void create_impl(Worker &worker, const Gradient &gradient) {
    RProgress progress(n_epochs, verbose);
    if (n_threads > 0) {
      RParallel parallel(n_threads, grain_size);
      create_impl(worker, gradient, progress, parallel);
    } else {
      RSerial serial;
      create_impl(worker, gradient, progress, serial);
    }
  }

  template <typename Worker, typename Gradient, typename Progress,
            typename Parallel>
  void create_impl(Worker &worker, const Gradient &gradient,
                   Progress &progress, Parallel &parallel) {
    uwot::optimize_layout(worker, progress, n_epochs, parallel);
  }
};

auto r_to_coords(NumericMatrix head_embedding,
                 Nullable<NumericMatrix> tail_embedding) -> uwot::Coords {
  auto head_vec = as<std::vector<float>>(head_embedding);
  if (tail_embedding.isNull()) {
    return uwot::Coords(head_vec);
  } else {
    auto tail_vec = as<std::vector<float>>(tail_embedding);
    return uwot::Coords(head_vec, tail_vec);
  }
}

auto r_to_coords(NumericMatrix head_embedding) -> uwot::Coords {
  auto head_vec = as<std::vector<float>>(head_embedding);
  return uwot::Coords(head_vec);
}

void validate_args(List method_args,
                   const std::vector<std::string> &arg_names) {
  for (auto &arg_name : arg_names) {
    if (!method_args.containsElementNamed(arg_name.c_str())) {
      stop("Missing embedding method argument: " + arg_name);
    }
  }
}

void create_umap(UmapFactory &umap_factory, List method_args) {
  std::vector<std::string> arg_names = {"a", "b", "gamma", "approx_pow"};
  validate_args(method_args, arg_names);

  float a = method_args["a"];
  float b = method_args["b"];
  float gamma = method_args["gamma"];
  bool approx_pow = method_args["approx_pow"];
  if (approx_pow) {
    const uwot::apumap_gradient gradient(a, b, gamma);
    umap_factory.create(gradient);
  } else {
    const uwot::umap_gradient gradient(a, b, gamma);
    umap_factory.create(gradient);
  }
}

void create_tumap(UmapFactory &umap_factory, List method_args) {
  std::vector<std::string> arg_names = {"gamma"};
  validate_args(method_args, arg_names);
  float gamma = method_args["gamma"];
  const uwot::tumap_gradient gradient(gamma);
  umap_factory.create(gradient);
}

void create_umapai(UmapFactory &umap_factory, List method_args) {
  std::vector<std::string> arg_names = {"ai", "b", "ndim"};
  validate_args(method_args, arg_names);
  std::vector<float> ai = method_args["ai"];
  float b = method_args["b"];
  std::size_t ndim = method_args["ndim"];
  const uwot::umapai_gradient gradient(ai, b, ndim);
  umap_factory.create(gradient);
}

void create_umapai2(UmapFactory &umap_factory, List method_args) {
  std::vector<std::string> arg_names = {"ai", "aj", "b", "ndim"};
  validate_args(method_args, arg_names);
  std::vector<float> ai = method_args["ai"];
  std::vector<float> aj = method_args["aj"];
  float b = method_args["b"];
  std::size_t ndim = method_args["ndim"];
  const uwot::umapai2_gradient gradient(ai, aj, b, ndim);
  umap_factory.create(gradient);
}

void create_largevis(UmapFactory &umap_factory, List method_args) {
  std::vector<std::string> arg_names = {"gamma"};
  validate_args(method_args, arg_names);
  float gamma = method_args["gamma"];
  const uwot::largevis_gradient gradient(gamma);
  umap_factory.create(gradient);
}

// Wrap Rcpp Function for use as a callback
template <bool DoMove> struct REpochCallback : uwot::EpochCallback {
  Function f;
  std::size_t ndim;
  REpochCallback(Function f, std::size_t ndim) : f(f), ndim(ndim) {}
  void operator()(std::size_t epoch, std::size_t n_epochs,
                  const std::vector<float> &head_embedding,
                  const std::vector<float> &tail_embedding) override {
    NumericMatrix head_mat(ndim, head_embedding.size() / ndim,
                           head_embedding.begin());
    auto head_matt = transpose(head_mat);
    NumericMatrix tail_mat(ndim, tail_embedding.size() / ndim,
                           tail_embedding.begin());
    auto tail_matt = transpose(tail_mat);
    f(epoch + 1, n_epochs, head_matt, tail_matt);
  }
};

template <> struct REpochCallback<false> : uwot::EpochCallback {
  Function f;
  std::size_t ndim;
  REpochCallback(Function f, std::size_t ndim) : f(f), ndim(ndim) {}
  void operator()(std::size_t epoch, std::size_t n_epochs,
                  const std::vector<float> &head_embedding,
                  const std::vector<float> &) override {
    NumericMatrix m(ndim, head_embedding.size() / ndim,
                    head_embedding.begin());
    auto mt = transpose(m);
    f(epoch + 1, n_epochs, mt);
  }
};

auto create_callback(Nullable<Function> epoch_callback, std::size_t ndim,
                     bool move_other) -> uwot::EpochCallback * {
  if (epoch_callback.isNull()) {
    return new uwot::DoNothingCallback();
  } else {
    if (move_other) {
      return new REpochCallback<true>(as<Function>(epoch_callback), ndim);
    } else {
      return new REpochCallback<false>(as<Function>(epoch_callback), ndim);
    }
  }
}

// [[Rcpp::export]]
NumericMatrix optimize_layout_r(
    NumericMatrix head_embedding, Nullable<NumericMatrix> tail_embedding,
    const std::vector<unsigned int> positive_head,
    const std::vector<unsigned int> positive_tail,
    const std::vector<unsigned int> positive_ptr, unsigned int n_epochs,
    unsigned int n_head_vertices, unsigned int n_tail_vertices,
    const std::vector<float> epochs_per_sample, const std::string &method,
    List method_args, float initial_alpha, List opt_args,
    Nullable<Function> epoch_callback, float negative_sample_rate,
    const std::string &rng_type = "tausworthe", bool batch = false,
    std::size_t n_threads = 0, std::size_t grain_size = 1,
    bool move_other = true, bool verbose = false) {
  auto coords = r_to_coords(head_embedding, tail_embedding);
  const std::size_t ndim = head_embedding.size() / n_head_vertices;
  uwot::EpochCallback *uwot_ecb =
      create_callback(epoch_callback, ndim, move_other);

  UmapFactory umap_factory(
      move_other, rng_type, coords.get_head_embedding(),
      coords.get_tail_embedding(), positive_head, positive_tail, positive_ptr,
      n_epochs, n_head_vertices, n_tail_vertices, epochs_per_sample,
      initial_alpha, opt_args, negative_sample_rate, batch, n_threads,
      grain_size, uwot_ecb, verbose);

  if (verbose) {
    Rcerr << "Using method '" << method << "'" << std::endl;
  }
  if (method == "umap") {
    create_umap(umap_factory, method_args);
  } else if (method == "tumap") {
    create_tumap(umap_factory, method_args);
  } else if (method == "largevis") {
    create_largevis(umap_factory, method_args);
  } else if (method == "leopold") {
    create_umapai(umap_factory, method_args);
  } else if (method == "leopold2") {
    create_umapai2(umap_factory, method_args);
  } else {
    stop("Unknown method: '" + method + "'");
  }

  return NumericMatrix(head_embedding.nrow(), head_embedding.ncol(),
                       coords.get_head_embedding().begin());
}
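// A minimal usage sketch, assuming the package is installed: this entry
// point is internal, and is normally reached through the exported umap()
// wrapper rather than called directly.
//
//   library(uwot)
//   # batch = TRUE drives the BatchUpdate/Adam path above; n_threads > 0
//   # selects RParallel over RSerial in UmapFactory::create_impl().
//   embedding <- umap(iris[, 1:4], n_threads = 2, batch = TRUE)
//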
uwot/src/RcppExports.cpp0000644000176200001440000004141514734350106015044 0ustar liggesusers// Generated by using Rcpp::compileAttributes() -> do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #include using namespace Rcpp; #ifdef RCPP_USE_GLOBAL_ROSTREAM Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // connected_components_undirected List connected_components_undirected(std::size_t N, const std::vector& indices1, const std::vector& indptr1, const std::vector& indices2, const std::vector& indptr2); RcppExport SEXP _uwot_connected_components_undirected(SEXP NSEXP, SEXP indices1SEXP, SEXP indptr1SEXP, SEXP indices2SEXP, SEXP indptr2SEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< std::size_t >::type N(NSEXP); Rcpp::traits::input_parameter< const std::vector& >::type indices1(indices1SEXP); Rcpp::traits::input_parameter< const std::vector& >::type indptr1(indptr1SEXP); Rcpp::traits::input_parameter< const std::vector& >::type indices2(indices2SEXP); Rcpp::traits::input_parameter< const std::vector& >::type indptr2(indptr2SEXP); rcpp_result_gen = Rcpp::wrap(connected_components_undirected(N, indices1, indptr1, indices2, indptr2)); return rcpp_result_gen; END_RCPP } // annoy_search_parallel_cpp List annoy_search_parallel_cpp(const std::string& index_name, NumericMatrix mat, std::size_t n_neighbors, std::size_t search_k, const std::string& metric, std::size_t n_threads, std::size_t grain_size); RcppExport SEXP _uwot_annoy_search_parallel_cpp(SEXP index_nameSEXP, SEXP matSEXP, SEXP n_neighborsSEXP, SEXP search_kSEXP, SEXP metricSEXP, SEXP n_threadsSEXP, SEXP grain_sizeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const std::string& >::type index_name(index_nameSEXP); Rcpp::traits::input_parameter< NumericMatrix >::type mat(matSEXP); Rcpp::traits::input_parameter< std::size_t >::type n_neighbors(n_neighborsSEXP); Rcpp::traits::input_parameter< std::size_t >::type search_k(search_kSEXP); Rcpp::traits::input_parameter< const std::string& >::type metric(metricSEXP); Rcpp::traits::input_parameter< std::size_t >::type n_threads(n_threadsSEXP); Rcpp::traits::input_parameter< std::size_t >::type grain_size(grain_sizeSEXP); rcpp_result_gen = Rcpp::wrap(annoy_search_parallel_cpp(index_name, mat, n_neighbors, search_k, metric, n_threads, grain_size)); return rcpp_result_gen; END_RCPP } // calc_row_probabilities_parallel List calc_row_probabilities_parallel(NumericVector nn_dist, std::size_t n_vertices, double perplexity, std::size_t n_iter, double tol, bool ret_sigma, std::size_t n_threads, std::size_t grain_size); RcppExport SEXP _uwot_calc_row_probabilities_parallel(SEXP nn_distSEXP, SEXP n_verticesSEXP, SEXP perplexitySEXP, SEXP n_iterSEXP, SEXP tolSEXP, SEXP ret_sigmaSEXP, SEXP n_threadsSEXP, SEXP grain_sizeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericVector >::type nn_dist(nn_distSEXP); Rcpp::traits::input_parameter< std::size_t >::type n_vertices(n_verticesSEXP); Rcpp::traits::input_parameter< double >::type perplexity(perplexitySEXP); Rcpp::traits::input_parameter< std::size_t >::type n_iter(n_iterSEXP); Rcpp::traits::input_parameter< double >::type tol(tolSEXP); Rcpp::traits::input_parameter< bool >::type ret_sigma(ret_sigmaSEXP); Rcpp::traits::input_parameter< std::size_t >::type 
n_threads(n_threadsSEXP); Rcpp::traits::input_parameter< std::size_t >::type grain_size(grain_sizeSEXP); rcpp_result_gen = Rcpp::wrap(calc_row_probabilities_parallel(nn_dist, n_vertices, perplexity, n_iter, tol, ret_sigma, n_threads, grain_size)); return rcpp_result_gen; END_RCPP } // optimize_layout_r NumericMatrix optimize_layout_r(NumericMatrix head_embedding, Nullable tail_embedding, const std::vector positive_head, const std::vector positive_tail, const std::vector positive_ptr, unsigned int n_epochs, unsigned int n_head_vertices, unsigned int n_tail_vertices, const std::vector epochs_per_sample, const std::string& method, List method_args, float initial_alpha, List opt_args, Nullable epoch_callback, float negative_sample_rate, const std::string& rng_type, bool batch, std::size_t n_threads, std::size_t grain_size, bool move_other, bool verbose); RcppExport SEXP _uwot_optimize_layout_r(SEXP head_embeddingSEXP, SEXP tail_embeddingSEXP, SEXP positive_headSEXP, SEXP positive_tailSEXP, SEXP positive_ptrSEXP, SEXP n_epochsSEXP, SEXP n_head_verticesSEXP, SEXP n_tail_verticesSEXP, SEXP epochs_per_sampleSEXP, SEXP methodSEXP, SEXP method_argsSEXP, SEXP initial_alphaSEXP, SEXP opt_argsSEXP, SEXP epoch_callbackSEXP, SEXP negative_sample_rateSEXP, SEXP rng_typeSEXP, SEXP batchSEXP, SEXP n_threadsSEXP, SEXP grain_sizeSEXP, SEXP move_otherSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type head_embedding(head_embeddingSEXP); Rcpp::traits::input_parameter< Nullable >::type tail_embedding(tail_embeddingSEXP); Rcpp::traits::input_parameter< const std::vector >::type positive_head(positive_headSEXP); Rcpp::traits::input_parameter< const std::vector >::type positive_tail(positive_tailSEXP); Rcpp::traits::input_parameter< const std::vector >::type positive_ptr(positive_ptrSEXP); Rcpp::traits::input_parameter< unsigned int >::type n_epochs(n_epochsSEXP); Rcpp::traits::input_parameter< unsigned int >::type n_head_vertices(n_head_verticesSEXP); Rcpp::traits::input_parameter< unsigned int >::type n_tail_vertices(n_tail_verticesSEXP); Rcpp::traits::input_parameter< const std::vector >::type epochs_per_sample(epochs_per_sampleSEXP); Rcpp::traits::input_parameter< const std::string& >::type method(methodSEXP); Rcpp::traits::input_parameter< List >::type method_args(method_argsSEXP); Rcpp::traits::input_parameter< float >::type initial_alpha(initial_alphaSEXP); Rcpp::traits::input_parameter< List >::type opt_args(opt_argsSEXP); Rcpp::traits::input_parameter< Nullable >::type epoch_callback(epoch_callbackSEXP); Rcpp::traits::input_parameter< float >::type negative_sample_rate(negative_sample_rateSEXP); Rcpp::traits::input_parameter< const std::string& >::type rng_type(rng_typeSEXP); Rcpp::traits::input_parameter< bool >::type batch(batchSEXP); Rcpp::traits::input_parameter< std::size_t >::type n_threads(n_threadsSEXP); Rcpp::traits::input_parameter< std::size_t >::type grain_size(grain_sizeSEXP); Rcpp::traits::input_parameter< bool >::type move_other(move_otherSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); rcpp_result_gen = Rcpp::wrap(optimize_layout_r(head_embedding, tail_embedding, positive_head, positive_tail, positive_ptr, n_epochs, n_head_vertices, n_tail_vertices, epochs_per_sample, method, method_args, initial_alpha, opt_args, epoch_callback, negative_sample_rate, rng_type, batch, n_threads, grain_size, move_other, verbose)); return rcpp_result_gen; END_RCPP } // 
smooth_knn_distances_parallel List smooth_knn_distances_parallel(NumericVector nn_dist, IntegerVector nn_ptr, bool skip_first, NumericVector target, std::size_t n_iter, double local_connectivity, double tol, double min_k_dist_scale, bool ret_sigma, std::size_t n_threads, std::size_t grain_size); RcppExport SEXP _uwot_smooth_knn_distances_parallel(SEXP nn_distSEXP, SEXP nn_ptrSEXP, SEXP skip_firstSEXP, SEXP targetSEXP, SEXP n_iterSEXP, SEXP local_connectivitySEXP, SEXP tolSEXP, SEXP min_k_dist_scaleSEXP, SEXP ret_sigmaSEXP, SEXP n_threadsSEXP, SEXP grain_sizeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericVector >::type nn_dist(nn_distSEXP); Rcpp::traits::input_parameter< IntegerVector >::type nn_ptr(nn_ptrSEXP); Rcpp::traits::input_parameter< bool >::type skip_first(skip_firstSEXP); Rcpp::traits::input_parameter< NumericVector >::type target(targetSEXP); Rcpp::traits::input_parameter< std::size_t >::type n_iter(n_iterSEXP); Rcpp::traits::input_parameter< double >::type local_connectivity(local_connectivitySEXP); Rcpp::traits::input_parameter< double >::type tol(tolSEXP); Rcpp::traits::input_parameter< double >::type min_k_dist_scale(min_k_dist_scaleSEXP); Rcpp::traits::input_parameter< bool >::type ret_sigma(ret_sigmaSEXP); Rcpp::traits::input_parameter< std::size_t >::type n_threads(n_threadsSEXP); Rcpp::traits::input_parameter< std::size_t >::type grain_size(grain_sizeSEXP); rcpp_result_gen = Rcpp::wrap(smooth_knn_distances_parallel(nn_dist, nn_ptr, skip_first, target, n_iter, local_connectivity, tol, min_k_dist_scale, ret_sigma, n_threads, grain_size)); return rcpp_result_gen; END_RCPP } // reset_local_metrics_parallel List reset_local_metrics_parallel(IntegerVector indptr, NumericVector probabilities, std::size_t n_iter, double tol, double num_local_metric_neighbors, std::size_t n_threads); RcppExport SEXP _uwot_reset_local_metrics_parallel(SEXP indptrSEXP, SEXP probabilitiesSEXP, SEXP n_iterSEXP, SEXP tolSEXP, SEXP num_local_metric_neighborsSEXP, SEXP n_threadsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< IntegerVector >::type indptr(indptrSEXP); Rcpp::traits::input_parameter< NumericVector >::type probabilities(probabilitiesSEXP); Rcpp::traits::input_parameter< std::size_t >::type n_iter(n_iterSEXP); Rcpp::traits::input_parameter< double >::type tol(tolSEXP); Rcpp::traits::input_parameter< double >::type num_local_metric_neighbors(num_local_metric_neighborsSEXP); Rcpp::traits::input_parameter< std::size_t >::type n_threads(n_threadsSEXP); rcpp_result_gen = Rcpp::wrap(reset_local_metrics_parallel(indptr, probabilities, n_iter, tol, num_local_metric_neighbors, n_threads)); return rcpp_result_gen; END_RCPP } // fast_intersection_cpp NumericVector fast_intersection_cpp(IntegerVector rows, IntegerVector cols, NumericVector values, IntegerVector target, double unknown_dist, double far_dist); RcppExport SEXP _uwot_fast_intersection_cpp(SEXP rowsSEXP, SEXP colsSEXP, SEXP valuesSEXP, SEXP targetSEXP, SEXP unknown_distSEXP, SEXP far_distSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< IntegerVector >::type rows(rowsSEXP); Rcpp::traits::input_parameter< IntegerVector >::type cols(colsSEXP); Rcpp::traits::input_parameter< NumericVector >::type values(valuesSEXP); Rcpp::traits::input_parameter< IntegerVector >::type target(targetSEXP); Rcpp::traits::input_parameter< double >::type 
unknown_dist(unknown_distSEXP); Rcpp::traits::input_parameter< double >::type far_dist(far_distSEXP); rcpp_result_gen = Rcpp::wrap(fast_intersection_cpp(rows, cols, values, target, unknown_dist, far_dist)); return rcpp_result_gen; END_RCPP } // general_sset_intersection_cpp NumericVector general_sset_intersection_cpp(IntegerVector indptr1, IntegerVector indices1, NumericVector data1, IntegerVector indptr2, IntegerVector indices2, NumericVector data2, IntegerVector result_row, IntegerVector result_col, NumericVector result_val, double mix_weight); RcppExport SEXP _uwot_general_sset_intersection_cpp(SEXP indptr1SEXP, SEXP indices1SEXP, SEXP data1SEXP, SEXP indptr2SEXP, SEXP indices2SEXP, SEXP data2SEXP, SEXP result_rowSEXP, SEXP result_colSEXP, SEXP result_valSEXP, SEXP mix_weightSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< IntegerVector >::type indptr1(indptr1SEXP); Rcpp::traits::input_parameter< IntegerVector >::type indices1(indices1SEXP); Rcpp::traits::input_parameter< NumericVector >::type data1(data1SEXP); Rcpp::traits::input_parameter< IntegerVector >::type indptr2(indptr2SEXP); Rcpp::traits::input_parameter< IntegerVector >::type indices2(indices2SEXP); Rcpp::traits::input_parameter< NumericVector >::type data2(data2SEXP); Rcpp::traits::input_parameter< IntegerVector >::type result_row(result_rowSEXP); Rcpp::traits::input_parameter< IntegerVector >::type result_col(result_colSEXP); Rcpp::traits::input_parameter< NumericVector >::type result_val(result_valSEXP); Rcpp::traits::input_parameter< double >::type mix_weight(mix_weightSEXP); rcpp_result_gen = Rcpp::wrap(general_sset_intersection_cpp(indptr1, indices1, data1, indptr2, indices2, data2, result_row, result_col, result_val, mix_weight)); return rcpp_result_gen; END_RCPP } // general_sset_union_cpp NumericVector general_sset_union_cpp(IntegerVector indptr1, IntegerVector indices1, NumericVector data1, IntegerVector indptr2, IntegerVector indices2, NumericVector data2, IntegerVector result_row, IntegerVector result_col, NumericVector result_val); RcppExport SEXP _uwot_general_sset_union_cpp(SEXP indptr1SEXP, SEXP indices1SEXP, SEXP data1SEXP, SEXP indptr2SEXP, SEXP indices2SEXP, SEXP data2SEXP, SEXP result_rowSEXP, SEXP result_colSEXP, SEXP result_valSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< IntegerVector >::type indptr1(indptr1SEXP); Rcpp::traits::input_parameter< IntegerVector >::type indices1(indices1SEXP); Rcpp::traits::input_parameter< NumericVector >::type data1(data1SEXP); Rcpp::traits::input_parameter< IntegerVector >::type indptr2(indptr2SEXP); Rcpp::traits::input_parameter< IntegerVector >::type indices2(indices2SEXP); Rcpp::traits::input_parameter< NumericVector >::type data2(data2SEXP); Rcpp::traits::input_parameter< IntegerVector >::type result_row(result_rowSEXP); Rcpp::traits::input_parameter< IntegerVector >::type result_col(result_colSEXP); Rcpp::traits::input_parameter< NumericVector >::type result_val(result_valSEXP); rcpp_result_gen = Rcpp::wrap(general_sset_union_cpp(indptr1, indices1, data1, indptr2, indices2, data2, result_row, result_col, result_val)); return rcpp_result_gen; END_RCPP } // hardware_concurrency unsigned int hardware_concurrency(); RcppExport SEXP _uwot_hardware_concurrency() { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; rcpp_result_gen = Rcpp::wrap(hardware_concurrency()); return rcpp_result_gen; END_RCPP } // 
init_transform_parallel NumericMatrix init_transform_parallel(NumericMatrix train_embedding, IntegerVector nn_index, std::size_t n_test_vertices, Nullable nn_weights, std::size_t n_threads, std::size_t grain_size); RcppExport SEXP _uwot_init_transform_parallel(SEXP train_embeddingSEXP, SEXP nn_indexSEXP, SEXP n_test_verticesSEXP, SEXP nn_weightsSEXP, SEXP n_threadsSEXP, SEXP grain_sizeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type train_embedding(train_embeddingSEXP); Rcpp::traits::input_parameter< IntegerVector >::type nn_index(nn_indexSEXP); Rcpp::traits::input_parameter< std::size_t >::type n_test_vertices(n_test_verticesSEXP); Rcpp::traits::input_parameter< Nullable >::type nn_weights(nn_weightsSEXP); Rcpp::traits::input_parameter< std::size_t >::type n_threads(n_threadsSEXP); Rcpp::traits::input_parameter< std::size_t >::type grain_size(grain_sizeSEXP); rcpp_result_gen = Rcpp::wrap(init_transform_parallel(train_embedding, nn_index, n_test_vertices, nn_weights, n_threads, grain_size)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { {"_uwot_connected_components_undirected", (DL_FUNC) &_uwot_connected_components_undirected, 5}, {"_uwot_annoy_search_parallel_cpp", (DL_FUNC) &_uwot_annoy_search_parallel_cpp, 7}, {"_uwot_calc_row_probabilities_parallel", (DL_FUNC) &_uwot_calc_row_probabilities_parallel, 8}, {"_uwot_optimize_layout_r", (DL_FUNC) &_uwot_optimize_layout_r, 21}, {"_uwot_smooth_knn_distances_parallel", (DL_FUNC) &_uwot_smooth_knn_distances_parallel, 11}, {"_uwot_reset_local_metrics_parallel", (DL_FUNC) &_uwot_reset_local_metrics_parallel, 6}, {"_uwot_fast_intersection_cpp", (DL_FUNC) &_uwot_fast_intersection_cpp, 6}, {"_uwot_general_sset_intersection_cpp", (DL_FUNC) &_uwot_general_sset_intersection_cpp, 10}, {"_uwot_general_sset_union_cpp", (DL_FUNC) &_uwot_general_sset_union_cpp, 9}, {"_uwot_hardware_concurrency", (DL_FUNC) &_uwot_hardware_concurrency, 0}, {"_uwot_init_transform_parallel", (DL_FUNC) &_uwot_init_transform_parallel, 6}, {NULL, NULL, 0} }; RcppExport void R_init_uwot(DllInfo *dll) { R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); } uwot/src/supervised.cpp0000644000176200001440000001024614733074465014754 0ustar liggesusers// UWOT -- An R package for dimensionality reduction using UMAP // // Copyright (C) 2018 James Melville // // This file is part of UWOT // // UWOT is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // UWOT is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with UWOT. If not, see . 
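//
// supervised.cpp: fuzzy simplicial set combination for supervised UMAP.
// fast_intersection_cpp() reweights graph edges using a target vector y,
// and the general_sset_*_cpp() routines merge two sparse fuzzy sets
// element-wise. The union below is the probabilistic sum
//
//   w = w_left + w_right - w_left * w_right
//
// and the intersection blends the two memberships geometrically under
// mix_weight, with each side floored at half its smallest observed value
// (never below 1e-8).
//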
#include <vector>

#include "uwot/supervised.h"

#include <Rcpp.h>

using namespace Rcpp;

// [[Rcpp::export]]
NumericVector fast_intersection_cpp(IntegerVector rows, IntegerVector cols,
                                    NumericVector values, IntegerVector target,
                                    double unknown_dist = 1.0,
                                    double far_dist = 5.0) {
  auto result = as<std::vector<double>>(values);
  uwot::fast_intersection(
      as<std::vector<int>>(rows), as<std::vector<int>>(cols), result,
      as<std::vector<int>>(target), unknown_dist, far_dist, NA_INTEGER);
  return wrap(result);
}

// [[Rcpp::export]]
NumericVector general_sset_intersection_cpp(
    IntegerVector indptr1, IntegerVector indices1, NumericVector data1,
    IntegerVector indptr2, IntegerVector indices2, NumericVector data2,
    IntegerVector result_row, IntegerVector result_col,
    NumericVector result_val, double mix_weight = 0.5) {

  double left_min = (std::max)(min(data1) / 2.0, 1.0e-8);
  double right_min = (std::max)(min(data2) / 2.0, 1.0e-8);

  for (auto idx = 0; idx < result_row.length(); idx++) {
    auto i = result_col[idx];
    auto j = result_row[idx];

    auto left_end = indices1.begin() + indptr1[i + 1];
    auto left_it =
        std::lower_bound(indices1.begin() + indptr1[i], left_end, j);
    double left_val = (left_it != left_end && *left_it == j
                           ? data1[left_it - indices1.begin()]
                           : left_min);

    auto right_end = indices2.begin() + indptr2[i + 1];
    auto right_it =
        std::lower_bound(indices2.begin() + indptr2[i], right_end, j);
    double right_val = (right_it != right_end && *right_it == j
                            ? data2[right_it - indices2.begin()]
                            : right_min);

    if (left_val > left_min || right_val > right_min) {
      if (mix_weight < 0.5) {
        result_val[idx] =
            left_val * std::pow(right_val, (mix_weight / (1.0 - mix_weight)));
      } else {
        result_val[idx] = right_val *
                          std::pow(left_val, (((1.0 - mix_weight) / mix_weight)));
      }
    }
  }

  return result_val;
}

// [[Rcpp::export]]
NumericVector general_sset_union_cpp(IntegerVector indptr1,
                                     IntegerVector indices1,
                                     NumericVector data1, IntegerVector indptr2,
                                     IntegerVector indices2,
                                     NumericVector data2,
                                     IntegerVector result_row,
                                     IntegerVector result_col,
                                     NumericVector result_val) {
  double left_min = (std::max)(min(data1) / 2.0, 1.0e-8);
  double right_min = (std::max)(min(data2) / 2.0, 1.0e-8);

  for (auto idx = 0; idx < result_row.length(); idx++) {
    auto i = result_col[idx];
    auto j = result_row[idx];

    auto left_end = indices1.begin() + indptr1[i + 1];
    auto left_it =
        std::lower_bound(indices1.begin() + indptr1[i], left_end, j);
    double left_val = (left_it != left_end && *left_it == j
                           ? data1[left_it - indices1.begin()]
                           : left_min);

    auto right_end = indices2.begin() + indptr2[i + 1];
    auto right_it =
        std::lower_bound(indices2.begin() + indptr2[i], right_end, j);
    double right_val = (right_it != right_end && *right_it == j
                            ? data2[right_it - indices2.begin()]
                            : right_min);

    result_val[idx] = left_val + right_val - left_val * right_val;
  }

  return result_val;
}
uwot/src/nn_parallel.cpp0000644000176200001440000000402714733074465015052 0ustar liggesusers#include <string>
#include <vector>

#include "RcppPerpendicular.h"
#include "nn_parallel.h"

using namespace Rcpp;

template <typename UwotAnnoyDistance>
auto annoy_nns_impl(const std::string &index_name, NumericMatrix mat,
                    std::size_t n_neighbors, std::size_t search_k,
                    std::size_t n_threads = 0, std::size_t grain_size = 1)
    -> List {
  std::size_t nrow = mat.rows();
  std::size_t ncol = mat.cols();

  std::vector<typename UwotAnnoyDistance::T> vmat =
      as<std::vector<typename UwotAnnoyDistance::T>>(mat);

  NNWorker<UwotAnnoyDistance> worker(index_name, vmat, ncol, n_neighbors,
                                     search_k);
  RcppPerpendicular::parallel_for(0, nrow, worker, n_threads, grain_size);

  return List::create(
      _("item") = IntegerMatrix(nrow, n_neighbors, worker.idx.begin()),
      _("distance") = NumericMatrix(nrow, n_neighbors, worker.dists.begin()));
}

// [[Rcpp::export]]
List annoy_search_parallel_cpp(const std::string &index_name,
                               NumericMatrix mat, std::size_t n_neighbors,
                               std::size_t search_k, const std::string &metric,
                               std::size_t n_threads = 0,
                               std::size_t grain_size = 1) {
  if (metric == "euclidean") {
    return annoy_nns_impl<UwotAnnoyEuclidean>(index_name, mat, n_neighbors,
                                              search_k, n_threads, grain_size);
  } else if (metric == "cosine") {
    return annoy_nns_impl<UwotAnnoyCosine>(index_name, mat, n_neighbors,
                                           search_k, n_threads, grain_size);
  } else if (metric == "manhattan") {
    return annoy_nns_impl<UwotAnnoyManhattan>(index_name, mat, n_neighbors,
                                              search_k, n_threads, grain_size);
  } else if (metric == "hamming") {
    return annoy_nns_impl<UwotAnnoyHamming>(index_name, mat, n_neighbors,
                                            search_k, n_threads, grain_size);
  } else {
    stop("Unknown metric '", metric, "'");
  }
}
uwot/src/rparallel.h0000644000176200001440000000317014577210515014176 0ustar liggesusers// UWOT -- An R package for dimensionality reduction using UMAP
//
// Copyright (C) 2021 James Melville
//
// This file is part of UWOT
//
// UWOT is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// UWOT is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with UWOT. If not, see <http://www.gnu.org/licenses/>.

#ifndef UWOT_RPARALLEL_H
#define UWOT_RPARALLEL_H

#include "RcppPerpendicular.h"

struct RParallel {
  std::size_t n_threads;
  std::size_t grain_size;

  RParallel(std::size_t n_threads, std::size_t grain_size)
      : n_threads(n_threads), grain_size(grain_size) {}

  template <typename Worker>
  void pfor(std::size_t n_items, Worker &worker) {
    RcppPerpendicular::pfor(n_items, worker, n_threads, grain_size);
  }

  template <typename Worker>
  void pfor(std::size_t begin, std::size_t end, Worker &worker) {
    RcppPerpendicular::pfor(begin, end, worker, n_threads, grain_size);
  }
};

struct RSerial {
  template <typename Worker>
  void pfor(std::size_t n_items, Worker &worker) {
    pfor(0, n_items, worker);
  }

  template <typename Worker>
  void pfor(std::size_t begin, std::size_t end, Worker &worker) {
    worker(begin, end, 0);
  }
};

#endif // UWOT_RPARALLEL_H
uwot/src/nn_parallel.h0000644000176200001440000000745614733074465014520 0ustar liggesusers// UWOT -- An R package for dimensionality reduction using UMAP
//
// Copyright (C) 2020 James Melville
//
// This file is part of UWOT
//
// UWOT is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// UWOT is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with UWOT. If not, see <https://www.gnu.org/licenses/>.

#include <vector>

#include "RcppAnnoy.h"

#if ANNOY_VERSION >= Annoy_Version(1, 17, 3)
typedef Annoy::AnnoyIndexSingleThreadedBuildPolicy
    AnnoyIndexThreadedBuildPolicy;

struct UwotAnnoyEuclidean {
  using Distance = Annoy::Euclidean;
  using S = int32_t;
  using T = float;
};

struct UwotAnnoyCosine {
  using Distance = Annoy::Angular;
  using S = int32_t;
  using T = float;
};

struct UwotAnnoyManhattan {
  using Distance = Annoy::Manhattan;
  using S = int32_t;
  using T = float;
};

struct UwotAnnoyHamming {
  using Distance = Annoy::Hamming;
  using S = int32_t;
  using T = uint64_t;
};
#else
typedef AnnoyIndexSingleThreadedBuildPolicy AnnoyIndexThreadedBuildPolicy;

struct UwotAnnoyEuclidean {
  using Distance = Euclidean;
  using S = int32_t;
  using T = float;
};

struct UwotAnnoyCosine {
  using Distance = Angular;
  using S = int32_t;
  using T = float;
};

struct UwotAnnoyManhattan {
  using Distance = Manhattan;
  using S = int32_t;
  using T = float;
};

struct UwotAnnoyHamming {
  using Distance = Hamming;
  using S = int32_t;
  using T = uint64_t;
};
#endif // of 'if ANNOY_VERSION >= Annoy_Version(1,17,3)'

template <typename UwotAnnoyDistance>
struct NNWorker {
  const std::string &index_name;
  const std::vector<typename UwotAnnoyDistance::T> &mat;
  std::size_t nrow;
  std::size_t ncol;
  std::size_t n_neighbors;
  std::size_t search_k;
  std::vector<typename UwotAnnoyDistance::S> idx;
  std::vector<double> dists;

#if ANNOY_VERSION >= Annoy_Version(1, 17, 3)
  Annoy::AnnoyIndex<typename UwotAnnoyDistance::S,
                    typename UwotAnnoyDistance::T,
                    typename UwotAnnoyDistance::Distance, Kiss64Random,
                    AnnoyIndexThreadedBuildPolicy>
      index;
#else
  AnnoyIndex<typename UwotAnnoyDistance::S, typename UwotAnnoyDistance::T,
             typename UwotAnnoyDistance::Distance, Kiss64Random,
             AnnoyIndexThreadedBuildPolicy>
      index;
#endif

  NNWorker(const std::string &index_name,
           const std::vector<typename UwotAnnoyDistance::T> &mat,
           std::size_t ncol, std::size_t n_neighbors, std::size_t search_k)
      : index_name(index_name), mat(mat), nrow(mat.size() / ncol), ncol(ncol),
        n_neighbors(n_neighbors), search_k(search_k),
        idx(nrow * n_neighbors, -1), dists(nrow * n_neighbors), index(ncol) {
    index.load(index_name.c_str());
  }

  ~NNWorker() { index.unload(); }

  void operator()(std::size_t begin, std::size_t end) {
    for (auto i = begin; i < end; i++) {
      // the input matrix is column-major: gather row i into a contiguous
      // vector for the Annoy query
      std::vector<typename UwotAnnoyDistance::T> fv(ncol);
      for (std::size_t j = 0; j < ncol; j++) {
        fv[j] = mat[i + j * nrow];
      }

      std::vector<typename UwotAnnoyDistance::S> result;
      std::vector<typename UwotAnnoyDistance::T> distances;

      index.get_nns_by_vector(fv.data(), n_neighbors, search_k, &result,
                              &distances);
      if (result.size() != n_neighbors || distances.size() != n_neighbors) {
        break;
      }

      for (std::size_t j = 0; j < n_neighbors; j++) {
        dists[i + j * nrow] = distances[j];
        idx[i + j * nrow] = result[j];
      }
    }
  }
};
uwot/src/connected_components.cpp0000644000176200001440000000274414577210515016770 0ustar liggesusers
// UWOT -- An R package for dimensionality reduction using UMAP
//
// Copyright (C) 2018 James Melville
//
// This file is part of UWOT
//
// UWOT is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// UWOT is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with UWOT. If not, see <https://www.gnu.org/licenses/>.
#include #include "uwot/connected_components.h" using namespace Rcpp; // [[Rcpp::export]] List connected_components_undirected(std::size_t N, const std::vector &indices1, const std::vector &indptr1, const std::vector &indices2, const std::vector &indptr2) { std::pair> result = uwot::connected_components_undirected(N, indices1, indptr1, indices2, indptr2); return List::create(_["n_components"] = result.first, _["labels"] = result.second); } uwot/src/transform.cpp0000644000176200001440000000515714730166740014576 0ustar liggesusers// UWOT -- An R package for dimensionality reduction using UMAP // // Copyright (C) 2018 James Melville // // This file is part of UWOT // // UWOT is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // UWOT is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with UWOT. If not, see . #include #include #include "RcppPerpendicular.h" #include "uwot/transform.h" using namespace Rcpp; // Initialize embedding as a weighted average of nearest neighbors of each point // train_embedding: dim x n_train matrix of final embedding coordinates // nn_index: n_nbrs x n_test matrix of indexes of neighbors in X_train that are // nearest neighbors of X_test // weights: n_nbrs x n_test weight matrix // Returns the dim x n_test matrix of initialized coordinates. // [[Rcpp::export]] NumericMatrix init_transform_parallel(NumericMatrix train_embedding, IntegerVector nn_index, std::size_t n_test_vertices, Nullable nn_weights, std::size_t n_threads = 0, std::size_t grain_size = 1) { std::size_t n_train_vertices = train_embedding.ncol(); std::size_t ndim = train_embedding.nrow(); std::size_t n_neighbors = nn_index.size() / n_test_vertices; auto train_embeddingv = as>(train_embedding); auto nn_indexv = as>(nn_index); // Convert to zero-indexing for (int &i : nn_indexv) { --i; } std::vector embedding(n_test_vertices * ndim); std::vector nn_weightsv(0); if (nn_weights.isNotNull()) { nn_weightsv = as>(nn_weights); } auto worker = [&](std::size_t begin, std::size_t end) { uwot::init_by_mean(begin, end, ndim, n_neighbors, nn_indexv, nn_weightsv, n_test_vertices, train_embeddingv, n_train_vertices, embedding); }; RcppPerpendicular::parallel_for(n_test_vertices, worker, n_threads, grain_size); return NumericMatrix(ndim, n_test_vertices, embedding.begin()); } uwot/src/Makevars0000644000176200001440000000010014730166740013532 0ustar liggesusersPKG_CXXFLAGS = -DRCPP_NO_RTTI PKG_CPPFLAGS = -I../inst/include/ uwot/src/rprogress.h0000644000176200001440000000242714577210515014252 0ustar liggesusers// UWOT -- An R package for dimensionality reduction using UMAP // // Copyright (C) 2021 James Melville // // This file is part of UWOT // // UWOT is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // UWOT is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with UWOT. If not, see <https://www.gnu.org/licenses/>.

#ifndef UWOT_RPROGRESS_H
#define UWOT_RPROGRESS_H

#include <Rcpp.h>

#include <progress.hpp>

struct RProgress {
  Progress progress;
  bool verbose;

  RProgress(std::size_t n_epochs, bool verbose)
      : progress(n_epochs, verbose), verbose(verbose) {}

  auto is_aborted() -> bool {
    bool aborted = Progress::check_abort();
    if (aborted) {
      progress.cleanup();
    }
    return aborted;
  }

  void report() {
    if (verbose) {
      progress.increment();
    }
  }
};

#endif // UWOT_RPROGRESS_H
uwot/src/perplexity.cpp0000644000176200001440000000424614730166740014766 0ustar liggesusers
// UWOT -- An R package for dimensionality reduction using UMAP
//
// Copyright (C) 2018 James Melville
//
// This file is part of UWOT
//
// UWOT is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// UWOT is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with UWOT. If not, see <https://www.gnu.org/licenses/>.

#include <Rcpp.h>
#include <atomic>

#include "RcppPerpendicular.h"
#include "uwot/perplexity.h"

using namespace Rcpp;

// [[Rcpp::export]]
List calc_row_probabilities_parallel(NumericVector nn_dist,
                                     std::size_t n_vertices,
                                     double perplexity,
                                     std::size_t n_iter = 200,
                                     double tol = 1e-5,
                                     bool ret_sigma = false,
                                     std::size_t n_threads = 0,
                                     std::size_t grain_size = 1) {
  std::size_t n_neighbors = nn_dist.size() / n_vertices;
  auto nn_distv = as<std::vector<double>>(nn_dist);
  double target = std::log(perplexity);

  std::atomic_size_t n_search_fails{0};
  std::vector<double> nn_weights(n_vertices * n_neighbors);
  std::vector<double> sigmas(ret_sigma ? n_vertices : 0);

  auto worker = [&](std::size_t begin, std::size_t end) {
    uwot::perplexity_search(begin, end, nn_distv, n_neighbors, target, tol,
                            n_iter, nn_weights, ret_sigma, sigmas,
                            n_search_fails);
  };
  RcppPerpendicular::parallel_for(0, n_vertices, worker, n_threads,
                                  grain_size);

  auto res = List::create(
      _("matrix") = NumericMatrix(n_neighbors, n_vertices, nn_weights.begin()),
      _("n_failures") = static_cast<int>(n_search_fails));
  if (ret_sigma) {
    res["sigma"] = sigmas;
  }
  return res;
}
uwot/src/smooth_knn.cpp0000644000176200001440000000771314733074465014737 0ustar liggesusers
// UWOT -- An R package for dimensionality reduction using UMAP
//
// Copyright (C) 2018 James Melville
//
// This file is part of UWOT
//
// UWOT is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// UWOT is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with UWOT. If not, see <https://www.gnu.org/licenses/>.
#include #include #include "uwot/smooth_knn.h" #include using namespace Rcpp; // [[Rcpp::export]] List smooth_knn_distances_parallel( NumericVector nn_dist, IntegerVector nn_ptr, bool skip_first, NumericVector target, std::size_t n_iter = 64, double local_connectivity = 1.0, double tol = 1e-5, double min_k_dist_scale = 1e-3, bool ret_sigma = false, std::size_t n_threads = 0, std::size_t grain_size = 1) { std::size_t n_neighbors = 0; std::size_t n_vertices = 0; std::vector nn_ptrv(0); if (nn_ptr.size() == 0) { stop("nn_ptr cannot be empty"); } if (nn_ptr.size() == 1) { // Size optimization for the typical kNN graph case: // all points have the same number of neighbors so just store that number // as the single entry in the nn_ptr vector n_neighbors = nn_ptr[0]; if (nn_dist.size() % n_neighbors != 0) { stop("Invalid n_neighbors for nn_dist size"); } nn_ptrv = std::vector{n_neighbors}; n_vertices = nn_dist.size() / n_neighbors; } else { nn_ptrv = as>(nn_ptr); n_vertices = nn_ptrv.size() - 1; } auto targetv = as>(target); auto nn_distv = as>(nn_dist); double mean_distances = uwot::mean_average(nn_distv); std::atomic_size_t n_search_fails{0}; std::vector nn_weights(nn_dist.size()); std::vector sigmas(ret_sigma ? n_vertices : 0); std::vector rhos(ret_sigma ? n_vertices : 0); auto worker = [&](std::size_t begin, std::size_t end) { uwot::smooth_knn(begin, end, nn_distv, nn_ptrv, skip_first, targetv, local_connectivity, tol, n_iter, min_k_dist_scale, mean_distances, ret_sigma, nn_weights, sigmas, rhos, n_search_fails); }; RcppPerpendicular::parallel_for(n_vertices, worker, n_threads, grain_size); auto res = List::create( _("matrix") = NumericVector(nn_weights.begin(), nn_weights.end()), _("n_failures") = static_cast(n_search_fails)); if (ret_sigma) { res["sigma"] = sigmas; res["rho"] = rhos; } return res; } // [[Rcpp::export]] List reset_local_metrics_parallel(IntegerVector indptr, NumericVector probabilities, std::size_t n_iter = 32, double tol = 1e-5, double num_local_metric_neighbors = 15.0, std::size_t n_threads = 0) { auto n_vertices = indptr.size() - 1; double target = std::log2(num_local_metric_neighbors); std::atomic_size_t n_search_fails{0}; auto prob_ptrv = as>(indptr); auto probabilitiesv = as>(probabilities); auto worker = [&](std::size_t begin, std::size_t end) { uwot::reset_local_metric(begin, end, probabilitiesv, prob_ptrv, target, tol, n_iter, n_search_fails); }; RcppPerpendicular::parallel_for(n_vertices, worker, n_threads); auto res = List::create( _("values") = NumericVector(probabilitiesv.begin(), probabilitiesv.end()), _("n_failures") = static_cast(n_search_fails)); return res; } uwot/src/rng.h0000644000176200001440000000714614733074465013023 0ustar liggesusers// UWOT -- An R package for dimensionality reduction using UMAP // // Copyright (C) 2018 James Melville // // This file is part of UWOT // // UWOT is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // UWOT is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with UWOT. If not, see . 
#ifndef UWOT_RNG_H
#define UWOT_RNG_H

#include <limits>

// linked from dqrng
#include "convert_seed.h"
#include "pcg_random.hpp"

#include "uwot/tauprng.h"

// NOT THREAD SAFE
// based on code in the dqsample package
static uint64_t random64() {
  return static_cast<uint64_t>(
      R::runif(0, 1) *
      static_cast<double>((std::numeric_limits<uint64_t>::max)()));
}

// NOT THREAD SAFE
static uint32_t random32() {
  return static_cast<uint32_t>(
      R::runif(0, 1) *
      static_cast<double>((std::numeric_limits<uint32_t>::max)()));
}

struct batch_tau_factory {
  std::size_t n_rngs;
  std::vector<uint64_t> seeds;
  static const constexpr std::size_t seeds_per_rng = 3;

  batch_tau_factory() : n_rngs(1), seeds(seeds_per_rng * n_rngs) {}
  batch_tau_factory(std::size_t n_rngs)
      : n_rngs(n_rngs), seeds(seeds_per_rng * n_rngs) {}

  void reseed() {
    for (std::size_t i = 0; i < seeds.size(); i++) {
      seeds[i] = random64();
    }
  }

  uwot::tau_prng create(std::size_t n) {
    const std::size_t idx = n * seeds_per_rng;
    return uwot::tau_prng(seeds[idx], seeds[idx + 1], seeds[idx + 2]);
  }
};

struct pcg_prng {
  pcg32 gen;

  pcg_prng(uint64_t seed) { gen.seed(seed); }

  // return a value in (0, n]
  inline std::size_t operator()(std::size_t n) {
    std::size_t result = gen(n);
    return result;
  }
};

struct batch_pcg_factory {
  std::size_t n_rngs;
  std::vector<uint32_t> seeds;
  static const constexpr std::size_t seeds_per_rng = 2;

  batch_pcg_factory() : n_rngs(1), seeds(seeds_per_rng * n_rngs) {}
  batch_pcg_factory(std::size_t n_rngs)
      : n_rngs(n_rngs), seeds(seeds_per_rng * n_rngs) {}

  void reseed() {
    for (std::size_t i = 0; i < seeds.size(); i++) {
      seeds[i] = random32();
    }
  }

  pcg_prng create(std::size_t n) {
    uint32_t pcg_seeds[2] = {seeds[n * seeds_per_rng],
                             seeds[n * seeds_per_rng + 1]};
    return pcg_prng(dqrng::convert_seed<uint64_t>(pcg_seeds, 2));
  }
};

// For backwards compatibility in non-batch mode
struct tau_factory {
  uint64_t seed1;
  uint64_t seed2;
  tau_factory(std::size_t) : seed1(0), seed2(0) {
    seed1 = random64();
    seed2 = random64();
  }

  void reseed() {
    seed1 = random64();
    seed2 = random64();
  }

  uwot::tau_prng create(std::size_t seed) {
    return uwot::tau_prng(seed1, seed2, uint64_t{seed});
  }
};

struct pcg_factory {
  uint32_t seed1;
  pcg_factory(std::size_t) : seed1(random32()) {}

  void reseed() { seed1 = random32(); }

  pcg_prng create(std::size_t seed) {
    uint32_t seeds[2] = {seed1, static_cast<uint32_t>(seed)};
    return pcg_prng(dqrng::convert_seed<uint64_t>(seeds, 2));
  }
};

struct deterministic_factory {
  deterministic_factory(std::size_t) {}

  void reseed() {}

  uwot::deterministic_ng create(std::size_t seed) {
    return uwot::deterministic_ng();
  }
};

#endif // UWOT_RNG_H
uwot/NAMESPACE0000644000176200001440000000064114730166740012500 0ustar liggesusers
# Generated by roxygen2: do not edit by hand

export(load_uwot)
export(lvish)
export(optimize_graph_layout)
export(save_uwot)
export(similarity_graph)
export(simplicial_set_intersect)
export(simplicial_set_union)
export(tumap)
export(umap)
export(umap2)
export(umap_transform)
export(unload_uwot)
import(Matrix)
importFrom(Rcpp,sourceCpp)
importFrom(methods,new)
useDynLib(uwot, .registration=TRUE)
uwot/NEWS.md0000644000176200001440000013160614757003506012364 0ustar liggesusers
# uwot 0.2.3 ## New features: * New parameter: `rng_type`. This will be used in favor of the boolean `pcg_rand` parameter, although `pcg_rand` will still work for backwards compatibility. * New negative sampling option: set `rng_type = "deterministic"` to use a deterministic sampling of vertices during the optimization phase.
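For example, a minimal sketch (using the numeric columns of `iris` as stand-in data; any numeric matrix works):

```r
library(uwot)
# deterministic negative sampling during the optimization phase
emb <- umap(as.matrix(iris[, 1:4]), rng_type = "deterministic")
```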
This should give qualitatively similar results to using a real PRNG, but has the advantage of being faster and giving more reproducible output. This feature was inspired by a comment by [Leland McInnes on Reddit](https://www.reddit.com/r/MachineLearning/comments/1gsjfq9/comment/lxip9wy/). ## Bug fixes and minor improvements * Setting `num_threads` directly in `umap2` did not result in the number of SGD threads being updated to that value when `batch = TRUE`, which it should have been. * Despite assertions to the contrary in version 0.2.1, `umap_transform` continued to return the fuzzy graph in transposed form. Thank you [PedroMilanezAlmeida](https://github.com/PedroMilanezAlmeida) for reopening the issue (). * Relative paths could not be used to save a model. Thank you [Wouter van der Bijl](https://github.com/Ax3man) for the bug report () and the suggested fix. * `repulsion_strength` was silently ignored if used with `tumap` or `umap2` with `a = 1, b = 1`. Ignoring the setting was on purpose, but it was not documented anywhere. `repulsion_strength` is now compatible with these settings. * It's no longer an error to provide a `pca` argument if the input data has a maximum rank smaller than the value of `pca`. No PCA is applied in this case. If `verbose = TRUE`, a message will be printed to inform the user. # uwot 0.2.2 ## Bug fixes and minor improvements * `RSpectra` is now a required dependency (again). It was a required dependency up until version 0.1.12, when it became optional (`irlba` was used in its place). However, problems with interactions of the current version of `irlba` with an ABI change in the `Matrix` package mean that it's hard for downstream packages and users to build `uwot` without re-installing `Matrix` and `irlba` from source, which may not be an option for some people. Also it was causing a CRAN check error. I have changed some tests, examples and vignettes to use `RSpectra` explicitly, and to only test `irlba` code-paths where necessary. See and links therein for more details. # uwot 0.2.1 ## New features: * The [HNSW](https://github.com/nmslib/hnswlib) approximate nearest neighbor search algorithm is now supported via the [RcppHNSW](https://cran.r-project.org/package=RcppHNSW) package. Set `nn_method = "hnsw"` to use it. The behavior of the method can be controlled by the new `nn_args` parameter, a list which may contain `M`, `ef_construction` and `ef`. See the hnswlib library's [ALGO_PARAMS documentation](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md) for details on these parameters. Although typically faster than Annoy (for a given accuracy), be aware that the only supported `metric` values are `"euclidean"`, `"cosine"` and `"correlation"`. Finally, RcppHNSW is only a suggested package, not a requirement, so you need to install it yourself (e.g. via `install.packages("RcppHNSW")`). Also see the [article on HNSW in uwot](https://jlmelville.github.io/uwot/articles/hnsw-umap.html) in the documentation. * The nearest neighbor descent approximate nearest neighbor search algorithm is now supported via the [rnndescent](https://cran.r-project.org/package=rnndescent) package. Set `nn_method = "nndescent"` to use it. The behavior of the method can be controlled by the new `nn_args` parameter.
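As a rough sketch of how `nn_args` is passed (assuming a numeric matrix `X`; the values shown are purely illustrative, not recommendations), here using the HNSW backend described above:

```r
library(uwot)
emb <- umap(X, nn_method = "hnsw",
            nn_args = list(M = 16, ef_construction = 200, ef = 20))
```

The nearest neighbor descent backend is configured the same way via its own `nn_args` entries.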
There are many supported metrics and possible parameters that can be set in `nn_args`, so please see the [article on nearest neighbor descent in uwot](https://jlmelville.github.io/uwot/articles/rnndescent-umap.html) in the documentation, and also the rnndescent package's [documentation](https://jlmelville.github.io/rnndescent/index.html) for details. `rnndescent` is only a suggested package, not a requirement, so you need to install it yourself (e.g. via `install.packages("rnndescent")`). * New function: `umap2`, which acts like `umap` but with modified defaults, reflecting my experience with UMAP and correcting some small mistakes. See the [umap2 article](https://jlmelville.github.io/uwot/articles/umap2.html) for more details. ## Bug fixes and minor improvements * `init_sdev = "range"` caused an error with a user-supplied `init` matrix. * Transforming new data with the `correlation` metric was actually using the `cosine` metric if you saved and reloaded the model. Thank you [Holly Hall](https://github.com/mdrnao) for the report and helpful detective work (). * `umap_transform` could fail if the new data to be transformed had the `scaled:center` and `scaled:scale` attributes set (e.g. from applying the `scale` function). * If you asked `umap_transform` to return the fuzzy graph (`ret_extra = c("fgraph")`), it was transposed when `batch = TRUE, n_epochs = 0`. Thank you [PedroMilanezAlmeida](https://github.com/PedroMilanezAlmeida) for reporting (). * Setting `n_sgd_threads = "auto"` with `umap_transform` caused a crash. * A warning was being emitted due to not being specific enough about which `dist` class was meant, which may have been particularly affecting Seurat users. Thank you [AndiMunteanu](https://github.com/AndiMunteanu) for reporting (and suggesting a solution) (). # uwot 0.1.16 ## Bug fixes and minor improvements * A small change to a header file was required to fully support the next version of [RcppAnnoy](https://cran.r-project.org/package=RcppAnnoy). Thank you [Dirk Eddelbuettel](https://github.com/eddelbuettel) for the PR (). # uwot 0.1.15 ## New features: * New function: `optimize_graph_layout`. Use this to produce optimized output coordinates that reflect an input similarity graph (such as that produced by the `similarity_graph` function). `similarity_graph` followed by `optimize_graph_layout` is the same as running `umap`, so the purpose of these functions is to allow for more flexibility and decoupling between generating the nearest neighbor graph and optimizing the low-dimensional approximation to it. Based on a request by user [Chengwei94](https://github.com/Chengwei94) (). * New functions: `simplicial_set_union` and `simplicial_set_intersect`. These allow for the combination of different fuzzy graph representations of a dataset into a single fuzzy graph using the UMAP simplicial set operations. Based on a request in the Python UMAP issues tracker by user [Dhar xion](https://github.com/ratheraarif). * New parameter for `umap_transform`: `ret_extra`. This works like the equivalent parameter for `umap`, and should be a character vector specifying the extra information you would like returned in addition to the embedding, in which case a list will be returned with an `embedding` member containing the optimized coordinates. Supported values are `"fgraph"`, `"nn"`, `"sigma"` and `"localr"`. Based on a request by user [PedroMilanezAlmeida](https://github.com/PedroMilanezAlmeida) (). * New parameter from `umap`, `tumap` and `umap_transform`: `seed`.
This will do the equivalent of calling `set.seed` internally, and hence will help with reproducibility. The chosen seed is exported if `ret_model = TRUE` and `umap_transform` will use that seed if present, so you only need to specify it in `umap_transform` if you want to change the seed. The default behavior remains to not modify the random number state. Based on a request by [SuhasSrinivasan](https://github.com/SuhasSrinivasan) (). ## Bug fixes and minor improvements * A new setting for `init_sdev`: set `init_sdev = "range"` and initial coordinates will be range-scaled so each column takes values between 0-10. This pre-processing was added to the Python UMAP package at some point after `uwot` began development and so should probably always be used with the default `init = "spectral"` setting. However, it is not set by default to maintain backwards compatibility with older versions of `uwot`. * `ret_extra = c("sigma")` is now supported by `lvish`. The Gaussian bandwidths are returned in a `sigma` vector. In addition, a vector of intrinsic dimensionalities estimated for each point using an analytical expression of the finite difference method given by [Lee and co-workers](https://doi.org/10.1016/j.neucom.2014.12.095) is returned in the `dint` vector. * The `min_dist` and `spread` parameters are now returned in the model when `umap` is run with `ret_model = TRUE`. This is just for documentation purposes, these values are not used directly by the model in `umap_transform`. If the parameters `a` and `b` are set directly when invoking `umap`, then both `min_dist` and `spread` will be set to `NULL` in the returned model. This feature was added in response to a question from [kjiang18](https://github.com/kjiang18) (). * Some new checks for NA values in input data have been added. Also a warning will be emitted if `n_components` seems to have been set too high. * If `n_components` was greater than `n_neighbors` then `umap_transform` would crash the R session. Thank you to [ChVav](https://github.com/ChVav) for reporting this (). * Using `umap_transform` with a model where `dens_scale` was set could cause a segmentation fault, destroying the session. Even if it didn't it could give an entirely artifactual "ring" structure. Thank you [FemkeSmit](https://github.com/FemkeSmit) for reporting this and providing assistance in diagnosing the underlying cause (). * If you set `binary_edge_weights = TRUE`, this setting was not exported when `ret_model = TRUE`, and was therefore not respected by `umap_transform`. This has now been fixed, but you will need to regenerate any models that used binary edge weights. * The rdoc for the `init` param said that if there were multiple disconnected components, a spectral initialization would attempt to merge multiple sub-graphs. Not true: actually, spectral initialization is abandoned in favor of PCA. The documentation has been updated to reflect the true state of affairs. No idea what I was thinking of there. * `load_model` and `save_model` didn't work on Windows 7 due to how the version of `tar` there handles drive letters. Thank you [mytarmail](https://github.com/mytarmail) for the report (). * Warn if the initial coordinates have a very large scale (a standard deviation > 10.0), because this can lead to small gradients and poor optimization. Thank you [SuhasSrinivasan](https://github.com/SuhasSrinivasan) for the report (). * A change to accommodate a forthcoming version of [RcppAnnoy](https://cran.r-project.org/package=RcppAnnoy). 
Thank you [Dirk Eddelbuettel](https://github.com/eddelbuettel) for the PR (). # uwot 0.1.14 ## New features * New function: `similarity_graph`. If you are more interested in the high-dimensional graph/fuzzy simplicial set representation of your input data, and don't care about the low dimensional approximation, the `similarity_graph` function offers a similar API to `umap`, but neither the initialization nor optimization of low-dimensional coordinates will be performed. The return value is the same as that which would be returned in the results list as the `fgraph` member if you had provided `ret_extra = c("fgraph")`. Compared to getting the same result via running `umap`, this function is a bit more convenient to use, makes your intention clearer if you would be discarding the embedding, and saves a small amount of time. A t-SNE/LargeVis similarity graph can be returned by setting `method = "largevis"`. ## Bug fixes and minor improvements * If a model was generated without using pre-generated nearest neighbors, you couldn't use `umap_transform` with pre-generated nearest neighbors (also the error message was completely useless). Thank you to [AustinHartman](https://github.com/AustinHartman) for reporting this (). # uwot 0.1.13 * This is a resubmission of 0.1.12 but with an internal function (`fuzzy_simplicial_set`) refactored to behave more like that of previous versions. This change was breaking the behavior of the CRAN package [bbknnR](https://cran.r-project.org/package=bbknnR). # uwot 0.1.12 ## New features * New parameter: `dens_weight`. If set to a value between 0 and 1, an attempt is made to include the relative local densities of the input data in the output coordinates. This is an approximation to the [densMAP](https://doi.org/10.1038/s41587-020-00801-7) method. A large value of `dens_weight` will use a larger range of output densities to reflect the input data. If the data is too spread out, reduce the value of `dens_weight`. For more information see the [documentation at the uwot repo](https://jlmelville.github.io/uwot/articles/leopold.html). * New parameter: `binary_edge_weights`. If set to `TRUE`, instead of smoothed knn distances, non-zero edge weights all have a value of 1. This is how [PaCMAP](https://www.jmlr.org/papers/v22/20-1061.html) works and there are [practical](https://arxiv.org/abs/2007.08902) and [theoretical](https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html) reasons to believe this won't have a big effect on UMAP but you can try it yourself. * New options for `ret_extra`: * `"sigma"`: the return value will contain a `sigma` entry, a vector of the smooth knn distance scaling normalization factors, one for each observation in the input data. A small value indicates a high density of points in the local neighborhood of that observation. For `lvish` the equivalent bandwidths calculated for the input perplexity are returned. * also, a vector `rho` will be exported, which is the distance to the nearest neighbor after the number of neighbors specified by the `local_connectivity`. Only applies for `umap` and `tumap`. * `"localr"`: exports a vector of the local radii, the sum of `sigma` and `rho` and used to scale the output coordinates when `dens_weight` is set. Even if not using `dens_weight`, visualizing the output coordinates using a color scale based on the value of `localr` can reveal regions of the input data with different densities.
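As a small sketch of using these extra return values for density diagnostics (assuming a numeric matrix `X`):

```r
res <- umap(X, ret_extra = c("sigma", "localr"))
# color the embedding by log local radius to highlight density differences
plot(res$embedding, pch = 20,
     col = grDevices::hcl.colors(100)[cut(log(res$localr), 100)])
```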
* For functions `umap` and `tumap` only: new data type for precomputed nearest neighbor data passed as the `nn_method` parameter: you may use a sparse distance matrix of format `dgCMatrix` with dimensions `N x N` where `N` is the number of observations in the input data. Distances should be arranged by column, i.e. a non-zero entry in row `j` of the `i`th column indicates that the `j`th observation in the input data is a nearest neighbor of the `i`th observation with the distance given by the value of that element. Note that this is a different format to the sparse distance matrix that can be passed as input to `X`: notably, the matrix is not assumed to be symmetric. Unlike other input formats, you may have a different number of neighbors for each observation (but there must be at least one neighbor defined per observation). * `umap_transform` can also take a sparse distance matrix as its `nn_method` parameter if precomputed nearest neighbor data is used to generate an initial model. The format is the same as for the `nn_method` with `umap`. Because distances are arranged by columns, the expected dimensions of the sparse matrix are `N_model x N_new` where `N_model` is the number of observations in the original data and `N_new` is the number of observations in the data to be transformed. ## Bug fixes and minor improvements * Models couldn't be re-saved after loading. Thank you to [ilyakorsunsky](https://github.com/ilyakorsunsky) for reporting this (). * [RSpectra](https://cran.r-project.org/package=RSpectra) is now a 'Suggests', rather than an 'Imports'. If you have RSpectra installed, it will be used automatically where previous versions required it (for spectral initialization). Otherwise, [irlba](https://cran.r-project.org/package=irlba) will be used. For two-dimensional output, you are unlikely to notice much difference in speed or accuracy with real-world data. For highly-structured simulation datasets (e.g. spectral initialization of a 1D line) then RSpectra will give much better, faster initializations, but these are not the typical use cases envisaged for this package. For embedding into higher dimensions (e.g. `n_components = 100` or higher), RSpectra is recommended and will likely out-perform irlba even if you have installed a good linear algebra library. * `init = "laplacian"` returned the wrong coordinates because of a slightly subtle issue around how to order the eigenvectors when using the random walk transition matrix rather than normalized graph laplacians. * The `init_sdev` parameter was ignored when the `init` parameter was a user-supplied matrix. Now the input will be scaled. * Matrix input was being converted to and from a data frame during pre-processing, causing R to allocate memory that it was disinclined to ever give up even after the function exited. This unnecessary manipulation is now avoided. * The behavior of the `bandwidth` parameter has been changed to give results more like the current version (0.5.2) of the Python UMAP implementation. This is likely to be a breaking change for non-default settings of `bandwidth`, but this is not a parameter which is actually exposed by the Python UMAP public API any more, so is on the road to deprecation in uwot too and I don't recommend you change this. * Transforming data with multiple blocks would give an error if the number of rows of the new data did not equal the number of rows in the original data. # uwot 0.1.11 ## New features * New parameter: `batch`.
If `TRUE`, then results are reproducible when `n_sgd_threads > 1` (as long as you use `set.seed`). The price to be paid is that the optimization is slightly less efficient (because coordinates are not updated as quickly and hence gradients are staler for longer), so it is highly recommended to set `n_epochs = 500` or higher. Thank you to [Aaron Lun](https://github.com/LTLA) who not only came up with a way to implement this feature, but also wrote an entire [C++ implementation of UMAP](https://github.com/libscran/umappp) which does it (). * New parameter: `opt_args`. The default optimization method when `batch = TRUE` is [Adam](https://arxiv.org/abs/1412.6980). You can control its parameters by passing them in the `opt_args` list. As Adam is a momentum-based method it requires extra storage of previous gradient data. To avoid the extra memory overhead you can also use `opt_args = list(method = "sgd")` to use a stochastic gradient descent method like that used when `batch = FALSE`. * New parameter: `epoch_callback`. You may now pass a function which will be invoked at the end of each epoch. Mainly useful for producing an image of the state of the embedding at different points during the optimization. This is another feature taken from [umappp](https://github.com/libscran/umappp). * New parameter: `pca_method`, used when the `pca` parameter is supplied to reduce the initial dimensionality of the data. This controls which method is used to carry out the PCA and can be set to one of: * `"irlba"` which uses `irlba::irlba` to calculate a truncated SVD. If this routine deems that you are trying to extract 50% or more of the singular vectors, you will see a warning to that effect logged to the console. * `"rsvd"`, which uses `irlba::svdr` for truncated SVD. This method uses a small number of iterations which should give an accuracy/speed up trade-off similar to that of the [scikit-learn TruncatedSVD](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD) method. This can be much faster than using `"irlba"` but potentially at a cost in accuracy. However, for the purposes of dimensionality reduction as input to nearest neighbor search, this doesn't seem to matter much. * `"bigstatsr"`, which uses the [bigstatsr](https://cran.r-project.org/package=bigstatsr) package. **Note**: this is *not* a dependency of `uwot`. If you want to use `bigstatsr`, you must install it yourself. On platforms without easy access to fast linear algebra libraries (e.g. Windows), using `bigstatsr` may give a speed up to PCA calculations. * `"svd"`, which uses `base::svd`. **Warning**: this is likely to be very slow for most datasets and exists as a fallback for small datasets where the `"irlba"` method would print a warning. * `"auto"` (the default) which uses `"irlba"` to calculate a truncated SVD, unless you are attempting to extract 50% or more of the singular vectors, in which case `"svd"` is used. ## Bug fixes and minor improvements * If row names are provided in the input data (or nearest neighbor data, or initialization data if it's a matrix), they will be used to name the rows of the output embedding (), and also the nearest neighbor data if you set `ret_nn = TRUE`. If the names exist in more than one of the input data parameters listed above, but are inconsistent, no guarantees are made about which names will be used. Thank you [jwijffels](https://github.com/jwijffels) for reporting this.
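A small sketch of the row name behavior (assuming a numeric matrix `X`):

```r
rownames(X) <- paste0("obs", seq_len(nrow(X)))
res <- umap(X, ret_nn = TRUE)
head(rownames(res$embedding)) # "obs1" "obs2" ...
```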
* In `umap_transform`, the learning rate is now down-scaled by a factor of 4, consistent with the Python implementation of UMAP. If you need the old behavior back, use the (newly added) `learning_rate` parameter in `umap_transform` to set it explicitly. If you used the default value in `umap` when creating the model, the correct setting in `umap_transform` is `learning_rate = 1.0`. * Setting `nn_method = "annoy"` and `verbose = TRUE` would lead to an error with datasets with fewer than 50 items in them. * Using multiple pre-computed nearest neighbors blocks is now supported with `umap_transform` (this was incorrectly documented to work). * Documentation around pre-calculated nearest neighbor data for `umap_transform` was wrong in other ways: it has now been corrected to indicate that there should be neighbor data for each item in the test data, but the neighbors and distances should refer to items in training data (i.e. the data used to build the model). * `n_neighbors` parameter is now correctly ignored in model generation if pre-calculated nearest neighbor data is provided. * Documentation incorrectly said `grain_size` didn't do anything. # uwot 0.1.10 This release is mainly to allow for some internal changes to keep compatibility with RcppAnnoy, used for the nearest neighbor calculations. ## Bug fixes and minor improvements * Passing in data with missing values will now raise an error early. Missing data in factor columns intended for supervised UMAP is still ok. Thank you David McGaughey for tweeting about this issue. * The documentation for the return value of `umap` and `tumap` now note that the contents of the `model` list are subject to change and not intended to be part of the uwot public API. I recommend not relying on the structure of the `model`, especially if your package is intended to appear on CRAN or Bioconductor, as any breakages will delay future releases of uwot to CRAN. # uwot 0.1.9 ## New features * New metric: `metric = "correlation"` a distance based on the Pearson correlation (). Supporting this required a change to the internals of how nearest neighbor data is stored. Backwards compatibility with models generated by previous versions using `ret_model = TRUE` should have been preserved. ## Bug fixes and minor improvements * New parameter, `nn_method`, for `umap_transform`: pass a list containing pre-computed nearest neighbor data (identical to that used in the `umap` function). You should not pass anything to the `X` parameter in this case. This extends the functionality for transforming new points to the case where nearest neighbor data between the original data and new data can be calculated external to `uwot`. Thanks to [Yuhan Hao](https://github.com/yuhanH) for contributing the PR ( and ). * New parameter, `init`, for `umap_transform`: provides a variety of options for initializing the output coordinates, analogously to the same parameter in the `umap` function (but without as many options currently). This is intended to replace `init_weighted`, which should be considered deprecated, but won't be removed until uwot 1.0 (whenever that is). Instead of `init_weighted = TRUE`, use `init = "weighted"`; replace `init_weighted = FALSE` with `init = "average"`. Additionally, you can pass a matrix to `init` to act as the initial coordinates. * Also in `umap_transform`: previously, setting `n_epochs = 0` was ignored: at least one iteration of optimization was applied. 
Now, `n_epochs = 0` is respected, and the initialized coordinates are returned without any further optimization. * Minor performance improvement for single-threaded nearest neighbor search when `verbose = TRUE`: the progress bar calculations were taking up a detectable amount of time; this has now been fixed. With very small data sets (< 50 items) the progress bar will no longer appear when building the index. * Passing a sparse distance matrix as input now supports upper/lower triangular matrix storage rather than wasting storage using an explicitly symmetric sparse matrix. * Minor license change: uwot used to be licensed under GPL-3 only; now it is GPL-3 or later. # uwot 0.1.8 ## Bug fixes and minor improvements * The default for `n_threads` is now `NULL` to provide a bit more protection from changing dependencies. * Parallel code now uses the standard C++11 implementation of threading rather than tinythread++. * The `grain_size` parameter has been undeprecated. As the version that deprecated this never made it to CRAN, this is unlikely to have affected many people. # uwot 0.1.7 ## Bug fixes and minor improvements * uwot should no longer trigger undefined behavior in sanitizers, due to the temporary replacement of the RcppParallel package with code "borrowed" from that package and using tinythread++ rather than tbb (). * Further sanitizer improvements in the nearest neighbor search code due to the upstream efforts of [erikbern](https://github.com/erikbern) and [eddelbuettel](https://github.com/eddelbuettel) (). * The `grain_size` parameter is now ignored and remains to avoid breaking backwards compatibility only. # uwot 0.1.6 ## New features * New parameter, `ret_extra`, a vector which can contain any combination of: `"model"` (same as `ret_model = TRUE`), `"nn"` (same as `ret_nn = TRUE`) and `fgraph` (see below). * New return value data: If the `ret_extra` vector contains `"fgraph"`, the returned list will contain an `fgraph` item representing the fuzzy simplicial input graph as a sparse N x N matrix. For `lvish`, use `"P"` instead of `"fgraph"` (). Note that there is a further sparsifying step where edges with a very low membership are removed if there is no prospect of the edge being sampled during optimization. This is controlled by `n_epochs`: the smaller the value, the more sparsifying will occur. If you are only interested in the fuzzy graph and not the embedded coordinates, set `n_epochs = 0`. * New function: `unload_uwot`, to unload the Annoy nearest neighbor indices in a model. This prevents the model from being used in `umap_transform`, but allows for the temporary working directory created by both `save_uwot` and `load_uwot` to be deleted. Previously, both `load_uwot` and `save_uwot` were attempting to delete the temporary working directories they used, but would always silently fail because Annoy is making use of files in those directories. * An attempt has been made to reduce the variability of results due to different compiler and C++ library versions on different machines. Visually results are unchanged in most cases, but this is a breaking change in terms of numerical output. The best chance of obtaining floating point determinism across machines is to use `init = "spca"`, fixed values of `a` and `b` (rather than allowing them to be calculated through setting `min_dist` and `spread`) and `approx_pow = TRUE`. Using the `tumap` method with `init = "spca"` is probably the most robust approach. ## Bug fixes and minor improvements * New behavior when `n_epochs = 0`.
This used to behave like `n_epochs = NULL` and gave a default number of epochs (dependent on the number of vertices in the dataset). Now it more usefully carries out all calculations except optimization, so the returned coordinates are those specified by the `init` parameter, making this an easy way to access e.g. the spectral or PCA initialization coordinates. If you want the input fuzzy graph (`ret_extra` vector contains `"fgraph"`), this will also prevent edges with very low membership being removed from the graph. You still get the old default epochs behavior by setting `n_epochs = NULL` or a negative value. * `save_uwot` and `load_uwot` have been updated with a `verbose` parameter so it's easier to see what temporary files are being created. * `save_uwot` has a new parameter, `unload`, which if set to `TRUE` will delete the working directory for you, at the cost of unloading the model, i.e. it can't be used with `umap_transform` until you reload it with `load_uwot`. * `save_uwot` now returns the saved model with an extra field, `mod_dir`, which points to the location of the temporary working directory, so you should now assign the result of calling `save_uwot` to the model you saved, e.g. `model <- save_uwot(model, "my_model_file")`. This field is intended for use with `unload_uwot`. * `load_uwot` also returns the model with a `mod_dir` item for use with `unload_uwot`. * `save_uwot` and `load_uwot` were not correctly handling relative paths. * A previous bug fix to `load_uwot` in uwot 0.1.4 to work with newer versions of RcppAnnoy () failed in the typical case of a single metric for the nearest neighbor search using all available columns, giving an error message along the lines of: `Error: index size is not a multiple of vector size `. This has now been fixed, but required changes to both `save_uwot` and `load_uwot`, so existing saved models must be regenerated. Thank you to reporter [OuNao](https://github.com/OuNao). # uwot 0.1.5 ## Bug fixes and minor improvements * The R API was being accessed from inside multi-threaded code to seed the (non-R) random number generators. Probably this was causing users in downstream projects (seurat and monocle) to experience strange RcppParallel-related crashes. Thanks to [aldojongejan](https://github.com/aldojongejan) for reporting this (). * Passing a floating point value smaller than one to `n_threads` caused a crash. This was particularly insidious on a system where only one thread is available, as the default `n_threads` becomes `0.5`. Now `n_threads` (and `n_sgd_threads`) are rounded to the nearest integer. * Initialization of supervised UMAP should now be faster (). Contributed by [Aaron Lun](https://github.com/LTLA). # uwot 0.1.4 ## Bug fixes and minor improvements * Fixed incorrect loading of Annoy indexes to be compatible with newer versions of RcppAnnoy (). My thanks to Dirk Eddelbuettel and Erik Bernhardsson for aid in identifying the problem. * Fix for `ERROR: there is already an InterruptableProgressMonitor instance defined`. * If `verbose = TRUE`, the `a`, `b` curve parameters are now logged. # uwot 0.1.3 ## Bug fixes and minor improvements * Fixed an issue where the session would crash if the Annoy nearest neighbor search was unable to find k neighbors for an item. ## Known issue Even with a fix for the bug mentioned above, if the nearest neighbor index file is larger than 2GB in size, Annoy may not be able to read the data back in. This should only occur with very large or high-dimensional datasets.
The nearest neighbor search will fail under these conditions. A work-around is to set `n_threads = 0`, because the index will not be written to disk and re-loaded under these circumstances, at the cost of a longer search time. Alternatively, set the `pca` parameter to reduce the dimensionality or lower `n_trees`, both of which will reduce the size of the index on disk. However, either may lower the accuracy of the nearest neighbor results. # uwot 0.1.2 Initial CRAN release. ## New features * New parameter, `tmpdir`, which allows the user to specify the temporary directory where nearest neighbor indexes will be written during Annoy nearest neighbor search. The default is `base::tempdir()`. Only used if `n_threads > 1` and `nn_method = "annoy"`. ## Bug fixes and minor improvements * Fixed an issue with `lvish` where there was an off-by-one error when calculating input probabilities. * Added a safeguard to `lvish` to prevent the Gaussian precision, beta, becoming overly large when the binary search fails during perplexity calibration. * The `lvish` perplexity calibration uses the log-sum-exp trick to avoid numeric underflow if beta becomes large. # uwot 0.0.0.9010 (31 March 2019) ## New features * New parameter: `pcg_rand`. If `TRUE` (the default), then a random number generator from [the PCG family](https://www.pcg-random.org/) is used during the stochastic optimization phase. The old PRNG, a direct translation of an implementation of the Tausworthe "taus88" PRNG used in the Python version of UMAP, can be obtained by setting `pcg_rand = FALSE`. The new PRNG is slower, but is likely superior in its statistical randomness. This change in behavior will break backwards compatibility: you will now get slightly different results even with the same seed. * New parameter: `fast_sgd`. If `TRUE`, then the following combination of parameters are set: `n_sgd_threads = "auto"`, `pcg_rand = FALSE` and `approx_pow = TRUE`. These will result in a substantially faster optimization phase, at the cost of being slightly less accurate and results not being exactly repeatable. `fast_sgd = FALSE` by default but if you are only interested in visualization, then `fast_sgd` gives perfectly good results. For more generic dimensionality reduction and reproducibility, keep `fast_sgd = FALSE`. * New parameter: `init_sdev` which specifies how large the standard deviation of each column of the initial coordinates should be. This will scale any input coordinates (including user-provided matrix coordinates). `init = "spca"` can now be thought of as an alias of `init = "pca", init_sdev = 1e-4`. This may be too aggressive a scaling for some datasets. The typical UMAP spectral initializations tend to result in standard deviations of around `2` to `5`, so this might be more appropriate in some cases. If spectral initialization detects multiple components in the affinity graph and falls back to scaled PCA, it uses `init_sdev = 1`. * As a result of adding `init_sdev`, the `init` options `sspectral`, `slaplacian` and `snormlaplacian` have been removed (they weren't around for very long anyway). You can get the same behavior by e.g. `init = "spectral", init_sdev = 1e-4`. `init = "spca"` is sticking around because I use it a lot. ## Bug fixes and minor improvements * Spectral initialization (the default) was sometimes generating coordinates that had too large a range, due to an erroneous scale factor that failed to account for negative coordinate values.
This could give rise to embeddings with very noticeable outliers distant from the main clusters. * Also during spectral initialization, the amount of noise being added had a standard deviation an order of magnitude too large compared to the Python implementation (this probably didn't make any difference though). * If requesting a spectral initialization, but multiple disconnected components are present, fall back to `init = "spca"`. * Removed dependency on C++ `<random>` header. This breaks backwards compatibility even if you set `pcg_rand = FALSE`. * `metric = "cosine"` results were incorrectly using the unmodified Annoy angular distance. * Numeric matrix columns can be specified as the target for the `categorical` metric (fixes ). # uwot 0.0.0.9009 (1 January 2019) * Data is now stored column-wise during optimization, which should result in an increase in performance for larger values of `n_components` (e.g. approximately 50% faster optimization time with MNIST and `n_components = 50`). * New parameter: `pca_center`, which controls whether to center the data before applying PCA. It would be typical to set this to `FALSE` if you are applying PCA to binary data (although note you can't use this setting with `metric = "hamming"`). * PCA will now be used when the `metric` is `"manhattan"` and `"cosine"`. It's still *not* applied when using `"hamming"` (data still needs to be in binary format, not real-valued). * If using mixed datatypes, you may override the `pca` and `pca_center` parameter values for a given data block by using a list for the value of the metric, with the column ids/names as an unnamed item and the overriding values as named items, e.g. instead of `manhattan = 1:100`, use `manhattan = list(1:100, pca_center = FALSE)` to turn off PCA centering for just that block. This functionality exists mainly for the case where you have mixed binary and real-valued data and want to apply PCA to both data types. It's normal to apply centering to real-valued data but not to binary data. ## Bug fixes and minor improvements * Fixed bug that affected `umap_transform`, where negative sampling was over the size of the test data (should be the training data). * Some other performance improvements (around 10% faster for the optimization stage with MNIST). * When `verbose = TRUE`, log the Annoy recall accuracy, which may help tune values of `n_trees` and `search_k`. # uwot 0.0.0.9008 (December 23 2018) ## New features * New parameter: `n_sgd_threads`, which controls the number of threads used in the stochastic gradient descent. By default this is now single-threaded and should result in reproducible results when using `set.seed`. To get back the old, less consistent, but faster settings, set `n_sgd_threads = "auto"`. * API change for consistency with Python UMAP: * `alpha` is now `learning_rate`. * `gamma` is now `repulsion_strength`. * Default spectral initialization now looks for disconnected components and initializes them separately (also applies to `laplacian` and `normlaplacian`). * New `init` options: `sspectral`, `snormlaplacian` and `slaplacian`. These are like `spectral`, `normlaplacian`, `laplacian` respectively, but scaled so that each dimension has a standard deviation of 1e-4. This is like the difference between the `pca` and `spca` options. ## Bug fixes and minor improvements * Hamming distance support (was actually using Euclidean distance). * Smooth knn/perplexity calibration results had a small dependency on the number of threads used.
* Anomalously long spectral initialization times should now be reduced. * Internal changes and fixes thanks to a code review by [Aaron Lun](https://github.com/ltla). # uwot 0.0.0.9007 (December 9 2018) ## New features * New parameter `pca`: set this to a positive integer to reduce matrix or data frame input to that number of columns using PCA. Only works if `metric = "euclidean"`. If you have > 100 columns, this can substantially improve the speed of the nearest neighbor search. t-SNE implementations often set this value to 50. ## Bug fixes and minor improvements * Laplacian Eigenmap initialization convergence failure is now correctly detected. * C++ code was over-writing data passed from R as a function argument. # uwot 0.0.0.9006 (December 5 2018) ## New features * Highly experimental mixed data type support for `metric`: instead of specifying a single metric name (e.g. `metric = "euclidean"`), you can pass a list, where the name of each item is the metric to use and the value is a vector of the names of the columns to use with that metric, e.g. `metric = list("euclidean" = c("A1", "A2"), "cosine" = c("B1", "B2", "B3"))` treats columns `A1` and `A2` as one block, using the Euclidean distance to find nearest neighbors, whereas `B1`, `B2` and `B3` are treated as a second block, using the cosine distance. * Factor columns can also be used in the metric, using the metric name `categorical`. * `y` may now be a data frame or matrix if multiple target data is available. * New parameter `target_metric`, to specify the distance metric to use with numerical `y`. This has the same capabilities as `metric`. * Multiple external nearest neighbor data sources are now supported. Instead of passing a list of two matrices, pass a list of lists, one for each external metric. * More details on mixed data types can be found at . * Compatibility with older versions of RcppParallel (contributed by [sirusb](https://github.com/sirusb)). * `scale = "Z"` to Z-scale each column of input (synonym for `scale = TRUE` or `scale = "scale"`). * New scaling option, `scale = "colrange"` to scale columns in the range (0, 1). # uwot 0.0.0.9005 (November 4 2018) ## New features * Hamming distance is now supported, due to upgrade to RcppAnnoy 0.0.11. # uwot 0.0.0.9004 (October 21 2018) ## New features * For supervised UMAP with numeric `y`, you may pass nearest neighbor data directly, in the same format as that supported by `X`-related nearest neighbor data. This may be useful if you don't want to use Euclidean distances for the `y` data, or if you have missing data (and have a way to assign nearest neighbors for those cases, obviously). See the [Nearest Neighbor Data Format](https://github.com/jlmelville/uwot#nearest-neighbor-data-format) section for details. # uwot 0.0.0.9003 (September 22 2018) ## New features * New parameter `ret_nn`: when `TRUE` returns nearest neighbor matrices as a `nn` list: indices in item `idx` and distances in item `dist`. Embedded coordinates are in `embedding`. Both `ret_nn` and `ret_model` can be `TRUE`, and should not cause any compatibility issues with supervised embeddings. * `nn_method` can now take precomputed nearest neighbor data. Must be a list of two matrices: `idx`, containing integer indexes, and `dist` containing distances. By no coincidence, this is the format returned by `ret_nn`. ## Bug fixes and minor improvements * Embedding to `n_components = 1` was broken (). * User-supplied matrices to `init` parameter were being modified, in defiance of basic R pass-by-copy semantics.
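As a sketch of the precomputed neighbor format described above (`idx` and `dist` are hypothetical stand-ins: an integer matrix of neighbor indices and a numeric matrix of the corresponding distances, one row per observation in `X`):

```r
res <- umap(X, nn_method = list(idx = idx, dist = dist))
```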
# uwot 0.0.0.9002 (August 14 2018)

## Bug fixes and minor improvements

* `metric = "cosine"` is working again for `n_threads` greater than `0` ()

# uwot 0.0.0.9001

## New features

* *August 5 2018*. You can now use an existing embedding to add new points via `umap_transform`. See the example section below.
* *August 1 2018*. Numerical vectors are now supported for supervised dimension reduction.
* *July 31 2018*. (Very) initial support for supervised dimension reduction: categorical data only at the moment. Pass in a factor vector (use `NA` for unknown labels) as the `y` parameter and edges with bad (or unknown) labels are down-weighted, hopefully leading to better separation of classes. This works remarkably well for the Fashion MNIST dataset.
* *July 22 2018*. You can now use the cosine and Manhattan distances with the Annoy nearest neighbor search, via `metric = "cosine"` and `metric = "manhattan"`, respectively. Hamming distance is not supported because RcppAnnoy doesn't yet support it.

uwot/inst/0000755000176200001440000000000014757004303012230 5ustar liggesusers
uwot/inst/include/0000755000176200001440000000000014730166740013660 5ustar liggesusers
uwot/inst/include/RcppPerpendicular.h0000644000176200001440000001077614730166740017466 0ustar liggesusers
// Taken from RcppParallel.h and then modified slightly to rename header guards
// and namespaces to avoid any potential clashes. RcppParallel is licensed under
// GPLv2 or later:

// RcppPerpendicular.h a version of parallel for based on RcppParallel
// Copyright (C) 2020 James Melville
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
// USA.

#ifndef RCPP_PERPENDICULAR
#define RCPP_PERPENDICULAR

// headers restored from usage: std::ref, std::thread, std::pair, std::vector
#include <functional>
#include <thread>
#include <utility>
#include <vector>

namespace RcppPerpendicular {

using IndexRange = std::pair<std::size_t, std::size_t>;

template <typename Worker>
auto worker_thread(Worker &worker, const IndexRange &range) -> void {
  try {
    worker(range.first, range.second);
  } catch (...) {
  }
}

template <typename Worker>
auto worker_thread_id(Worker &worker, const IndexRange &range,
                      std::size_t thread_id) -> void {
  try {
    worker(range.first, range.second, thread_id);
  } catch (...) {
  }
}

// Function to calculate the ranges for a given input
inline auto split_input_range(const IndexRange &range, std::size_t n_threads,
                              std::size_t grain_size)
    -> std::vector<IndexRange> {
  // determine max number of threads
  if (n_threads == 0) {
    n_threads = std::thread::hardware_concurrency();
  }

  // compute grain_size (including enforcing requested minimum)
  std::size_t length = range.second - range.first;
  if (n_threads == 1) {
    grain_size = length;
  } else if ((length % n_threads) == 0) {
    // perfect division
    grain_size = (std::max)(length / n_threads, grain_size);
  } else {
    // imperfect division, divide by threads - 1
    grain_size = (std::max)(length / (n_threads - 1), grain_size);
  }

  // allocate ranges
  std::vector<IndexRange> ranges;
  std::size_t begin = range.first;
  while (begin < range.second) {
    std::size_t end = (std::min)(begin + grain_size, range.second);
    ranges.emplace_back(std::make_pair(begin, end));
    begin = end;
  }

  return ranges;
}

// Execute the Worker over the IndexRange in parallel
template <typename Worker>
inline void parallel_for(std::size_t begin, std::size_t end, Worker &worker,
                         std::size_t n_threads, std::size_t grain_size = 1) {
  if (n_threads > 0) {
    // split the work
    IndexRange input_range(begin, end);
    std::vector<IndexRange> ranges =
        split_input_range(input_range, n_threads, grain_size);

    std::vector<std::thread> threads;
    threads.reserve(ranges.size());
    for (auto &range : ranges) {
      threads.push_back(
          std::thread(&worker_thread<Worker>, std::ref(worker), range));
    }

    for (auto &thread : threads) {
      thread.join();
    }
  } else {
    worker(begin, end);
  }
}

template <typename Worker>
inline void parallel_for(std::size_t end, Worker &worker,
                         std::size_t n_threads, std::size_t grain_size = 1) {
  parallel_for(0, end, worker, n_threads, grain_size);
}

template <typename Worker>
inline void pfor(std::size_t begin, std::size_t end, Worker &worker,
                 std::size_t n_threads, std::size_t grain_size = 1) {
  if (n_threads > 0) {
    IndexRange input_range(begin, end);
    std::vector<IndexRange> ranges =
        split_input_range(input_range, n_threads, grain_size);

    std::vector<std::thread> threads;
    for (std::size_t thread_id = 0; thread_id < ranges.size(); ++thread_id) {
      auto &range = ranges[thread_id];
      threads.push_back(std::thread(&worker_thread_id<Worker>,
                                    std::ref(worker), range, thread_id));
    }
    for (auto &thread : threads) {
      thread.join();
    }
  } else {
    worker(begin, end, 0);
  }
}

template <typename Worker>
inline void pfor(std::size_t end, Worker &worker, std::size_t n_threads,
                 std::size_t grain_size = 1) {
  pfor(0, end, worker, n_threads, grain_size);
}

} // namespace RcppPerpendicular

#endif // RCPP_PERPENDICULAR

uwot/inst/include/uwot/0000755000176200001440000000000014733730066014657 5ustar liggesusers
uwot/inst/include/uwot/perplexity.h0000644000176200001440000001210014733074465017233 0ustar liggesusers
// BSD 2-Clause License
//
// Copyright 2020 James Melville
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // OF SUCH DAMAGE. #ifndef UWOT_PERPLEXITY_H #define UWOT_PERPLEXITY_H #include #include #include #include namespace uwot { auto find_beta(const std::vector &d2, double target, double tol, std::size_t n_iter, std::size_t &n_window_search_fails) -> double { constexpr auto double_max = (std::numeric_limits::max)(); double beta = 1.0; double lo = 0.0; double hi = double_max; // best value seen is used only if binary search fails // (usually only happens if there are multiple degenerate distances) double beta_best = beta; double adiff_min = double_max; bool converged = false; // neighbors of i not including itself auto n_true_neighbors = d2.size(); for (std::size_t iter = 0; iter < n_iter; iter++) { double Z = 0.0; double H = 0.0; double sum_D2_W = 0.0; for (std::size_t k = 0; k < n_true_neighbors; k++) { double W = std::exp(-d2[k] * beta); Z += W; sum_D2_W += d2[k] * W; } if (Z > 0) { H = std::log(Z) + beta * sum_D2_W / Z; } double adiff = std::abs(H - target); if (adiff < tol) { converged = true; break; } // store best beta in case binary search fails if (adiff < adiff_min) { adiff_min = adiff; beta_best = beta; } if (H < target) { hi = beta; beta = 0.5 * (lo + hi); } else { lo = beta; if (hi == double_max) { beta *= 2.0; } else { beta = 0.5 * (lo + hi); } } } if (!converged) { ++n_window_search_fails; beta = beta_best; } return beta; } void perplexity_search(std::size_t i, const std::vector &nn_dist, std::size_t n_neighbors, double target, double tol, std::size_t n_iter, std::vector &d2, std::vector &nn_weights, bool save_sigmas, std::vector &sigmas, std::size_t &n_window_search_fails) { auto i_begin = n_neighbors * i; // log-sum-exp trick // shift squared distances by minimum (distances are already sorted) // D2, W and Z are their shifted versions // but P (and hence Shannon entropy) is unchanged // the self neighbor is ignored leading to some +/- 1 in loops and indexing double dmin = nn_dist[i_begin + 1] * nn_dist[i_begin + 1]; for (std::size_t k = 1; k < n_neighbors; k++) { d2[k - 1] = nn_dist[i_begin + k] * nn_dist[i_begin + k] - dmin; } double beta = find_beta(d2, target, tol, n_iter, n_window_search_fails); double Z = 0.0; for (std::size_t k = 0; k < n_neighbors - 1; k++) { double W = std::exp(-d2[k] * beta); Z += W; // no longer need d2 at this point, store final W there d2[k] = W; } for (std::size_t k = 1; k < n_neighbors; k++) { nn_weights[i_begin + k] = d2[k - 1] / Z; } if (save_sigmas) { sigmas[i] = 1 / sqrt(beta); } } void perplexity_search(std::size_t begin, std::size_t end, const std::vector &nn_dist, std::size_t n_neighbors, double target, double tol, std::size_t n_iter, std::vector &res, bool save_sigmas, std::vector &sigmas, std::atomic_size_t &n_search_fails) { // number of binary search failures in this window std::size_t n_window_search_fails = 0; std::vector d2(n_neighbors - 1, 0.0); for (std::size_t i = begin; i < end; i++) { perplexity_search(i, nn_dist, n_neighbors, target, tol, n_iter, d2, res, save_sigmas, sigmas, n_window_search_fails); } // Update global count of 
failures n_search_fails += n_window_search_fails; } } // namespace uwot #endif // UWOT_PERPLEXITY_H uwot/inst/include/uwot/coords.h0000644000176200001440000000446514577210515016330 0ustar liggesusers// BSD 2-Clause License // // Copyright 2021 James Melville // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // OF SUCH DAMAGE. #ifndef UWOT_COORDS_H #define UWOT_COORDS_H #include #include namespace uwot { // For normal UMAP, tail_embedding is NULL and we want to pass // a shallow copy of head_embedding as tail_embedding. // When updating new values, tail_embedding is the new coordinate to optimize // and gets passed as normal. struct Coords { std::vector head_embedding; std::unique_ptr> tail_vec_ptr; Coords(std::vector &head_embedding) : head_embedding(head_embedding), tail_vec_ptr(nullptr) {} Coords(std::vector &head_embedding, std::vector &tail_embedding) : head_embedding(head_embedding), tail_vec_ptr(new std::vector(tail_embedding)) {} auto get_tail_embedding() -> std::vector & { if (tail_vec_ptr) { return *tail_vec_ptr; } else { return head_embedding; } } auto get_head_embedding() -> std::vector & { return head_embedding; } }; } // namespace uwot #endif uwot/inst/include/uwot/smooth_knn.h0000644000176200001440000002413514733074465017220 0ustar liggesusers// BSD 2-Clause License // // Copyright 2020 James Melville // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // OF SUCH DAMAGE. #ifndef UWOT_SMOOTH_KNN_H #define UWOT_SMOOTH_KNN_H #include #include #include #include #include #include "RcppPerpendicular.h" namespace uwot { // Welford-style mean calculation auto mean_average(const std::vector &v, std::size_t begin, std::size_t end) -> double { long double mean = 0.0; auto b1 = 1 - begin; for (auto i = begin; i < end; ++i) { mean += (v[i] - mean) / (i + b1); } return static_cast(mean); } auto mean_average(const std::vector &v) -> double { return mean_average(v, 0, v.size()); } // nn_dist is sorted non-decreasing nearest neighbor distances // nzero_begin points to the index of the first non-zero distance // nzero_end points to one past the index of the last non-zero distance // (n_neighbors + 1) auto find_rho(const std::vector &nn_dist, std::size_t nzero_begin, std::size_t nzero_end, double local_connectivity, double tol) -> double { double rho = 0.0; auto nnzero = nzero_end - nzero_begin; if (nnzero >= local_connectivity) { auto index = static_cast(std::floor(local_connectivity)); double interpolation = local_connectivity - index; if (index > 0) { rho = nn_dist[nzero_begin + index - 1]; if (interpolation >= tol) { // rho = (1 - interp) * rho + interp * d_{i+1} rho += interpolation * (nn_dist[nzero_begin + index] - rho); } } else if (nnzero > 0) { // typical code-path: rho is the smallest non-zero distance rho = interpolation * nn_dist[nzero_begin]; } } else if (nnzero > 0) { // not enough non-zero distances, return the largest non-zero distance rho = nn_dist[nzero_end - 1]; } return rho; } // Find the normalization factor for the smoothed distances auto find_sigma(const std::vector &nn_dist, std::size_t i_begin, std::size_t i_end, double target, double rho, double tol, std::size_t n_iter, std::size_t &n_window_search_fails) -> double { constexpr auto double_max = (std::numeric_limits::max)(); // best value seen is used only if binary search fails // NB there is already a safeguard against sigma getting too large // so this is less of a problem than with the perplexity search double sigma = 1.0; double sigma_best = sigma; double adiff_min = double_max; double lo = 0.0; double hi = double_max; bool converged = false; for (std::size_t iter = 0; iter < n_iter; iter++) { double val = 0.0; // NB i_begin should point to the first non-self neighbor for (auto j = i_begin; j < i_end; j++) { auto rj = nn_dist[j] - rho; val += rj <= 0.0 ? 
1.0 : std::exp(-rj / sigma); } double adiff = std::abs(val - target); if (adiff < tol) { converged = true; break; } // store best sigma in case binary search fails (usually in the presence // of multiple degenerate distances) if (adiff < adiff_min) { adiff_min = adiff; sigma_best = sigma; } if (val > target) { hi = sigma; sigma = 0.5 * (lo + hi); } else { lo = sigma; if (hi == double_max) { sigma *= 2; } else { sigma = 0.5 * (lo + hi); } } } if (!converged) { ++n_window_search_fails; sigma = sigma_best; } return sigma; } // NB nn_dist must be in sorted non-decreasing order void smooth_knn(std::size_t i, const std::vector &nn_dist, const std::vector &nn_ptr, bool skip_first, const std::vector &target, double local_connectivity, double tol, std::size_t n_iter, double min_k_dist_scale, double mean_distances, bool save_sigmas, std::vector &nn_weights, std::vector &sigmas, std::vector &rhos, std::size_t &n_window_search_fails) { // i_begin points to start of ith distances // i_end points to one past end of ith distances auto i_begin = 0; auto i_end = 0; auto n_neighbors = 0; // Space optimization for kNN (typical case): store the number of neighbors // as the only entry in nn_ptr if (nn_ptr.size() == 1) { n_neighbors = nn_ptr[0]; i_begin = n_neighbors * i; i_end = i_begin + n_neighbors; } else { i_begin = nn_ptr[i]; i_end = nn_ptr[i + 1]; n_neighbors = i_end - i_begin; } // nzero_begin points to start of ith non-zero distances auto nzero_begin = i_end; for (auto j = i_begin; j < i_end; j++) { if (nn_dist[j] > 0.0) { nzero_begin = j; break; } } auto rho = find_rho(nn_dist, nzero_begin, i_end, local_connectivity, tol); double targeti = target.size() == 1 ? target[0] : target[i]; // in case where self-distance (0) is passed as the nearest neighbor, skip // first item in neighbors when calculating sigma auto sigma = find_sigma(nn_dist, i_begin + (skip_first ? 1 : 0), i_end, targeti, rho, tol, n_iter, n_window_search_fails); // safeguard sigma if (rho > 0.0) { sigma = (std::max)(min_k_dist_scale * mean_average(nn_dist, i_begin, i_end), sigma); } else { sigma = (std::max)(min_k_dist_scale * mean_distances, sigma); } // create the final membership strengths for (auto j = i_begin; j < i_end; j++) { auto rj = nn_dist[j] - rho; nn_weights[j] = rj <= 0.0 ? 
1.0 : std::exp(-rj / sigma); } if (save_sigmas) { sigmas[i] = sigma; rhos[i] = rho; } } void smooth_knn(std::size_t begin, std::size_t end, const std::vector &nn_dist, const std::vector &nn_ptr, bool skip_first, const std::vector &target, double local_connectivity, double tol, std::size_t n_iter, double min_k_dist_scale, double mean_distances, bool save_sigmas, std::vector &nn_weights, std::vector &sigmas, std::vector &rhos, std::atomic_size_t &n_search_fails) { // number of binary search failures in this window std::size_t n_window_search_fails = 0; for (std::size_t i = begin; i < end; i++) { smooth_knn(i, nn_dist, nn_ptr, skip_first, target, local_connectivity, tol, n_iter, min_k_dist_scale, mean_distances, save_sigmas, nn_weights, sigmas, rhos, n_window_search_fails); } // Update global count of failures n_search_fails += n_window_search_fails; } auto reset_local_metric(const std::vector &probabilities, std::size_t i_begin, std::size_t i_end, double target, double tol, std::size_t n_iter, std::size_t &n_window_search_fails) -> double { constexpr auto double_max = (std::numeric_limits::max)(); double lo = 0.0; double hi = double_max; double mid = 1.0; double mid_best = mid; double adiff_min = double_max; bool converged = false; for (std::size_t iter = 0; iter < n_iter; iter++) { double psum = 0.0; for (auto j = i_begin; j < i_end; j++) { psum += std::pow(probabilities[j], mid); } double adiff = std::abs(psum - target); if (adiff < tol) { converged = true; break; } if (adiff < adiff_min) { adiff_min = adiff; mid_best = mid; } if (psum < target) { hi = mid; mid = 0.5 * (lo + hi); } else { lo = mid; if (hi == double_max) { mid *= 2; } else { mid = 0.5 * (lo + hi); } } } if (!converged) { ++n_window_search_fails; mid = mid_best; } return mid; } void reset_local_metric(std::vector &probabilities, const std::vector &prob_ptr, std::size_t i, double target, double tol, std::size_t n_iter, std::size_t &n_window_search_fails) { auto i_begin = prob_ptr[i]; auto i_end = prob_ptr[i + 1]; auto mid = reset_local_metric(probabilities, i_begin, i_end, target, tol, n_iter, n_window_search_fails); // create the final membership strengths for (auto j = i_begin; j < i_end; j++) { probabilities[j] = std::pow(probabilities[j], mid); } } void reset_local_metric(std::size_t begin, std::size_t end, std::vector &probabilities, const std::vector &prob_ptr, double target, double tol, std::size_t n_iter, std::atomic_size_t &n_search_fails) { std::size_t n_window_search_fails = 0; for (auto i = begin; i < end; i++) { reset_local_metric(probabilities, prob_ptr, i, target, tol, n_iter, n_window_search_fails); } n_search_fails += n_window_search_fails; } } // namespace uwot #endif // UWOT_SMOOTH_KNN_H uwot/inst/include/uwot/supervised.h0000644000176200001440000001125314733074465017227 0ustar liggesusers// BSD 2-Clause License // // Copyright 2020 James Melville // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. 
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // OF SUCH DAMAGE. #include #include #include #include #ifndef UWOT_SUPERVISED_H #define UWOT_SUPERVISED_H namespace uwot { void fast_intersection(const std::vector &rows, const std::vector &cols, std::vector &values, const std::vector &target, double unknown_dist = 1.0, double far_dist = 5.0, int na = (std::numeric_limits::min)() ) { double ex_unknown = std::exp(-unknown_dist); double ex_far = std::exp(-far_dist); auto len = values.size(); for (std::size_t nz = 0; nz < len; ++nz) { auto i = rows[nz]; auto j = cols[nz]; if (target[i] == na || target[j] == na) { values[nz] = values[nz] * ex_unknown; } else if (target[i] != target[j]) { values[nz] = values[nz] * ex_far; } } } void general_sset_intersection( const std::vector &indptr1, const std::vector &indices1, const std::vector &data1, const std::vector &indptr2, const std::vector &indices2, const std::vector &data2, const std::vector &result_row, const std::vector &result_col, std::vector &result_val, double mix_weight = 0.5) { double left_min = (std::max)(*std::min_element(data1.begin(), data1.end()) / 2.0, 1.0e-8); double right_min = (std::max)(*std::min_element(data2.begin(), data2.end()) / 2.0, 1.0e-8); for (std::size_t idx = 0; idx < result_row.size(); idx++) { auto i = result_col[idx]; auto j = result_row[idx]; double left_val = left_min; for (auto k = indptr1[i]; k < indptr1[i + 1]; k++) { if (indices1[k] == j) { left_val = data1[k]; } } double right_val = right_min; for (auto k = indptr2[i]; k < indptr2[i + 1]; k++) { if (indices2[k] == j) { right_val = data2[k]; } } if (left_val > left_min || right_val > right_min) { if (mix_weight < 0.5) { result_val[idx] = left_val * std::pow(right_val, (mix_weight / (1.0 - mix_weight))); } else { result_val[idx] = right_val * std::pow(left_val, (((1.0 - mix_weight) / mix_weight))); } } } } void general_sset_union( const std::vector &indptr1, const std::vector &indices1, const std::vector &data1, const std::vector &indptr2, const std::vector &indices2, const std::vector &data2, const std::vector &result_row, const std::vector &result_col, std::vector &result_val, double mix_weight = 0.5) { double left_min = (std::max)(*std::min_element(data1.begin(), data1.end()) / 2.0, 1.0e-8); double right_min = (std::max)(*std::min_element(data2.begin(), data2.end()) / 2.0, 1.0e-8); for (std::size_t idx = 0; idx < result_row.size(); idx++) { auto i = result_col[idx]; auto j = result_row[idx]; double left_val = left_min; for (auto k = indptr1[i]; k < indptr1[i + 1]; k++) { if (indices1[k] == j) { left_val = data1[k]; } } double right_val = right_min; for (auto k = indptr2[i]; k < indptr2[i + 1]; k++) { if (indices2[k] == j) { right_val = data2[k]; } } result_val[idx] = left_val + right_val - left_val * right_val; } } } // namespace uwot #endif // 
UWOT_SUPERVISED_H uwot/inst/include/uwot/epoch.h0000644000176200001440000001354414577210515016133 0ustar liggesusers// BSD 2-Clause License // // Copyright 2020 James Melville // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // OF SUCH DAMAGE. #ifndef UWOT_EPOCH_H #define UWOT_EPOCH_H #include "sampler.h" #include "update.h" namespace uwot { template void optimize_layout(Worker &worker, Progress &progress, unsigned int n_epochs, Parallel ¶llel) { for (auto n = 0U; n < n_epochs; n++) { run_epoch(worker, n, n_epochs, parallel); if (progress.is_aborted()) { break; } progress.report(); } } template void run_epoch(Worker &worker, std::size_t epoch, std::size_t n_epochs, Parallel ¶llel) { worker.epoch_begin(epoch, n_epochs); parallel.pfor(worker.n_items, worker); worker.epoch_end(epoch, n_epochs, parallel); } // Gradient: the type of gradient used in the optimization // Update: type of update to the embedding coordinates template struct EdgeWorker { const Gradient gradient; Update &update; const std::vector &positive_head; const std::vector &positive_tail; uwot::Sampler sampler; std::size_t ndim; std::size_t n_tail_vertices; std::size_t n_items; std::size_t n_threads; RngFactory rng_factory; EdgeWorker(const Gradient &gradient, Update &update, const std::vector &positive_head, const std::vector &positive_tail, uwot::Sampler &sampler, std::size_t ndim, std::size_t n_tail_vertices, std::size_t n_threads) : gradient(gradient), update(update), positive_head(positive_head), positive_tail(positive_tail), sampler(sampler), ndim(ndim), n_tail_vertices(n_tail_vertices), n_items(positive_head.size()), n_threads(std::max(n_threads, std::size_t{1})), rng_factory(this->n_threads) {} void epoch_begin(std::size_t epoch, std::size_t n_epochs) { rng_factory.reseed(); sampler.epoch = epoch; update.epoch_begin(epoch, n_epochs); } template void epoch_end(std::size_t epoch, std::size_t n_epochs, Parallel ¶llel) { update.epoch_end(epoch, n_epochs, parallel); } void operator()(std::size_t begin, std::size_t end, std::size_t thread_id) { // Each window gets its own PRNG state, to prevent locking inside the loop. 
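// Note on the design: each window creates its own generator from the
// reseeded factory and the window's end index, so no PRNG state is shared
// between threads (no locking needed) and a run that partitions the edges
// into the same windows draws the same negative samples.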
auto prng = rng_factory.create(end); // displacement between two points, cost of reallocating inside the loop // is noticeable, also cheaper to calculate it once in the d2 calc std::vector disp(ndim); for (auto edge = begin; edge < end; edge++) { process_edge(update, gradient, sampler, prng, positive_head, positive_tail, ndim, n_tail_vertices, edge, thread_id, disp); } } }; template struct NodeWorker { const Gradient gradient; Update &update; const std::vector &positive_head; const std::vector &positive_tail; const std::vector &positive_ptr; uwot::Sampler sampler; std::size_t ndim; std::size_t n_tail_vertices; std::size_t n_items; RngFactory rng_factory; NodeWorker(const Gradient &gradient, Update &update, const std::vector &positive_head, const std::vector &positive_tail, const std::vector &positive_ptr, uwot::Sampler &sampler, std::size_t ndim, std::size_t n_tail_vertices) : gradient(gradient), update(update), positive_head(positive_head), positive_tail(positive_tail), positive_ptr(positive_ptr), sampler(sampler), ndim(ndim), n_tail_vertices(n_tail_vertices), n_items(positive_ptr.size() - 1), rng_factory(n_items) {} void epoch_begin(std::size_t epoch, std::size_t n_epochs) { rng_factory.reseed(); sampler.epoch = epoch; update.epoch_begin(epoch, n_epochs); } template void epoch_end(std::size_t epoch, std::size_t n_epochs, Parallel ¶llel) { update.epoch_end(epoch, n_epochs, parallel); } void operator()(std::size_t begin, std::size_t end, std::size_t thread_id) { std::vector disp(ndim); for (auto p = begin; p < end; p++) { auto prng = rng_factory.create(p); for (auto edge = positive_ptr[p]; edge < positive_ptr[p + 1]; edge++) { process_edge(update, gradient, sampler, prng, positive_head, positive_tail, ndim, n_tail_vertices, edge, thread_id, disp); } } } }; } // namespace uwot #endif // UWOT_EPOCH_H uwot/inst/include/uwot/tauprng.h0000644000176200001440000000617714733074465016527 0ustar liggesusers// BSD 2-Clause License // // Copyright 2020 James Melville // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // OF SUCH DAMAGE. // Three-component combined Tausworthe "taus88" PRNG from L'Ecuyer 1996. 
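//
// Illustrative usage (a sketch, not part of the header; the seed values
// here are arbitrary):
//
//   uwot::tau_prng prng(42, 4242, 424242);
//   int32_t raw = prng();           // combined output of the three components
//   std::size_t k = prng(n, 0, 0);  // raw draw reduced modulo n, i.e. [0, n)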
#ifndef UWOT_TAUPRNG_H #define UWOT_TAUPRNG_H namespace uwot { struct tau_prng { uint64_t state0; uint64_t state1; // technically this needs to always be > 7 uint64_t state2; // and this should be > 15 static constexpr uint64_t MAGIC0{4294967294}; static constexpr uint64_t MAGIC1{4294967288}; static constexpr uint64_t MAGIC2{4294967280}; tau_prng(uint64_t state0, uint64_t state1, uint64_t state2) : state0(state0), state1(state1 > 7 ? state1 : 8), state2(state2 > 15 ? state2 : 16) {} auto operator()() -> int32_t { state0 = (((state0 & MAGIC0) << 12) & 0xffffffff) ^ ((((state0 << 13) & 0xffffffff) ^ state0) >> 19); state1 = (((state1 & MAGIC1) << 4) & 0xffffffff) ^ ((((state1 << 2) & 0xffffffff) ^ state1) >> 25); state2 = (((state2 & MAGIC2) << 17) & 0xffffffff) ^ ((((state2 << 3) & 0xffffffff) ^ state2) >> 11); return state0 ^ state1 ^ state2; } // return a value in (0, n] auto operator()(std::size_t n, std::size_t, std::size_t) -> std::size_t { std::size_t result = (*this)() % n; return result; } }; // A deterministic number generator, based on a comment from Leland McInnes at: // https://www.reddit.com/r/MachineLearning/comments/1gsjfq9/comment/lxip9wy/ // "randomness doesn't really have to be that good, it just has to not select // the same things over and over. So simply computing something like // (edge_index * (epoch_number + 1)) % n_vertices can give "random enough" // results" struct deterministic_ng { auto operator()(std::size_t n_vertices, std::size_t edge_index, std::size_t epoch) -> std::size_t { return (edge_index * (epoch + 1)) % n_vertices; } }; } // namespace uwot #endif // UWOT_TAUPRNG_H uwot/inst/include/uwot/gradient.h0000644000176200001440000002134314733730066016630 0ustar liggesusers// BSD 2-Clause License // // Copyright 2020 James Melville // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // OF SUCH DAMAGE. #ifndef UWOT_GRADIENT_H #define UWOT_GRADIENT_H #include #include #include namespace uwot { inline auto clamp(float v, float lo, float hi) -> float { float t = v < lo ? lo : v; return t > hi ? 
hi : t; } // return the squared euclidean distance between two points x[px] and y[py] // also store the displacement between x[px] and y[py] in diffxy // there is a small but noticeable performance improvement by doing so // rather than recalculating it in the gradient step inline auto d2diff(const std::vector &x, std::size_t px, const std::vector &y, std::size_t py, std::size_t ndim, float dist_eps, std::vector &diffxy) -> float { float d2 = 0.0; for (std::size_t d = 0; d < ndim; d++) { float diff = x[px + d] - y[py + d]; diffxy[d] = diff; d2 += diff * diff; } return (std::max)(dist_eps, d2); } // The gradient for the dth component of the displacement between two point, // which for Euclidean distance in the output is invariably grad_coeff * (X - Y) // Different methods clamp the magnitude of the gradient to different values template auto grad_d(const Gradient &gradient, const std::vector &disp, std::size_t d, float grad_coeff) -> float { return gradient.clamp_grad(grad_coeff * disp[d]); } template auto grad_attr(const Gradient &gradient, const std::vector &head_embedding, std::size_t dj, const std::vector &tail_embedding, std::size_t dk, std::size_t ndim, std::vector &disp) -> float { static const float dist_eps = std::numeric_limits::epsilon(); float d2 = d2diff(head_embedding, dj, tail_embedding, dk, ndim, dist_eps, disp); return gradient.grad_attr(d2, dj, dk); } template auto grad_rep(const Gradient &gradient, const std::vector &head_embedding, std::size_t dj, const std::vector &tail_embedding, std::size_t dk, std::size_t ndim, std::vector &disp) -> float { static const float dist_eps = std::numeric_limits::epsilon(); float d2 = d2diff(head_embedding, dj, tail_embedding, dk, ndim, dist_eps, disp); return gradient.grad_rep(d2, dj, dk); } // https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/ // an approximation to pow inline auto fastPrecisePow(float a, float b) -> float { // calculate approximation with fraction of the exponent int e = static_cast(b); union { double d; int x[2]; } u = {a}; u.x[1] = static_cast((b - static_cast(e)) * static_cast(u.x[1] - 1072632447) + 1072632447.0); u.x[0] = 0; // exponentiation by squaring with the exponent's integer part // double r = u.d makes everything much slower, not sure why double r = 1.0; while (e) { if (e & 1) { r *= a; } a *= a; e >>= 1; } return static_cast(r * u.d); } // Class templated on the powfun function as suggested by Aaron Lun template class base_umap_gradient { public: base_umap_gradient(float a, float b, float gamma) : a(a), b(b), a_b_m2(-2.0 * a * b), gamma_b_2(2.0 * gamma * b){}; // Compared to the UMAP Python implementation, instead of doing d2^(b-1) // we can save a power calculation by using d2^b / d2 auto grad_attr(float d2, std::size_t, std::size_t) const -> float { float pd2b = powfun(d2, b); return (a_b_m2 * pd2b) / (d2 * (a * pd2b + 1.0)); } auto grad_rep(float d2, std::size_t, std::size_t) const -> float { return gamma_b_2 / ((0.001 + d2) * (a * powfun(d2, b) + 1.0)); } auto clamp_grad(float grad_d) const -> float { return clamp(grad_d, clamp_lo, clamp_hi); } static const constexpr float clamp_hi = 4.0; static const constexpr float clamp_lo = -4.0; private: float a; float b; float a_b_m2; float gamma_b_2; }; // UMAP using standard power function using umap_gradient = base_umap_gradient; // apUMAP: UMAP with an approximate power calculation using apumap_gradient = base_umap_gradient; // t-UMAP: the UMAP function with a = 1, and b = 1, which results in the Cauchy // distribution as used in 
t-SNE. This massively simplifies the gradient, // removing the pow calls, resulting in a noticeable speed increase (50% with // MNIST), although the resulting embedding has a larger spread than the // default. class tumap_gradient { public: tumap_gradient(float gamma) : gamma_2(gamma * 2.0){}; auto grad_attr(float d2, std::size_t, std::size_t) const -> float { return -2.0 / (d2 + 1.0); } auto grad_rep(float d2, std::size_t, std::size_t) const -> float { return gamma_2 / ((0.001 + d2) * (d2 + 1.0)); } auto clamp_grad(float grad_d) const -> float { return clamp(grad_d, clamp_lo, clamp_hi); } static const constexpr float clamp_hi = 4.0; static const constexpr float clamp_lo = -4.0; private: float gamma_2; }; // UMAP where a varies for each observation class umapai_gradient { public: umapai_gradient(const std::vector &ai, float b, std::size_t ndim) : ai(ai), b(b), ndim(ndim), b_m2(-2.0 * b), b_2(2.0 * b) {} auto clamp_grad(float grad_d) const -> float { return clamp(grad_d, clamp_lo, clamp_hi); } static const constexpr float clamp_hi = 4.0; static const constexpr float clamp_lo = -4.0; auto grad_attr(float d2, std::size_t i, std::size_t j) const -> float { auto a = ai[i / ndim] * ai[j / ndim]; float pd2b = std::pow(d2, b); return (a * b_m2 * pd2b) / (d2 * (a * pd2b + 1.0)); } auto grad_rep(float d2, std::size_t i, std::size_t j) const -> float { auto a = ai[i / ndim] * ai[j / ndim]; return b_2 / ((0.001 + d2) * (a * std::pow(d2, b) + 1.0)); } private: std::vector ai; float b; std::size_t ndim; float b_m2; float b_2; }; class umapai2_gradient { public: umapai2_gradient(const std::vector &ai, const std::vector &aj, float b, std::size_t ndim) : ai(ai), aj(aj), b(b), ndim(ndim), b_m2(-2.0 * b), b_2(2.0 * b) {} auto clamp_grad(float grad_d) const -> float { return clamp(grad_d, clamp_lo, clamp_hi); } static const constexpr float clamp_hi = 4.0; static const constexpr float clamp_lo = -4.0; auto grad_attr(float d2, std::size_t i, std::size_t j) const -> float { auto a = ai[i / ndim] * aj[j / ndim]; float pd2b = std::pow(d2, b); return (a * b_m2 * pd2b) / (d2 * (a * pd2b + 1.0)); } auto grad_rep(float d2, std::size_t i, std::size_t j) const -> float { auto a = ai[i / ndim] * aj[j / ndim]; return b_2 / ((0.001 + d2) * (a * std::pow(d2, b) + 1.0)); } private: std::vector ai; std::vector aj; float b; std::size_t ndim; float b_m2; float b_2; }; class largevis_gradient { public: largevis_gradient(float gamma) : gamma_2(gamma * 2.0) {} auto grad_attr(float d2, std::size_t, std::size_t) const -> float { return -2.0 / (d2 + 1.0); } auto grad_rep(float d2, std::size_t, std::size_t) const -> float { return gamma_2 / ((0.1 + d2) * (d2 + 1.0)); } auto clamp_grad(float grad_d) const -> float { return clamp(grad_d, clamp_lo, clamp_hi); } static const constexpr float clamp_hi = 5.0; static const constexpr float clamp_lo = -5.0; private: float gamma_2; }; } // namespace uwot #endif // UWOT_GRADIENT_H uwot/inst/include/uwot/optimize.h0000644000176200001440000001065414730166740016675 0ustar liggesusers// BSD 2-Clause License // // Copyright 2020 James Melville // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // 2. 
Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // OF SUCH DAMAGE. #ifndef UWOT_OPTIMIZE_H #define UWOT_OPTIMIZE_H #include namespace uwot { float linear_decay(double val, std::size_t epoch, std::size_t n_epochs) { return val * (1.0 - (static_cast(epoch) / static_cast(n_epochs))); } float linear_grow(double val, std::size_t epoch, std::size_t n_epochs) { return val * (static_cast(epoch) / static_cast(n_epochs)); } struct Optimizer { virtual ~Optimizer() = default; virtual void update(std::vector &v, std::vector &grad, std::size_t begin, std::size_t end) = 0; virtual void epoch_end(std::size_t epoch, std::size_t n_epochs) = 0; }; struct Sgd : public Optimizer { float initial_alpha; float alpha; Sgd(float alpha) : initial_alpha(alpha), alpha(alpha){}; virtual ~Sgd() = default; void update(std::vector &v, std::vector &grad, std::size_t begin, std::size_t end) override { for (std::size_t i = begin; i < end; i++) { v[i] += alpha * grad[i]; } } void epoch_end(std::size_t epoch, std::size_t n_epochs) override { alpha = linear_decay(initial_alpha, epoch, n_epochs); } }; struct Adam : public Optimizer { float initial_alpha; float alpha; float beta1; float beta2; float beta11; // 1 - beta1 float beta1t; // beta1 ^ t float beta21; // 1 - beta2 float beta2t; // beta2 ^ t float eps; // rather than calculate the debiased values for m and v (mhat and vhat) // directly, fold them into a scaling factor for alpha as described at the // end of the first paragraph of section 2 of the Adam paper // technically you also need to rescale eps (given as eps-hat in the paper) float epsc; // scaled eps float ad_scale; // scaled alpha std::vector mt; std::vector vt; Adam(float alpha, float beta1, float beta2, float eps, std::size_t vec_size) : initial_alpha(alpha), alpha(alpha), beta1(beta1), beta2(beta2), beta11(1.0 - beta1), beta1t(beta1), beta21(1.0 - beta2), beta2t(beta2), eps(eps), epsc(eps * sqrt(beta21)), ad_scale(alpha * sqrt(beta21) / beta11), mt(vec_size), vt(vec_size) {} virtual ~Adam() = default; void update(std::vector &v, std::vector &grad, std::size_t begin, std::size_t end) override { for (std::size_t i = begin; i < end; i++) { // this takes advantage of updating in-place to give a more compact // version of mt[i] = beta1 * mt[i] + beta11 * grad[i] etc. 
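// (Writing the algebra out: with beta11 = 1 - beta1 and beta21 = 1 - beta2,
//   beta1 * mt[i] + beta11 * grad[i]           == mt[i] + beta11 * (grad[i] - mt[i])
//   beta2 * vt[i] + beta21 * grad[i] * grad[i] == vt[i] + beta21 * (grad[i] * grad[i] - vt[i])
// so the fused updates below are exactly the standard Adam moment updates.)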
vt[i] += beta21 * (grad[i] * grad[i] - vt[i]); mt[i] += beta11 * (grad[i] - mt[i]); // ad_scale and epsc handle the debiasing v[i] += ad_scale * mt[i] / (sqrt(vt[i]) + epsc); } } void epoch_end(std::size_t epoch, std::size_t n_epochs) override { alpha = linear_decay(initial_alpha, epoch, n_epochs); // update debiasing factors beta1t *= beta1; beta2t *= beta2; float sqrt_b2t1 = sqrt(1.0 - beta2t); // rescale alpha and eps to take account of debiasing ad_scale = alpha * sqrt_b2t1 / (1.0 - beta1t); epsc = sqrt_b2t1 * eps; } }; } // namespace uwot #endif // UWOT_OPTIMIZE_H uwot/inst/include/uwot/transform.h0000644000176200001440000000711114730166740017042 0ustar liggesusers// BSD 2-Clause License // // Copyright 2020 James Melville // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // OF SUCH DAMAGE. 
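// Summary of the routines below: init_by_mean places each new (test) point
// at the mean of its nearest neighbors' coordinates in the training
// embedding; when edge weights are supplied the mean is weighted:
//
//   embedding[i][k] = sum_j w_ij * train_embedding[nbr_ij][k] / sum_j w_ij
//
// and in the unweighted case every w_ij is 1 (so sumw = n_neighbors).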
#ifndef UWOT_TRANSFORM_H #define UWOT_TRANSFORM_H namespace uwot { inline auto sum_nbrs(std::size_t i, const std::vector &train_embedding, const std::vector &nn_index, std::size_t n_neighbors, std::size_t ndim, std::vector &sumc) -> void { for (std::size_t j = 0; j < n_neighbors; j++) { auto nbr = nn_index[i * n_neighbors + j]; for (std::size_t k = 0; k < ndim; k++) { sumc[k] += train_embedding[ndim * nbr + k]; } } } inline auto sum_nbrs_weighted(std::size_t i, const std::vector &train_embedding, const std::vector &nn_index, std::size_t n_neighbors, std::size_t ndim, const std::vector &nn_weights, std::vector &sumc, double &sumw) -> void { std::size_t i_nbrs = i * n_neighbors; for (std::size_t j = 0; j < n_neighbors; j++) { auto w = nn_weights[i_nbrs + j]; sumw += w; auto nbr = nn_index[i_nbrs + j]; for (std::size_t k = 0; k < ndim; k++) { sumc[k] += train_embedding[ndim * nbr + k] * w; } } } void init_by_mean(std::size_t begin, std::size_t end, std::size_t ndim, std::size_t n_neighbors, const std::vector &nn_index, const std::vector &nn_weights, std::size_t n_test_vertices, const std::vector &train_embedding, std::size_t n_train_vertices, std::vector &embedding) { bool weighted = nn_weights.size() > 0; std::vector sumc(ndim); for (std::size_t i = begin; i < end; i++) { std::fill(sumc.begin(), sumc.end(), 0.0); double sumw = 0.0; // cost of checking this boolean N times is not going to be a bottleneck if (weighted) { sum_nbrs_weighted(i, train_embedding, nn_index, n_neighbors, ndim, nn_weights, sumc, sumw); } else { sumw = static_cast(n_neighbors); sum_nbrs(i, train_embedding, nn_index, n_neighbors, ndim, sumc); } for (std::size_t k = 0; k < ndim; k++) { embedding[ndim * i + k] = sumc[k] / sumw; } } } } // namespace uwot #endif // UWOT_TRANSFORM_H uwot/inst/include/uwot/update.h0000644000176200001440000002254114733074465016322 0ustar liggesusers// BSD 2-Clause License // // Copyright 2021 James Melville // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // OF SUCH DAMAGE. #ifndef UWOT_UPDATE_H #define UWOT_UPDATE_H #include #include "gradient.h" #include "optimize.h" #include "sampler.h" namespace uwot { // (User-supplied) callback to be invoked at the end of each epoch // e.g. 
to plot or save coordinates for diagnostic purposes struct EpochCallback { virtual void operator()(std::size_t epoch, std::size_t n_epochs, const std::vector &head_embedding, const std::vector &tail_embedding) = 0; virtual ~EpochCallback() {} }; struct DoNothingCallback : EpochCallback { void operator()(std::size_t, std::size_t, const std::vector &, const std::vector &) override {} }; template inline void process_edge(Update &update, Gradient &gradient, Sampler &sampler, Prng &prng, const std::vector &positive_head, const std::vector &positive_tail, std::size_t ndim, std::size_t n_tail_vertices, std::size_t edge, std::size_t thread_id, std::vector &disp) { if (!sampler.is_sample_edge(edge)) { return; } const std::size_t dj = ndim * positive_head[edge]; const std::size_t dk = ndim * positive_tail[edge]; // j and k are joined by an edge: push them together update_attract(update, gradient, dj, dk, ndim, disp, thread_id); // Negative sampling step: assume any other point (dkn) is a -ve example const std::size_t epoch = sampler.epoch; std::size_t n_neg_samples = sampler.get_num_neg_samples(edge); for (std::size_t p = 0; p < n_neg_samples; p++) { const std::size_t dkn = prng(n_tail_vertices, edge, epoch) * ndim; if (dj == dkn) { continue; } // push them apart update_repel(update, gradient, dj, dkn, ndim, disp, thread_id); } sampler.next_sample(edge, n_neg_samples); } template inline void update_attract(Update &update, const Gradient &gradient, std::size_t dj, std::size_t dk, std::size_t ndim, std::vector &disp, std::size_t key) { float grad_coeff = grad_attr(gradient, update.head_embedding, dj, update.tail_embedding, dk, ndim, disp); for (std::size_t d = 0; d < ndim; d++) { update.attract(dj, dk, d, grad_d(gradient, disp, d, grad_coeff), key); } } template inline void update_repel(Update &update, const Gradient &gradient, std::size_t dj, std::size_t dk, std::size_t ndim, std::vector &disp, std::size_t key) { float grad_coeff = grad_rep(gradient, update.head_embedding, dj, update.tail_embedding, dk, ndim, disp); for (std::size_t d = 0; d < ndim; d++) { update.repel(dj, dk, d, grad_d(gradient, disp, d, grad_coeff), key); } } // Function to decide whether to move both vertices in an edge // Default empty version does nothing: used in umap_transform when // some of the vertices should be held fixed template inline void update_vec(std::vector &, float, std::size_t, std::size_t) {} // Specialization to move vertex/update gradient: used in umap when both // vertices in an edge should be moved template <> inline void update_vec(std::vector &vec, float val, std::size_t i, std::size_t j) { vec[i + j] += val; } // If DoMoveTailVertex = true, graph is symmetric and head and tail point to the // same data. So we can just update the head coord i with double the gradient // now and not worry about updating it when it shows up in the edge list as tail // point j template inline void update_head_grad_vec(std::vector &vec, std::size_t i, float val) { vec[i] += 2.0 * val; } // Specialization for DoMoveTailVertex = true. 
In this case the edges are not // symmetric and the tail embedding should be held fixed, so the head node only // get one lot of gradient updating template <> inline void update_head_grad_vec(std::vector &vec, std::size_t i, float val) { vec[i] += val; } // DoMoveVertex: true if both ends of a positive edge should be updated template struct InPlaceUpdate { std::vector &head_embedding; std::vector &tail_embedding; Sgd opt; std::unique_ptr epoch_callback; InPlaceUpdate(std::vector &head_embedding, std::vector &tail_embedding, float alpha, EpochCallback *epoch_callback) : head_embedding(head_embedding), tail_embedding(tail_embedding), opt(alpha), epoch_callback(std::move(epoch_callback)) {} inline void attract(std::size_t dj, std::size_t dk, std::size_t d, float grad_d, std::size_t) { float update_d = opt.alpha * grad_d; head_embedding[dj + d] += update_d; // we don't only always want points in the tail to move // e.g. if adding new points to an existing embedding update_vec(tail_embedding, -update_d, d, dk); } inline void repel(std::size_t dj, std::size_t dk, std::size_t d, float grad_d, std::size_t) { head_embedding[dj + d] += opt.alpha * grad_d; // Python implementation doesn't move the negative sample but as Damrich // and Hamprecht (2021) note, it ought to. However they also note it has // no qualitative effect on the results. This is presumably because // including this repulsion is the same as doubling the negative sample rate // which doesn't have a huge effect on going from the default of 5 to 10 // update_vec(tail_embedding, opt.alpha * -grad_d, d, dk); } inline void epoch_begin(std::size_t, std::size_t) {} template void epoch_end(std::size_t epoch, std::size_t n_epochs, Parallel &) { opt.epoch_end(epoch, n_epochs); (*epoch_callback)(epoch, n_epochs, head_embedding, tail_embedding); } }; // 1. When DoMoveVertex is true then we want to update the head and tail nodes // of an edge. In this case the head and tail coordinates point to the same data // so it doesn't matter whether we calculate the gradient for or update the // coordinates in head or tail. // 2. When DoMoveVertex is false then the head and tail coordinates point to // different data. The tail coordinates are fixed in this case, so again they // do not move. Hence both so in both cases we only ever need to update the head // coordinates. 
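// Epoch lifecycle of BatchUpdate (summarizing the code below):
//   epoch_begin:   zero the gradient accumulator
//   attract/repel: accumulate gradient contributions for the head coordinates
//   epoch_end:     hand the accumulated gradient to the Optimizer in parallel
//                  chunks, then invoke the user-supplied epoch callback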
template struct BatchUpdate { std::vector &head_embedding; std::vector &tail_embedding; std::unique_ptr opt; std::vector gradient; std::unique_ptr epoch_callback; BatchUpdate(std::vector &head_embedding, std::vector &tail_embedding, std::unique_ptr opt, EpochCallback *epoch_callback) : head_embedding(head_embedding), tail_embedding(tail_embedding), opt(std::move(opt)), gradient(head_embedding.size()), epoch_callback(std::move(epoch_callback)) {} inline void attract(std::size_t dj, std::size_t dk, std::size_t d, float grad_d, std::size_t) { update_head_grad_vec(gradient, dj + d, grad_d); } inline void repel(std::size_t dj, std::size_t dk, std::size_t d, float grad_d, std::size_t) { gradient[dj + d] += grad_d; } void epoch_begin(std::size_t, std::size_t) { std::fill(gradient.begin(), gradient.end(), 0.0f); } template void epoch_end(std::size_t epoch, std::size_t n_epochs, Parallel ¶llel) { auto worker = [&](std::size_t begin, std::size_t end, std::size_t) { opt->update(head_embedding, gradient, begin, end); }; parallel.pfor(head_embedding.size(), worker); opt->epoch_end(epoch, n_epochs); (*epoch_callback)(epoch, n_epochs, head_embedding, tail_embedding); } }; } // namespace uwot #endif // UWOT_UPDATE_H uwot/inst/include/uwot/connected_components.h0000644000176200001440000000562414577210515021244 0ustar liggesusers// BSD 2-Clause License // // Copyright 2020 James Melville // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // OF SUCH DAMAGE. 
// Translated from the Python source code of: // scipy.sparse.csgraph.connected_components // Author: Jake Vanderplas -- // License: BSD, (C) 2012 #ifndef UWOT_CONNECTED_COMPONENTS_H #define UWOT_CONNECTED_COMPONENTS_H namespace uwot { auto connected_components_undirected(std::size_t N, const std::vector<int> &indices1, const std::vector<int> &indptr1, const std::vector<int> &indices2, const std::vector<int> &indptr2) -> std::pair<unsigned int, std::vector<int>> { constexpr int VOID = -1; constexpr int END = -2; std::vector<int> labels(N, VOID); std::vector<int> SS(labels); unsigned int label = 0; auto SS_head = END; for (std::size_t v = 0; v < N; ++v) { auto vv = v; if (labels[vv] == VOID) { SS_head = vv; SS[vv] = END; while (SS_head != END) { vv = SS_head; SS_head = SS[vv]; labels[vv] = label; for (auto jj = indptr1[vv]; jj < indptr1[vv + 1]; ++jj) { auto ww = indices1[jj]; if (SS[ww] == VOID) { SS[ww] = SS_head; SS_head = ww; } } for (auto jj = indptr2[vv]; jj < indptr2[vv + 1]; ++jj) { auto ww = indices2[jj]; if (SS[ww] == VOID) { SS[ww] = SS_head; SS_head = ww; } } } ++label; } } return {label, labels}; } } // namespace uwot #endif // UWOT_CONNECTED_COMPONENTS_H uwot/inst/include/uwot/sampler.h0000644000176200001440000000554514577210515016500 0ustar liggesusers// BSD 2-Clause License // // Copyright 2020 James Melville // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE.
#ifndef UWOT_SAMPLER_H #define UWOT_SAMPLER_H #include <vector> namespace uwot { // Weighted edge sampler class Sampler { public: std::size_t epoch; Sampler(const std::vector<float> &epochs_per_sample, float negative_sample_rate) : epoch(0), epochs_per_sample(epochs_per_sample), epoch_of_next_sample(epochs_per_sample), epochs_per_negative_sample(epochs_per_sample.size()), epoch_of_next_negative_sample(epochs_per_sample.size()) { std::size_t esz = epochs_per_sample.size(); float nsr = 1.0 / negative_sample_rate; for (std::size_t i = 0; i < esz; i++) { epochs_per_negative_sample[i] = epochs_per_sample[i] * nsr; epoch_of_next_negative_sample[i] = epochs_per_negative_sample[i]; } } auto is_sample_edge(std::size_t edge) const -> bool { return epoch_of_next_sample[edge] <= epoch; } auto get_num_neg_samples(std::size_t edge) const -> std::size_t { return static_cast<std::size_t>( (epoch - epoch_of_next_negative_sample[edge]) / epochs_per_negative_sample[edge]); } void next_sample(std::size_t edge, std::size_t num_neg_samples) { epoch_of_next_sample[edge] += epochs_per_sample[edge]; epoch_of_next_negative_sample[edge] += num_neg_samples * epochs_per_negative_sample[edge]; } private: std::vector<float> epochs_per_sample; std::vector<float> epoch_of_next_sample; std::vector<float> epochs_per_negative_sample; std::vector<float> epoch_of_next_negative_sample; }; } // namespace uwot #endif // UWOT_SAMPLER_H uwot/inst/doc/0000755000176200001440000000000014757004303012775 5ustar liggesusersuwot/inst/doc/uwot.R0000644000176200001440000000660114757004302014120 0ustar liggesusers## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup-------------------------------------------------------------------- library(uwot) library(RSpectra) ## ----plot function------------------------------------------------------------ kabsch <- function(pm, qm) { pm_dims <- dim(pm) if (!all(dim(qm) == pm_dims)) { stop(call.
= TRUE, "Point sets must have the same dimensions") } # The rotation matrix will have (ncol - 1) leading ones in the diagonal diag_ones <- rep(1, pm_dims[2] - 1) # center the points pm <- scale(pm, center = TRUE, scale = FALSE) qm <- scale(qm, center = TRUE, scale = FALSE) am <- crossprod(pm, qm) svd_res <- svd(am) # use the sign of the determinant to ensure a right-hand coordinate system d <- determinant(tcrossprod(svd_res$v, svd_res$u))$sign dm <- diag(c(diag_ones, d)) # rotation matrix um <- svd_res$v %*% tcrossprod(dm, svd_res$u) # Rotate and then translate to the original centroid location of qm sweep(t(tcrossprod(um, pm)), 2, -attr(qm, "scaled:center")) } iris_pca2 <- prcomp(iris[, 1:4])$x[, 1:2] plot_umap <- function(coords, col = iris$Species, pca = iris_pca2) { plot(kabsch(coords, pca), col = col, xlab = "", ylab = "") } ## ----basic UMAP--------------------------------------------------------------- set.seed(42) iris_umap <- umap(iris) plot_umap(iris_umap) ## ----min_dist 0.5------------------------------------------------------------- set.seed(42) iris_umap_md05 <- umap(iris, min_dist = 0.3) plot_umap(iris_umap_md05) ## ----5 neighbors-------------------------------------------------------------- set.seed(42) iris_umap_nbrs5 <- umap(iris, n_neighbors = 5, min_dist = 0.3) plot_umap(iris_umap_nbrs5) ## ----100 neighbors------------------------------------------------------------ set.seed(42) iris_umap_nbrs100 <- umap(iris, n_neighbors = 100, min_dist = 0.3) plot_umap(iris_umap_nbrs100) ## ----spca init---------------------------------------------------------------- set.seed(42) iris_umap_spca <- umap(iris, init = "spca", init_sdev = "range", min_dist = 0.3 ) plot_umap(iris_umap_spca) ## ----UMAP with density scaling------------------------------------------------ set.seed(42) iris_umapds <- umap(iris, min_dist = 0.3, dens_scale = 0.5) plot_umap(iris_umapds) ## ----create a UMAP model------------------------------------------------------ set.seed(42) iris_train <- iris[iris$Species %in% c("setosa", "versicolor"), ] iris_train_umap <- umap(iris_train, min_dist = 0.3, ret_model = TRUE) plot( iris_train_umap$embedding, col = iris_train$Species, xlab = "", ylab = "", main = "UMAP setosa + versicolor" ) ## ----embed new coordinates---------------------------------------------------- iris_test <- iris[iris$Species == "virginica", ] set.seed(42) iris_test_umap <- umap_transform(iris_test, iris_train_umap) plot( rbind(iris_train_umap$embedding, iris_test_umap), col = iris$Species, xlab = "", ylab = "", main = "UMAP transform virginica" ) ## ----echo=FALSE, out.width="75%", fig.cap="MNIST UMAP (Python)"--------------- knitr::include_graphics("mnist-py.png") ## ----echo=FALSE, out.width="75%", fig.cap="MNIST UMAP (R)"-------------------- knitr::include_graphics("mnist-r.png") uwot/inst/doc/uwot.html0000644000176200001440000074434314757004303014700 0ustar liggesusers uwot

uwot

library(uwot)
#> Loading required package: Matrix
library(RSpectra)

uwot is an R package implementing the UMAP dimensionality reduction method. For more information on UMAP, see the original paper and the Python package.

We’ll use the iris dataset in these examples. It’s not the ideal dataset because it’s neither terribly large nor high-dimensional (with only 4 numeric columns), but you’ll get the general idea.

The default output dimensionality of UMAP is two dimensions, so it’s amenable to visualization, but you can set a larger value with n_components. In this vignette we’ll stick with two dimensions. We will need a function to make plotting easier:

kabsch <- function(pm, qm) {
  pm_dims <- dim(pm)
  if (!all(dim(qm) == pm_dims)) {
    stop(call. = TRUE, "Point sets must have the same dimensions")
  }
  # The rotation matrix will have (ncol - 1) leading ones in the diagonal
  diag_ones <- rep(1, pm_dims[2] - 1)

  # center the points
  pm <- scale(pm, center = TRUE, scale = FALSE)
  qm <- scale(qm, center = TRUE, scale = FALSE)

  am <- crossprod(pm, qm)

  svd_res <- svd(am)
  # use the sign of the determinant to ensure a right-hand coordinate system
  d <- determinant(tcrossprod(svd_res$v, svd_res$u))$sign
  dm <- diag(c(diag_ones, d))

  # rotation matrix
  um <- svd_res$v %*% tcrossprod(dm, svd_res$u)

  # Rotate and then translate to the original centroid location of qm
  sweep(t(tcrossprod(um, pm)), 2, -attr(qm, "scaled:center"))
}
iris_pca2 <- prcomp(iris[, 1:4])$x[, 1:2]
plot_umap <- function(coords, col = iris$Species, pca = iris_pca2) {
  plot(kabsch(coords, pca), col = col, xlab = "", ylab = "")
}

Most of this code is just the kabsch algorithm to align two point sets, which I am going to use to align the results of UMAP over the first two principal components. This is to keep the relative orientation of the output the same across different plots which makes it a bit easier to see the differences between them. UMAP is a stochastic algorithm so the output will be different each time you run it and small changes to the parameters can affect the absolute values of the coordinates, although the interpoint differences are usually similar. There’s no need to go to such trouble in most circumstances: the output of umap is a perfectly useful 2D matrix of coordinates you can pass into a plotting function with no further processing required.

Basic UMAP

The defaults of the umap function should work for most datasets. No scaling of the input data is done, but non-numeric columns are ignored:

set.seed(42)
iris_umap <- umap(iris)
plot_umap(iris_umap)

Parameters

uwot has accumulated many parameters over time, but most of the time there are only a handful you need to worry about. The most important ones are:

min_dist

This is a mainly aesthetic parameter, which defines how close points can get in the output space. A smaller value tends to make any clusters in the output more compact. You should experiment with values between 0 and 1, although don’t choose exactly zero. The default is 0.01, which seems like it’s a bit small for iris. Let’s crank up min_dist to 0.3:

set.seed(42)
iris_umap_md05 <- umap(iris, min_dist = 0.3)
plot_umap(iris_umap_md05)

This has made the clusters bigger and closer together, so we’ll use min_dist = 0.3 for the other examples with iris.

n_neighbors

This defines the number of items in the dataset that define the neighborhood around each point. Set it too low and you will get a more fragmented layout. Set it too high and you will get something that will miss a lot of local structure.

Here’s a result with 5 neighbors:

set.seed(42)
iris_umap_nbrs5 <- umap(iris, n_neighbors = 5, min_dist = 0.3)
plot_umap(iris_umap_nbrs5)

It’s not hugely different from the default of 15 neighbors, but the clusters are a bit more broken up.

There should be a more pronounced difference going the other way and looking at 100 neighbors:

set.seed(42)
iris_umap_nbrs100 <- umap(iris, n_neighbors = 100, min_dist = 0.3)
plot_umap(iris_umap_nbrs100)

Here there is a much more uniform appearance to the results. It’s always worth trying a few different values of n_neighbors, especially larger values, although larger values of n_neighbors will lead to longer run times. Sometimes small clusters that you think are meaningful may in fact be artifacts of setting n_neighbors too small, so starting with a larger value and looking at the effect of reducing n_neighbors can help you avoid over-interpreting results.
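
As a minimal sketch of that advice (the object name nbr_sweep is purely illustrative, not part of uwot), you can sweep from a large to a small neighborhood size and compare the plots:

nbr_sweep <- lapply(c(100, 50, 15, 5), function(k) {
  set.seed(42)
  umap(iris, n_neighbors = k, min_dist = 0.3)
})
# e.g. call par(mfrow = c(2, 2)) first to see all four side by side
for (coords in nbr_sweep) plot_umap(coords)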

init

The default initialization of UMAP is to use spectral initialization, which acts upon the (symmetrized) k-nearest neighbor graph that is determined by your choice of n_neighbors. This is usually a good choice, but it involves a very sparse matrix, which can sometimes be a bit too sparse; this leads to numerical difficulties that manifest as slow run times or even hanging calculations. If your dataset causes these issues, you can try increasing n_neighbors, but I have seen cases where that would be inconvenient in terms of CPU and RAM usage. An alternative is to use the first two principal components of the data, which at least uses the data you provide to give a solid global picture of the data that UMAP can refine. It’s not appropriate for every dataset, but in most cases, it’s a perfectly good alternative.

The only gotcha with it is that depending on the scaling of your data, the initial coordinates can have large inter-point distances. UMAP will not optimize that well, so such an output should be scaled to a small standard deviation. If you set init = "spca", it will do all that for you, although to be more aligned with the UMAP coordinate initialization, I recommend you also set init_sdev = "range". init_sdev can also take a numerical value for the standard deviation. Values from 1e-4 to 10 are reasonable, but I recommend you stick to the default of "range".

set.seed(42)
iris_umap_spca <-
  umap(iris,
    init = "spca",
    init_sdev = "range",
    min_dist = 0.3
  )
plot_umap(iris_umap_spca)

This doesn’t have a big effect on iris, but it’s good to know about this as an option, and it can also smooth out the effect of changing n_neighbors on the initial coordinates with the standard spectral initialization, which can make it easier to see the effect of changing n_neighbors on the final result.

Some other init options to know about:

  • "random": if the worst comes to the worst, you can always fall back to randomly assigning the initial coordinates. You really want to avoid this if you can though, because it will take longer to optimize the coordinates to the same quality, so you will need to increase n_epochs to compensate. Even if you do that, it’s much more likely that you will end up in a minimum that is less desirable than one based on a good initialization. This will make interpreting the results harder, as you are more likely to end up with different clusters beings split or mixed with each other.
  • If you have some coordinates you like from another method, you can pass them in as a matrix, as in the sketch below. But remember you will probably want to scale them with init_sdev.
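
As a hedged sketch of that last option (reusing the iris_pca2 matrix computed earlier; the name iris_umap_pca_init is illustrative), you can pass your own coordinates straight in as init and let init_sdev rescale them:

set.seed(42)
iris_umap_pca_init <- umap(iris, init = iris_pca2, init_sdev = "range", min_dist = 0.3)
plot_umap(iris_umap_pca_init)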

dens_scale

The dens_scale parameter varies from 0 to 1 and controls how much of the relative density of the input data UMAP attempts to preserve in the output.

set.seed(42)
iris_umapds <- umap(iris, min_dist = 0.3, dens_scale = 0.5)
plot_umap(iris_umapds)

This has shrunk the black cluster on the left of the plot (those are of species setosa), which reflects that the setosa points are less spread out in the input data than those of the other two species. For more on dens_scale please read its dedicated article.

Embedding New Data

Once you have an embedding, you can use it to embed new data, although you need to remember to ask for a “model” to be returned. Instead of just the coordinates, you will now get back a list which contains all the extra parameters you will need for transforming new data. The coordinates are still available in the $embedding component.

Let’s try building a UMAP with just the setosa and versicolor iris species:

set.seed(42)

iris_train <- iris[iris$Species %in% c("setosa", "versicolor"), ]
iris_train_umap <-
  umap(iris_train, min_dist = 0.3, ret_model = TRUE)
plot(
  iris_train_umap$embedding,
  col = iris_train$Species,
  xlab = "",
  ylab = "",
  main = "UMAP setosa + versicolor"
)

Next, you can use umap_transform to embed the new points:

iris_test <- iris[iris$Species == "virginica", ]
set.seed(42)
iris_test_umap <- umap_transform(iris_test, iris_train_umap)
plot(
  rbind(iris_train_umap$embedding, iris_test_umap),
  col = iris$Species,
  xlab = "",
  ylab = "",
  main = "UMAP transform virginica"
)

The green points in the top-right show the embedded data. Note that the original (black and red) clusters do not get optimized any further. While we haven’t perfectly reproduced the full UMAP, the virginica points are located in more or less the right place, close to the versicolor items. Just like with any machine learning method, you must be careful with how you choose your training set.

Supported Distances

For small datasets (N < 4096) and the Euclidean distance, exact nearest neighbors are found using the FNN package. Otherwise, approximate nearest neighbors are found using RcppAnnoy. The supported distance metrics (set by the metric parameter) are:

  • Euclidean
  • Cosine
  • Pearson Correlation (correlation)
  • Manhattan
  • Hamming

Exactly what constitutes the cosine distance can differ between packages. uwot tries to follow how the Python version of UMAP defines it, which is 1 minus the cosine similarity. This differs slightly from how Annoy defines its angular distance, so be aware that uwot internally converts the Annoy version of the distance. Also be aware that the Pearson correlation distance is the cosine distance applied to row-centered vectors.
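
For example, a minimal syntax sketch with the cosine metric (iris isn’t a natural fit for angular distances, so treat this as illustrative only):

set.seed(42)
iris_umap_cos <- umap(iris, metric = "cosine", min_dist = 0.3)
plot_umap(iris_umap_cos)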

If you need other metrics, and can generate the nearest neighbor info externally, you can pass the data directly to uwot via the nn_method parameter. Please note that the Hamming support is a lot slower than the other metrics. I do not recommend using it if you have more than a few hundred features, and even then expect it to take several minutes during the index building phase in situations where the Euclidean metric would take only a few seconds.
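
Here is a hedged sketch of the list format that nn_method accepts, building the neighbor data by hand with the FNN package (the names k and iris_nn are illustrative). Note that each observation must appear as its own first neighbor, with a distance of zero:

library(FNN)
n <- nrow(iris)
k <- 15
# exact Euclidean neighbors, excluding self, so ask for k - 1
knn <- FNN::get.knn(as.matrix(iris[, 1:4]), k = k - 1)
iris_nn <- list(
  idx = cbind(seq_len(n), knn$nn.index),
  dist = cbind(rep(0, n), knn$nn.dist)
)
set.seed(42)
iris_umap_nn <- umap(iris, nn_method = iris_nn, min_dist = 0.3)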

Multi-threading support

Parallelization can be used for the nearest neighbor index search, the smooth knn/perplexity calibration, and the optimization, which is the same approach that LargeVis takes.

You can (and should) adjust the number of threads via the n_threads parameter, which controls the nearest neighbor search and smooth knn calibration, and the n_sgd_threads parameter, which controls the number of threads used during optimization. For n_threads, the default is the number of available cores. For n_sgd_threads the default is 0, which ensures reproducibility of results with a fixed seed.
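
For example (a sketch; the thread count is illustrative, adjust it for your machine), this uses four threads for the neighbor search and calibration while keeping the optimization single-threaded, so a fixed seed still gives reproducible output:

set.seed(42)
iris_umap_mt <- umap(iris, n_threads = 4, n_sgd_threads = 0, min_dist = 0.3)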

Python Comparison

For the datasets I’ve tried it with, the results look at least reminiscent of those obtained using the official Python implementation. Below are results for the 70,000 MNIST digits (downloaded using the snedata package). First is the result of using the official Python UMAP implementation (via the reticulate package); under that is the result of using uwot.

MNIST UMAP (Python)


MNIST UMAP (R)


The project documentation contains some more examples, and comparison with Python.

Limitations and Other Issues

Nearest Neighbor Calculation

uwot leans heavily on the Annoy library for approximate nearest neighbor search. As a result, compared to the Python version of UMAP, uwot has much more limited support for different distance measurements, and no support for sparse matrix data input.

However, uwot does let you pass in nearest neighbor data. So if you have access to other nearest neighbor methods, you can generate data that can be used with uwot. See the Nearest Neighbor Data Format article. Or if you can calculate a distance matrix for your data, you can pass it in as a dist object.

For larger distance matrices, you can pass in a sparseMatrix (from the Matrix package).
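
A minimal sketch of the dist route, which for a dataset the size of iris also amounts to an exact Euclidean neighbor search:

set.seed(42)
iris_umap_dist <- umap(dist(iris[, 1:4]), min_dist = 0.3)
plot_umap(iris_umap_dist)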

Experience with COIL-100, which has 49,152 features, suggests that Annoy will definitely struggle with datasets of this dimensionality. Even 3000 dimensions can cause problems, although this is not a difficulty specific to Annoy. Reducing the dimensionality with PCA to an intermediate dimensionality (e.g. 100) can help. Use e.g. pca = 100 to do this. This can also be slow on platforms without good linear algebra support and you should assure yourself that 100 principal components won’t be throwing away excessive amounts of information.
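
A sketch with synthetic data (the matrix X_wide is made up purely for illustration): reduce 300 columns to 100 principal components before the nearest neighbor search:

set.seed(42)
X_wide <- matrix(rnorm(500 * 300), nrow = 500)
X_umap <- umap(X_wide, pca = 100)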

Spectral Initialization

The spectral initialization default for umap (and the Laplacian Eigenmap initialization, init = "laplacian") can sometimes run into problems. If it fails to converge it will fall back to random initialization, but on occasion I’ve seen it take an extremely long time (a couple of hours) to converge. Recent changes have hopefully reduced the chance of this happening, but if initialization is taking more than a few minutes, I suggest stopping the calculation and using the scaled PCA (init = "spca") instead.

Supporting Libraries

All credit to the following packages which do a lot of the hard work:

  • Coordinate initialization uses RSpectra to do the eigendecomposition of the normalized Laplacian.
  • The optional PCA initialization and initial dimensionality reduction uses irlba.
  • The smooth k-nearest neighbor distance and stochastic gradient descent optimization routines are written in C++ (using Rcpp), aping the Python code as closely as possible.
  • Some of the multi-threading code is based on RcppParallel.
uwot/inst/doc/uwot.Rmd0000644000176200001440000003712614756267051014452 0ustar liggesusers--- title: "uwot" output: rmarkdown::html_vignette: fig_width: 4 fig_height: 4 vignette: > %\VignetteIndexEntry{uwot} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r setup} library(uwot) library(RSpectra) ``` `uwot` is an R package implementing the UMAP dimensionality reduction method. For more information on UMAP, see the [original paper](https://arxiv.org/abs/1802.03426) and the [Python package](https://github.com/lmcinnes/umap). We'll use the `iris` dataset in these examples. It's not the ideal dataset because it's neither terribly large nor high-dimensional (with only 4 numeric columns), but you'll get the general idea. The default output dimensionality of UMAP is two dimensions, so it's amenable to visualization, but you can set a larger value with `n_components`. In this vignette we'll stick with two dimensions. We will need a function to make plotting easier: ```{r plot function} kabsch <- function(pm, qm) { pm_dims <- dim(pm) if (!all(dim(qm) == pm_dims)) { stop(call. = TRUE, "Point sets must have the same dimensions") } # The rotation matrix will have (ncol - 1) leading ones in the diagonal diag_ones <- rep(1, pm_dims[2] - 1) # center the points pm <- scale(pm, center = TRUE, scale = FALSE) qm <- scale(qm, center = TRUE, scale = FALSE) am <- crossprod(pm, qm) svd_res <- svd(am) # use the sign of the determinant to ensure a right-hand coordinate system d <- determinant(tcrossprod(svd_res$v, svd_res$u))$sign dm <- diag(c(diag_ones, d)) # rotation matrix um <- svd_res$v %*% tcrossprod(dm, svd_res$u) # Rotate and then translate to the original centroid location of qm sweep(t(tcrossprod(um, pm)), 2, -attr(qm, "scaled:center")) } iris_pca2 <- prcomp(iris[, 1:4])$x[, 1:2] plot_umap <- function(coords, col = iris$Species, pca = iris_pca2) { plot(kabsch(coords, pca), col = col, xlab = "", ylab = "") } ``` Most of this code is just the [kabsch algorithm](https://en.wikipedia.org/wiki/Kabsch_algorithm) to align two point sets, which I am going to use to align the results of UMAP over the first two principal components. This is to keep the relative orientation of the output the same across different plots, which makes it a bit easier to see the differences between them. UMAP is a stochastic algorithm so the output will be different each time you run it and small changes to the parameters can affect the *absolute* values of the coordinates, although the interpoint differences are usually similar. There's no need to go to such trouble in most circumstances: the output of `umap` is a perfectly useful 2D matrix of coordinates you can pass into a plotting function with no further processing required. ## Basic UMAP The defaults of the `umap` function should work for most datasets. No scaling of the input data is done, but non-numeric columns are ignored: ```{r basic UMAP} set.seed(42) iris_umap <- umap(iris) plot_umap(iris_umap) ``` ### Parameters `uwot` has accumulated many parameters over time, but most of the time there are only a handful you need to worry about. The most important ones are: #### `min_dist` This is a mainly aesthetic parameter, which defines how close points can get in the output space. A smaller value tends to make any clusters in the output more compact. You should experiment with values between 0 and 1, although don't choose exactly zero.
The default is 0.01, which seems like it's a bit small for `iris`. Let's crank up `min_dist` to `0.3`: ```{r min_dist 0.5} set.seed(42) iris_umap_md05 <- umap(iris, min_dist = 0.3) plot_umap(iris_umap_md05) ``` This has made the clusters bigger and closer together, so we'll use `min_dist = 0.3` for the other examples with `iris`. #### `n_neighbors` This defines the number of items in the dataset that define the neighborhood around each point. Set it too low and you will get a more fragmented layout. Set it too high and you will get something that will miss a lot of local structure. Here's a result with 5 neighbors: ```{r 5 neighbors} set.seed(42) iris_umap_nbrs5 <- umap(iris, n_neighbors = 5, min_dist = 0.3) plot_umap(iris_umap_nbrs5) ``` It's not hugely different from the default of 15 neighbors, but the clusters are a bit more broken up. There should be a more pronounced difference going the other way and looking at 100 neighbors: ```{r 100 neighbors} set.seed(42) iris_umap_nbrs100 <- umap(iris, n_neighbors = 100, min_dist = 0.3) plot_umap(iris_umap_nbrs100) ``` Here there is a much more uniform appearance to the results. It's always worth trying a few different values of `n_neighbors`, especially larger values, although larger values of `n_neighbors` will lead to longer run times. Sometimes small clusters that you think are meaningful may in fact be artifacts of setting `n_neighbors` too small, so starting with a larger value and looking at the effect of reducing `n_neighbors` can help you avoid over-interpreting results. #### `init` The default initialization of UMAP is to use spectral initialization, which acts upon the (symmetrized) k-nearest neighbor graph that is determined by your choice of `n_neighbors`. This is usually a good choice, but it involves a very sparse matrix, which can sometimes be a bit *too* sparse; this leads to numerical difficulties that manifest as slow run times or even hanging calculations. If your dataset causes these issues, you can try increasing `n_neighbors`, but I have seen cases where that would be inconvenient in terms of CPU and RAM usage. An alternative is to use the first two principal components of the data, which at least uses the data you provide to give a solid global picture of the data that UMAP can refine. It's not appropriate for every dataset, but in most cases, it's a perfectly good alternative. The only gotcha with it is that depending on the scaling of your data, the initial coordinates can have large inter-point distances. UMAP will not optimize that well, so such an output should be scaled to a small standard deviation. If you set `init = "spca"`, it will do all that for you, although to be more aligned with the UMAP coordinate initialization, I recommend you also set `init_sdev = "range"`. `init_sdev` can also take a numerical value for the standard deviation. Values from `1e-4` to `10` are reasonable, but I recommend you stick to the default of `"range"`. ```{r spca init} set.seed(42) iris_umap_spca <- umap(iris, init = "spca", init_sdev = "range", min_dist = 0.3 ) plot_umap(iris_umap_spca) ``` This doesn't have a big effect on `iris`, but it's good to know about this as an option, and it can also smooth out the effect of changing `n_neighbors` on the initial coordinates with the standard spectral initialization, which can make it easier to see the effect of changing `n_neighbors` on the final result.
Some other `init` options to know about: * `"random"`: if the worst comes to the worst, you can always fall back to randomly assigning the initial coordinates. You really want to avoid this if you can though, because it will take longer to optimize the coordinates to the same quality, so you will need to increase `n_epochs` to compensate. Even if you do that, it's *much* more likely that you will end up in a minimum that is less desirable than one based on a good initialization. This will make interpreting the results harder, as you are more likely to end up with different clusters being split or mixed with each other. * If you have some coordinates you like from another method, you can pass them in as a matrix. But remember you will probably want to scale them with `init_sdev`. #### `dens_scale` The `dens_scale` parameter varies from 0 to 1 and controls how much of the relative density of the input data UMAP attempts to preserve in the output. ```{r UMAP with density scaling} set.seed(42) iris_umapds <- umap(iris, min_dist = 0.3, dens_scale = 0.5) plot_umap(iris_umapds) ``` This has shrunk the black cluster on the left of the plot (those are of species `setosa`), which reflects that the `setosa` points are less spread out in the input data than those of the other two species. For more on `dens_scale` please read its dedicated [article](https://jlmelville.github.io/uwot/articles/leopold.html). ## Embedding New Data Once you have an embedding, you can use it to embed new data, although you need to remember to ask for a "model" to be returned. Instead of just the coordinates, you will now get back a list which contains all the extra parameters you will need for transforming new data. The coordinates are still available in the `$embedding` component. Let's try building a UMAP with just the `setosa` and `versicolor` iris species: ```{r create a UMAP model} set.seed(42) iris_train <- iris[iris$Species %in% c("setosa", "versicolor"), ] iris_train_umap <- umap(iris_train, min_dist = 0.3, ret_model = TRUE) plot( iris_train_umap$embedding, col = iris_train$Species, xlab = "", ylab = "", main = "UMAP setosa + versicolor" ) ``` Next, you can use `umap_transform` to embed the new points: ```{r embed new coordinates} iris_test <- iris[iris$Species == "virginica", ] set.seed(42) iris_test_umap <- umap_transform(iris_test, iris_train_umap) plot( rbind(iris_train_umap$embedding, iris_test_umap), col = iris$Species, xlab = "", ylab = "", main = "UMAP transform virginica" ) ``` The green points in the top-right show the embedded data. Note that the original (black and red) clusters do not get optimized any further. While we haven't perfectly reproduced the full UMAP, the `virginica` points are located in more or less the right place, close to the `versicolor` items. Just like with any machine learning method, you must be careful with how you choose your training set. ## Supported Distances For small datasets (N < 4096) and the Euclidean distance, exact nearest neighbors are found using the [FNN](https://cran.r-project.org/package=FNN) package. Otherwise, approximate nearest neighbors are found using [RcppAnnoy](https://cran.r-project.org/package=RcppAnnoy). The supported distance metrics (set by the `metric` parameter) are: * Euclidean * Cosine * Pearson Correlation (`correlation`) * Manhattan * Hamming Exactly what constitutes the cosine distance can differ between packages. `uwot` tries to follow how the Python version of UMAP defines it, which is 1 minus the cosine similarity.
This differs slightly from how Annoy defines its angular distance, so be aware that `uwot` internally converts the Annoy version of the distance. Also be aware that the Pearson correlation distance is the cosine distance applied to row-centered vectors. If you need other metrics, and can generate the nearest neighbor info externally, you can pass the data directly to `uwot` via the `nn_method` parameter. Please note that the Hamming support is a lot slower than the other metrics. I do not recommend using it if you have more than a few hundred features, and even then expect it to take several minutes during the index building phase in situations where the Euclidean metric would take only a few seconds. ## Multi-threading support Parallelization can be used for the nearest neighbor index search, the smooth knn/perplexity calibration, and the optimization, which is the same approach that [LargeVis](https://github.com/lferry007/LargeVis) takes. You can (and should) adjust the number of threads via the `n_threads` parameter, which controls the nearest neighbor search and smooth knn calibration, and the `n_sgd_threads` parameter, which controls the number of threads used during optimization. For `n_threads`, the default is the number of available cores. For `n_sgd_threads` the default is `0`, which ensures reproducibility of results with a fixed seed. ## Python Comparison For the datasets I've tried it with, the results look at least reminiscent of those obtained using the [official Python implementation](https://github.com/lmcinnes/umap). Below are results for the 70,000 MNIST digits (downloaded using the [snedata](https://github.com/jlmelville/snedata) package). First is the result of using the official Python UMAP implementation (via the [reticulate](https://cran.r-project.org/package=reticulate) package); under that is the result of using `uwot`. ```{r, echo=FALSE, out.width="75%", fig.cap="MNIST UMAP (Python)"} knitr::include_graphics("mnist-py.png") ``` ```{r, echo=FALSE, out.width="75%", fig.cap="MNIST UMAP (R)"} knitr::include_graphics("mnist-r.png") ``` The project documentation contains some more [examples](https://jlmelville.github.io/uwot/articles/umap-examples.html), and [comparison with Python](https://jlmelville.github.io/uwot/articles/pycompare.html). ## Limitations and Other Issues ### Nearest Neighbor Calculation `uwot` leans heavily on the [Annoy](https://github.com/spotify/annoy) library for approximate nearest neighbor search. As a result, compared to the Python version of UMAP, `uwot` has much more limited support for different distance measurements, and no support for sparse matrix data input. However, `uwot` *does* let you pass in nearest neighbor data. So if you have access to other nearest neighbor methods, you can generate data that can be used with `uwot`. See the [Nearest Neighbor Data Format](https://jlmelville.github.io/uwot/articles/nearest-neighbors-format.html) article. Or if you can calculate a distance matrix for your data, you can pass it in as a `dist` object. For larger distance matrices, you can pass in a `sparseMatrix` (from the [Matrix](https://cran.r-project.org/package=Matrix) package). Experience with [COIL-100](https://cave.cs.columbia.edu/repository/COIL-100), which has 49,152 features, suggests that Annoy will *definitely* struggle with datasets of this dimensionality. Even 3000 dimensions can cause problems, although this is not a difficulty specific to Annoy. Reducing the dimensionality with PCA to an intermediate dimensionality (e.g. 100) can help. Use e.g.
`pca = 100` to do this. This can also be slow on platforms without good linear algebra support and you should assure yourself that 100 principal components won't be throwing away excessive amounts of information. ### Spectral Initialization The spectral initialization default for `umap` (and the Laplacian Eigenmap initialization, `init = "laplacian"`) can sometimes run into problems. If it fails to converge it will fall back to random initialization, but on occasion I've seen it take an extremely long time (a couple of hours) to converge. Recent changes have hopefully reduced the chance of this happening, but if initialization is taking more than a few minutes, I suggest stopping the calculation and using the scaled PCA (`init = "spca"`) instead. ## Supporting Libraries All credit to the following packages which do a lot of the hard work: * Coordinate initialization uses [RSpectra](https://cran.r-project.org/package=RSpectra) to do the eigendecomposition of the normalized Laplacian. * The optional PCA initialization and initial dimensionality reduction uses [irlba](https://cran.r-project.org/package=irlba). * The smooth k-nearest neighbor distance and stochastic gradient descent optimization routines are written in C++ (using [Rcpp](https://cran.r-project.org/package=Rcpp)), aping the Python code as closely as possible. * Some of the multi-threading code is based on [RcppParallel](https://github.com/RcppCore/RcppParallel). uwot/build/0000755000176200001440000000000014757004303012352 5ustar liggesusersuwot/build/vignette.rds0000644000176200001440000000027714757004303014717 0ustar liggesusersb```b`aab`b2 1# '(-/ MAgqfIA & ȂLaBRŚZ% 5/$~NN,/AQs[fN*ސ89 d Bw(,/׃ @?{49'ݣ\)%ziE@ w`E[uwot/build/partial.rdb0000644000176200001440000000007514757004200014475 0ustar liggesusersb```b`aab`b1g``d`aҬy@D?M7uwot/man/0000755000176200001440000000000014733074465012040 5ustar liggesusersuwot/man/umap2.Rd0000644000176200001440000011751014735021201013336 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/umap2.R \name{umap2} \alias{umap2} \title{Dimensionality Reduction with UMAP} \usage{ umap2( X, n_neighbors = 15, n_components = 2, metric = "euclidean", n_epochs = NULL, learning_rate = 1, scale = FALSE, init = "spectral", init_sdev = "range", spread = 1, min_dist = 0.1, set_op_mix_ratio = 1, local_connectivity = 1, bandwidth = 1, repulsion_strength = 1, negative_sample_rate = 5, a = NULL, b = NULL, nn_method = NULL, n_trees = 50, search_k = 2 * n_neighbors * n_trees, approx_pow = FALSE, y = NULL, target_n_neighbors = n_neighbors, target_metric = "euclidean", target_weight = 0.5, pca = NULL, pca_center = TRUE, pcg_rand = TRUE, fast_sgd = FALSE, ret_model = FALSE, ret_nn = FALSE, ret_extra = c(), n_threads = NULL, n_sgd_threads = 0, grain_size = 1, tmpdir = tempdir(), verbose = getOption("verbose", TRUE), batch = TRUE, opt_args = NULL, epoch_callback = NULL, pca_method = NULL, binary_edge_weights = FALSE, dens_scale = NULL, seed = NULL, nn_args = list(), rng_type = NULL ) } \arguments{ \item{X}{Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}}, \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}. Matrix and data frames should contain one observation per row. Data frames will have any non-numeric columns removed, although factor columns will be used if explicitly included via \code{metric} (see the help for \code{metric} for details).
Sparse matrices must be in the \code{dgCMatrix} format, and you must also install \href{https://cran.r-project.org/package=rnndescent}{rnndescent} and set \code{nn_method = "nndescent"}. \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is passed to \code{nn_method}, and \code{init} is not \code{"spca"} or \code{"pca"}.} \item{n_neighbors}{The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation. Larger values result in more global views of the manifold, while smaller values result in more local data being preserved. In general values should be in the range \code{2} to \code{100}.} \item{n_components}{The dimension of the space to embed into. This defaults to \code{2} to provide easy visualization, but can reasonably be set to any integer value in the range \code{2} to \code{100}.} \item{metric}{Type of distance metric to use to find nearest neighbors. For \code{nn_method = "annoy"} this can be one of: \itemize{ \item \code{"euclidean"} (the default) \item \code{"cosine"} \item \code{"manhattan"} \item \code{"hamming"} \item \code{"correlation"} (a distance based on the Pearson correlation) \item \code{"categorical"} (see below) } For \code{nn_method = "hnsw"} this can be one of: \itemize{ \item \code{"euclidean"} \item \code{"cosine"} \item \code{"correlation"} } If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is installed and \code{nn_method = "nndescent"} is specified then many more metrics are available, including: \itemize{ \item \code{"braycurtis"} \item \code{"canberra"} \item \code{"chebyshev"} \item \code{"dice"} \item \code{"hamming"} \item \code{"hellinger"} \item \code{"jaccard"} \item \code{"jensenshannon"} \item \code{"kulsinski"} \item \code{"rogerstanimoto"} \item \code{"russellrao"} \item \code{"sokalmichener"} \item \code{"sokalsneath"} \item \code{"spearmanr"} \item \code{"symmetrickl"} \item \code{"tsss"} \item \code{"yule"} } For more details see the package documentation of \code{rnndescent}. For \code{nn_method = "fnn"}, the distance metric is always "euclidean". If \code{X} is a data frame or matrix, then multiple metrics can be specified, by passing a list to this argument, where the name of each item in the list is one of the metric names above. The value of each list item should be a vector giving the names or integer ids of the columns to be included in a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}. Each metric calculation results in a separate fuzzy simplicial set, which are intersected together to produce the final set. Metric names can be repeated. Because non-numeric columns are removed from the data frame, it is safer to use column names than integer ids. Factor columns can also be used by specifying the metric name \code{"categorical"}. Factor columns are treated differently from numeric columns and although multiple factor columns can be specified in a vector, each factor column specified is processed individually. If you specify a non-factor column, it will be coerced to a factor. For a given data block, you may override the \code{pca} and \code{pca_center} arguments for that block, by providing a list with one unnamed item containing the column names or ids, and then any of the \code{pca} or \code{pca_center} overrides as named items, e.g. \code{metric = list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}.
This exists to allow mixed binary and real-valued data to be included and to have PCA applied to both, but with centering applied only to the real-valued data (it is typical not to apply centering to binary data before PCA is applied).} \item{n_epochs}{Number of epochs to use during the optimization of the embedded coordinates. By default, this value is set to \code{500} for datasets containing 10,000 vertices or less, and \code{200} otherwise. If \code{n_epochs = 0}, then coordinates determined by \code{"init"} will be returned.} \item{learning_rate}{Initial learning rate used in optimization of the coordinates.} \item{scale}{Scaling to apply to \code{X} if it is a data frame or matrix: \itemize{ \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling. \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to zero mean and variance 1. \item{\code{"maxabs"}} Center each column to mean 0, then divide each element by the maximum absolute value over the entire matrix. \item{\code{"range"}} Range scale the entire matrix, so the smallest element is 0 and the largest is 1. \item{\code{"colrange"}} Scale each column in the range (0,1). } For UMAP, the default is \code{"none"}.} \item{init}{Type of initialization for the coordinates. Options are: \itemize{ \item \code{"spectral"} Spectral embedding using the normalized Laplacian of the fuzzy 1-skeleton, with Gaussian noise added. \item \code{"normlaplacian"}. Spectral embedding using the normalized Laplacian of the fuzzy 1-skeleton, without noise. \item \code{"random"}. Coordinates assigned using a uniform random distribution between -10 and 10. \item \code{"lvrandom"}. Coordinates assigned using a Gaussian distribution with standard deviation 1e-4, as used in LargeVis (Tang et al., 2016) and t-SNE. \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap (Belkin and Niyogi, 2002). \item \code{"pca"}. The first two principal components from PCA of \code{X} if \code{X} is a data frame, and from a 2-dimensional classical MDS if \code{X} is of class \code{"dist"}. \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled so the standard deviation is 1e-4, to give a distribution similar to that used in t-SNE. This is an alias for \code{init = "pca", init_sdev = 1e-4}. \item \code{"agspectral"} An "approximate global" modification of \code{"spectral"} which sets all edges in the graph to a value of 1, and then sets a random number of edges (\code{negative_sample_rate} edges per vertex) to 0.1, to approximate the effect of non-local affinities. \item A matrix of initial coordinates. } For spectral initializations (\code{"spectral"}, \code{"normlaplacian"}, \code{"laplacian"}, \code{"agspectral"}), if more than one connected component is identified, no spectral initialization is attempted. Instead a PCA-based initialization is attempted. If \code{verbose = TRUE} the number of connected components is logged to the console. The existence of multiple connected components implies that a global view of the data cannot be attained with this initialization. Increasing the value of \code{n_neighbors} may help.} \item{init_sdev}{If non-\code{NULL}, scales each dimension of the initialized coordinates (including any user-supplied matrix) to this standard deviation. By default (\code{init_sdev = "range"}), each column of the initial coordinates is range scaled between 0-10. Scaling the input may help if the unscaled versions result in initial coordinates with large inter-point distances or outliers.
This usually results in small gradients during optimization and very little progress being made to the layout. Shrinking the initial embedding by rescaling can help under these circumstances. Scaling the result of \code{init = "pca"} is usually recommended, and \code{init = "spca"} is provided as an alias for \code{init = "pca", init_sdev = 1e-4}, but for the spectral initializations the scaled versions usually aren't necessary unless you are using a large value of \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher).} \item{spread}{The effective scale of embedded points. In combination with \code{min_dist}, this determines how clustered/clumped the embedded points are.} \item{min_dist}{The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together, while larger values will result in a more even dispersal of points. The value should be set relative to the \code{spread} value, which determines the scale at which embedded points will be spread out.} \item{set_op_mix_ratio}{Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial set. Both fuzzy set operations use the product t-norm. The value of this parameter should be between \code{0.0} and \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while \code{0.0} will use a pure fuzzy intersection.} \item{local_connectivity}{The local connectivity required -- i.e. the number of nearest neighbors that should be assumed to be connected at a local level. The higher this value the more connected the manifold becomes locally. In practice this should be not more than the local intrinsic dimension of the manifold.} \item{bandwidth}{The effective bandwidth of the kernel if we view the algorithm as similar to Laplacian Eigenmaps. Larger values induce more connectivity and a more global view of the data, smaller values concentrate more locally.} \item{repulsion_strength}{Weighting applied to negative samples in low dimensional embedding optimization. Values higher than one will result in greater weight being given to negative samples.} \item{negative_sample_rate}{The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low dimensional embedding.} \item{a}{More specific parameters controlling the embedding. If \code{NULL} these values are set automatically as determined by \code{min_dist} and \code{spread}.} \item{b}{More specific parameters controlling the embedding. If \code{NULL} these values are set automatically as determined by \code{min_dist} and \code{spread}.} \item{nn_method}{Method for finding nearest neighbors. Options are: \itemize{ \item \code{"fnn"}. Use exact nearest neighbors via the \href{https://cran.r-project.org/package=FNN}{FNN} package. \item \code{"annoy"} Use approximate nearest neighbors via the \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package. \item \code{"hnsw"} Use approximate nearest neighbors with the Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin, 2018) via the \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package. \code{RcppHNSW} is not a dependency of this package: this option is only available if you have installed \code{RcppHNSW} yourself. Also, HNSW only supports the following arguments for \code{metric} and \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and \code{"correlation"}.
\item \code{"nndescent"} Use approximate nearest neighbors with the Nearest Neighbor Descent method (Dong et al., 2011) via the \href{https://cran.r-project.org/package=rnndescent}{rnndescent} package. \code{rnndescent} is not a dependency of this package: this option is only available if you have installed \code{rnndescent} yourself. } By default, if \code{X} has less than 4,096 vertices, the exact nearest neighbors are found. Otherwise, approximate nearest neighbors are used. You may also pass pre-calculated nearest neighbor data to this argument. It must be one of two formats, either a list consisting of two elements: \itemize{ \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix containing the integer indexes of the nearest neighbors in \code{X}. Each vertex is considered to be its own nearest neighbor, i.e. \code{idx[, 1] == 1:n_vertices}. \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix containing the distances of the nearest neighbors. } or a sparse distance matrix of type \code{dgCMatrix}, with dimensions \code{n_vertices x n_vertices}. Distances should be arranged by column, i.e. a non-zero entry in row \code{j} of the \code{i}th column indicates that the \code{j}th observation in \code{X} is a nearest neighbor of the \code{i}th observation with the distance given by the value of that element. The \code{n_neighbors} parameter is ignored when using precomputed nearest neighbor data. If using the sparse distance matrix input, each column can contain a different number of neighbors.} \item{n_trees}{Number of trees to build when constructing the nearest neighbor index. The more trees specified, the larger the index, but the better the results. With \code{search_k}, determines the accuracy of the Annoy nearest neighbor search. Only used if the \code{nn_method} is \code{"annoy"}. Sensible values are between \code{10} to \code{100}.} \item{search_k}{Number of nodes to search during the neighbor retrieval. The larger k, the more the accurate results, but the longer the search takes. With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor search. Only used if the \code{nn_method} is \code{"annoy"}.} \item{approx_pow}{If \code{TRUE}, use an approximation to the power function in the UMAP gradient, from \url{https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/}. Ignored if \code{dens_scale} is non-\code{NULL}.} \item{y}{Optional target data for supervised dimension reduction. Can be a vector, matrix or data frame. Use the \code{target_metric} parameter to specify the metrics to use, using the same syntax as \code{metric}. Usually either a single numeric or factor column is used, but more complex formats are possible. The following types are allowed: \itemize{ \item Factor columns with the same length as \code{X}. \code{NA} is allowed for any observation with an unknown level, in which case UMAP operates as a form of semi-supervised learning. Each column is treated separately. \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the parameter \code{target_n_neighbors} to set the number of neighbors used with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors, numeric columns are grouped into one block unless \code{target_metric} specifies otherwise. For example, if you wish columns \code{a} and \code{b} to be treated separately, specify \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise, the data will be effectively treated as a matrix with two columns. 
\item Nearest neighbor data, consisting of a list of two matrices, \code{idx} and \code{dist}. These represent the precalculated nearest neighbor indices and distances, respectively. This is the same format as that expected for precalculated data in \code{nn_method}. This format assumes that the underlying data was a numeric vector. Any user-supplied value of the \code{target_n_neighbors} parameter is ignored in this case, because the number of columns in the matrices is used for the value. Multiple nearest neighbor data using different metrics can be supplied by passing a list of these lists. } Unlike \code{X}, all factor columns included in \code{y} are automatically used.} \item{target_n_neighbors}{Number of nearest neighbors to use to construct the target simplicial set. Default value is \code{n_neighbors}. Applies only if \code{y} is non-\code{NULL} and \code{numeric}.} \item{target_metric}{The metric used to measure distance for \code{y} if using supervised dimension reduction. Used only if \code{y} is numeric.} \item{target_weight}{Weighting factor between data topology and target topology. A value of 0.0 weights entirely on data, a value of 1.0 weights entirely on target. The default of 0.5 balances the weighting equally between data and target. Only applies if \code{y} is non-\code{NULL}.} \item{pca}{If set to a positive integer value, reduce data to this number of columns using PCA. Not applied if the distance \code{metric} is \code{"hamming"}, or if the dimensions of the data are not larger than the number specified (i.e. the number of rows and columns must be larger than the value of this parameter). If you have > 100 columns in a data frame or matrix, reducing the number of columns in this way may substantially increase the performance of the nearest neighbor search at the cost of a potential decrease in accuracy. In many t-SNE applications, a value of 50 is recommended, although there's no guarantee that this is appropriate for all settings.} \item{pca_center}{If \code{TRUE}, center the columns of \code{X} before carrying out PCA. For binary data, it's recommended to set this to \code{FALSE}.} \item{pcg_rand}{If \code{TRUE}, use the PCG random number generator (O'Neill, 2014) during optimization. Otherwise, use the faster (but probably less statistically good) Tausworthe "taus88" generator. The default is \code{TRUE}. This parameter has been superseded by \code{rng_type} -- if both are set, \code{rng_type} takes precedence.} \item{fast_sgd}{If \code{TRUE}, then the following combination of parameters is set: \code{pcg_rand = TRUE}, \code{n_sgd_threads = "auto"} and \code{approx_pow = TRUE}. The default is \code{FALSE}. Setting this to \code{TRUE} will speed up the stochastic optimization phase, but give a potentially less accurate embedding, which will not be exactly reproducible even with a fixed seed. For visualization, \code{fast_sgd = TRUE} will give perfectly good results. For more generic dimensionality reduction, it's safer to leave \code{fast_sgd = FALSE}. If \code{fast_sgd = TRUE}, then user-supplied values of \code{pcg_rand}, \code{n_sgd_threads}, and \code{approx_pow} are ignored.} \item{ret_model}{If \code{TRUE}, then return extra data that can be used to add new data to an existing embedding via \code{\link{umap_transform}}. The embedded coordinates are returned as the list item \code{embedding}. If \code{FALSE}, just return the coordinates. This parameter can be used in conjunction with \code{ret_nn} and \code{ret_extra}.
Note that some settings are incompatible with the production of a UMAP model: external neighbor data (passed via a list to \code{nn_method}), and factor columns that were included via the \code{metric} parameter. In the latter case, the model produced is based only on the numeric data. A transformation using new data is possible, but the factor columns in the new data are ignored. Note that setting \code{ret_model = TRUE} forces the use of the approximate nearest neighbors method. Because small datasets would otherwise use exact nearest neighbor calculations, setting \code{ret_model = TRUE} means that different results may be returned for small datasets in terms of both the returned nearest neighbors (if requested) and the final embedded coordinates, compared to \code{ret_model = FALSE}, even if the random number seed is fixed. To avoid this, explicitly set \code{nn_method = "annoy"} in the \code{ret_model = FALSE} case.} \item{ret_nn}{If \code{TRUE}, then in addition to the embedding, also return nearest neighbor data that can be used as input to \code{nn_method} to avoid the overhead of repeatedly calculating the nearest neighbors when manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs}, \code{init}). See the "Value" section for the names of the list items. If \code{FALSE}, just return the coordinates. Note that the nearest neighbors could be sensitive to data scaling, so be wary of reusing nearest neighbor data if modifying the \code{scale} parameter. This parameter can be used in conjunction with \code{ret_model} and \code{ret_extra}.} \item{ret_extra}{A vector indicating what extra data to return. May contain any combination of the following strings: \itemize{ \item \code{"model"} Same as setting \code{ret_model = TRUE}. \item \code{"nn"} Same as setting \code{ret_nn = TRUE}. \item \code{"fgraph"} the high dimensional fuzzy graph (i.e. the fuzzy simplicial set of the merged local views of the input data). The graph is returned as a sparse symmetric N x N matrix of class \link[Matrix]{dgCMatrix-class}, where a non-zero entry (i, j) gives the membership strength of the edge connecting vertex i and vertex j. This can be considered analogous to the input probability (or similarity or affinity) used in t-SNE and LargeVis. Note that the graph is further sparsified by removing edges with sufficiently low membership strength that they would not be sampled by the probabilistic edge sampling employed for optimization and therefore the number of non-zero elements in the matrix is dependent on \code{n_epochs}. If you are only interested in the fuzzy input graph (e.g. for clustering), setting \code{n_epochs = 0} will avoid any further sparsifying. Be aware that setting `binary_edge_weights = TRUE` will affect this graph (all non-zero edge weights will be 1). \item \code{"sigma"} the normalization value for each observation in the dataset when constructing the smoothed distances to each of its neighbors. This gives some sense of the local density of each observation in the high dimensional space: higher values of \code{sigma} indicate a higher dispersion or lower density. }} \item{n_threads}{Number of threads to use (except during stochastic gradient descent). Default is half the number of concurrent threads supported by the system. For nearest neighbor search, only applies if \code{nn_method = "annoy"}. 
If \code{n_threads > 1}, then the Annoy index will be temporarily written to disk in the location determined by \code{\link[base]{tempfile}}.} \item{n_sgd_threads}{Number of threads to use during stochastic gradient descent. If set to > 1, then be aware that if \code{batch = FALSE}, results will \emph{not} be reproducible, even if \code{set.seed} is called with a fixed seed before running. Set to \code{"auto"} to use the same value as \code{n_threads}. Default is to use only one thread, unless \code{batch = TRUE}, in which case \code{"auto"} is used.} \item{grain_size}{The minimum amount of work to do on each thread. If this value is set high enough, then fewer than \code{n_threads} or \code{n_sgd_threads} threads will be used for processing, which might give a performance improvement if the overhead of thread management and context switching was outweighing the improvement due to concurrent processing. This should be left at the default (\code{1}), in which case work will be spread evenly over all the threads specified.} \item{tmpdir}{Temporary directory to store nearest neighbor indexes during nearest neighbor search. Default is \code{\link{tempdir}}. The index is only written to disk if \code{n_threads > 1} and \code{nn_method = "annoy"}; otherwise, this parameter is ignored.} \item{verbose}{If \code{TRUE}, log details to the console.} \item{batch}{If \code{TRUE}, then embedding coordinates are updated at the end of each epoch rather than during the epoch. In batch mode, results are reproducible with a fixed random seed even with \code{n_sgd_threads > 1}, at the cost of a slightly higher memory use. You may also have to modify \code{learning_rate} and increase \code{n_epochs}, so whether this provides a speed increase over the single-threaded optimization is likely to be dataset and hardware-dependent.} \item{opt_args}{A list of optimizer parameters, used when \code{batch = TRUE}. The default optimization method used is Adam (Kingma and Ba, 2014). \itemize{ \item \code{method} The optimization method to use. Either \code{"adam"} or \code{"sgd"} (stochastic gradient descent). Default: \code{"adam"}. \item \code{beta1} (Adam only). The weighting parameter for the exponential moving average of the first moment estimator. Effectively the momentum parameter. Should be a floating point value between 0 and 1. Higher values can smooth oscillatory updates in poorly-conditioned situations and may allow for a larger \code{learning_rate} to be specified, but too high can cause divergence. Default: \code{0.5}. \item \code{beta2} (Adam only). The weighting parameter for the exponential moving average of the uncentered second moment estimator. Should be a floating point value between 0 and 1. Controls the degree of adaptivity in the step-size. Higher values put more weight on previous time steps. Default: \code{0.9}. \item \code{eps} (Adam only). Intended to be a small value to prevent division by zero, but in practice can also affect convergence due to its interaction with \code{beta2}. Higher values reduce the effect of the step-size adaptivity and bring the behavior closer to stochastic gradient descent with momentum. Typical values are between 1e-8 and 1e-3. Default: \code{1e-7}. \item \code{alpha} The initial learning rate. Default: the value of the \code{learning_rate} parameter. }} \item{epoch_callback}{A function which will be invoked at the end of every epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where: \itemize{ \item \code{epoch} The current epoch number (between \code{1} and \code{n_epochs}).
\item \code{n_epochs} Number of epochs to use during the optimization of the embedded coordinates. \item \code{coords} The embedded coordinates as of the end of the current epoch, as a matrix with dimensions (N, \code{n_components}). }} \item{pca_method}{Method to carry out any PCA dimensionality reduction when the \code{pca} parameter is specified. Allowed values are: \itemize{ \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the \href{https://cran.r-project.org/package=irlba}{irlba} package. \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from the \href{https://cran.r-project.org/package=irlba}{irlba} package. This is likely to give much faster but potentially less accurate results than using \code{"irlba"}. For the purposes of nearest neighbor calculation and coordinates initialization, any loss of accuracy doesn't seem to matter much. \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}} from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr} package. The SVD methods used in \code{bigstatsr} may be faster on systems without access to efficient linear algebra libraries (e.g. Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of uwot: if you choose to use this package for PCA, you \emph{must} install it yourself. \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is likely to be slow for all but the smallest datasets. \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than 50\% of the full set of singular vectors would be calculated, in which case \code{"svd"} is used. }} \item{binary_edge_weights}{If \code{TRUE} then edge weights in the input graph are treated as binary (0/1) rather than real valued. This affects the sampling frequency of neighbors and is the strategy used by the PaCMAP method (Wang and co-workers, 2020). Practical (Böhm and co-workers, 2020) and theoretical (Damrich and Hamprecht, 2021) work suggests this has little effect on UMAP's performance.} \item{dens_scale}{A value between 0 and 1. If > 0 then the output attempts to preserve relative local density around each observation. This uses an approximation to the densMAP method (Narayan and co-workers, 2021). The larger the value of \code{dens_scale}, the greater the range of output densities that will be used to map the input densities. This option is ignored if using multiple \code{metric} blocks.} \item{seed}{Integer seed to use to initialize the random number generator state. Combined with \code{n_sgd_threads = 1} or \code{batch = TRUE}, this should give consistent output across multiple runs on a given installation. Setting this value is equivalent to calling \code{\link[base]{set.seed}}, but it may be more convenient in some situations than having to call a separate function. The default is to not set a seed. If \code{ret_model = TRUE}, the seed will be stored in the output model and then used to set the seed inside \code{\link{umap_transform}}.} \item{nn_args}{A list containing additional arguments to pass to the nearest neighbor method. For \code{nn_method = "annoy"}, you can specify \code{"n_trees"} and \code{"search_k"}, and these will override the \code{n_trees} and \code{search_k} parameters. For \code{nn_method = "hnsw"}, you may specify the following arguments: \itemize{ \item \code{M} The maximum number of neighbors to keep for each vertex. Reasonable values are \code{2} to \code{100}. Higher values give better recall at the cost of more memory. Default value is \code{16}.
\item \code{ef_construction} A positive integer specifying the size of the dynamic list used during index construction. A higher value will provide better results at the cost of a longer time to build the index. Default is \code{200}. \item \code{ef} A positive integer specifying the size of the dynamic list used during search. This cannot be smaller than \code{n_neighbors} and cannot be higher than the number of items in the index. Default is \code{10}. } For \code{nn_method = "nndescent"}, you may specify the following arguments: \itemize{ \item \code{n_trees} The number of trees to use in a random projection forest to initialize the search. A larger number will give more accurate results at the cost of a longer computation time. The default of \code{NULL} means that the number is chosen based on the number of observations in \code{X}. \item \code{max_candidates} The number of potential neighbors to explore per iteration. By default, this is set to \code{n_neighbors} or \code{60}, whichever is smaller. A larger number will give more accurate results at the cost of a longer computation time. \item \code{n_iters} The number of iterations to run the search. A larger number will give more accurate results at the cost of a longer computation time. By default, this will be chosen based on the number of observations in \code{X}. You may also need to modify the convergence criterion \code{delta}. \item \code{delta} The minimum relative change in the neighbor graph allowed before early stopping. Should be a value between 0 and 1. The smaller the value, the smaller the amount of progress between iterations is allowed. Default value of \code{0.001} means that at least 0.1\% of the neighbor graph must be updated at each iteration. \item \code{init} How to initialize the nearest neighbor descent. By default this is set to \code{"tree"} and uses a random projection forest. If you set this to \code{"rand"}, then a random selection is used. Usually this is less accurate than using RP trees, but for high-dimensional cases, there may be little difference in the quality of the initialization and random initialization will be a lot faster. If you set this to \code{"rand"}, then the \code{n_trees} parameter is ignored. \item \code{pruning_degree_multiplier} The maximum number of edges per node to retain in the search graph, relative to \code{n_neighbors}. A larger value will give more accurate results at the cost of a longer computation time. Default is \code{1.5}. This parameter only affects neighbor search when transforming new data with \code{\link{umap_transform}}. \item \code{epsilon} Controls the degree of back-tracking when traversing the search graph. Setting this to \code{0.0} will do a greedy search with no back-tracking. A larger value will give more accurate results at the cost of a longer computation time. Default is \code{0.1}. This parameter only affects neighbor search when transforming new data with \code{\link{umap_transform}}. \item \code{max_search_fraction} Specifies the maximum fraction of the search graph to traverse. By default, this is set to \code{1.0}, so the entire graph (i.e. all items in \code{X}) may be visited. You may want to set this to a smaller value if you have a very large dataset (in conjunction with \code{epsilon}) to avoid an inefficient exhaustive search of the data in \code{X}. This parameter only affects neighbor search when transforming new data with \code{\link{umap_transform}}. }} \item{rng_type}{The type of random number generator to use during optimization.
One of: \itemize{ \item{\code{"pcg"}}. Use the PCG random number generator (O'Neill, 2014). \item{\code{"tausworthe"}}. Use the Tausworthe "taus88" generator. \item{\code{"deterministic"}}. Use a deterministic number generator. This isn't actually random, but may provide enough variation in the negative sampling to give a good embedding and can provide a noticeable speed-up. } For backwards compatibility, by default this is unset and the choice of \code{pcg_rand} is used (making "pcg" the effective default).} } \value{ A matrix of optimized coordinates, or: \itemize{ \item if \code{ret_model = TRUE} (or \code{ret_extra} contains \code{"model"}), returns a list containing extra information that can be used to add new data to an existing embedding via \code{\link{umap_transform}}. In this case, the coordinates are available in the list item \code{embedding}. \bold{NOTE}: The contents of the \code{model} list should \emph{not} be considered stable or part of the public API, and are purposely left undocumented. \item if \code{ret_nn = TRUE} (or \code{ret_extra} contains \code{"nn"}), returns the nearest neighbor data as a list called \code{nn}. This contains one list for each \code{metric} calculated, itself containing a matrix \code{idx} with the integer ids of the neighbors; and a matrix \code{dist} with the distances. The \code{nn} list (or a sub-list) can be used as input to the \code{nn_method} parameter. \item if \code{ret_extra} contains \code{"fgraph"}, returns the high dimensional fuzzy graph as a sparse matrix called \code{fgraph}, of type \link[Matrix]{dgCMatrix-class}. \item if \code{ret_extra} contains \code{"sigma"}, returns a vector of the smooth knn distance normalization terms for each observation as \code{"sigma"} and a vector \code{"rho"} containing the largest distance to the locally connected neighbors of each observation. \item if \code{ret_extra} contains \code{"localr"}, returns a vector of the estimated local radii, the sum of \code{"sigma"} and \code{"rho"}. } The returned list contains the combined data from any combination of specifying \code{ret_model}, \code{ret_nn} and \code{ret_extra}. } \description{ Carry out dimensionality reduction of a dataset using the Uniform Manifold Approximation and Projection (UMAP) method (McInnes et al., 2018). } \details{ This function behaves like \code{\link{umap}} except with some updated defaults that make it behave more like the Python implementation and which cannot be added to \code{\link{umap}} without breaking backwards compatibility. In addition: \itemize{ \item if \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} is installed, it will be used in preference to Annoy if a compatible metric is requested. \item if RcppHNSW is not present, but \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is installed, it will be used in preference to Annoy if a compatible metric is requested. \item if \code{batch = TRUE} then the default \code{n_sgd_threads} is set to the same value as \code{n_threads}. \item if the input data \code{X} is a sparse matrix, it is interpreted similarly to a dense matrix or data frame, and not as a distance matrix. This requires the \code{rnndescent} package to be installed. } } \examples{ iris30 <- iris[c(1:10, 51:60, 101:110), ] iris_umap <- umap2(iris30, n_neighbors = 5) } \references{ Belkin, M., & Niyogi, P. (2002). Laplacian eigenmaps and spectral techniques for embedding and clustering. In \emph{Advances in neural information processing systems} (pp. 585-591).
\url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf} Böhm, J. N., Berens, P., & Kobak, D. (2020). A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum. \emph{arXiv preprint} \emph{arXiv:2007.08902}. \url{https://arxiv.org/abs/2007.08902} Damrich, S., & Hamprecht, F. A. (2021). On UMAP's true loss function. \emph{Advances in Neural Information Processing Systems}, \emph{34}. \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html} Dong, W., Moses, C., & Li, K. (2011, March). Efficient k-nearest neighbor graph construction for generic similarity measures. In \emph{Proceedings of the 20th international conference on World Wide Web} (pp. 577-586). ACM. \doi{10.1145/1963405.1963487}. Kingma, D. P., & Ba, J. (2014). Adam: A method for stochastic optimization. \emph{arXiv preprint} \emph{arXiv}:1412.6980. \url{https://arxiv.org/abs/1412.6980} Malkov, Y. A., & Yashunin, D. A. (2018). Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs. \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836. McInnes, L., Healy, J., & Melville, J. (2018). UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction \emph{arXiv preprint} \emph{arXiv}:1802.03426. \url{https://arxiv.org/abs/1802.03426} Narayan, A., Berger, B., & Cho, H. (2021). Assessing single-cell transcriptomic variability through density-preserving data visualization. \emph{Nature biotechnology}, \emph{39}(6), 765-774. \doi{10.1038/s41587-020-00801-7} O'Neill, M. E. (2014). \emph{PCG: A family of simple fast space-efficient statistically good algorithms for random number generation} (Report No. HMC-CS-2014-0905). Harvey Mudd College. Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). Visualizing large-scale and high-dimensional data. In \emph{Proceedings of the 25th International Conference on World Wide Web} (pp. 287-297). International World Wide Web Conferences Steering Committee. \url{https://arxiv.org/abs/1602.00370} Van der Maaten, L., & Hinton, G. (2008). Visualizing data using t-SNE. \emph{Journal of Machine Learning Research}, \emph{9} (2579-2605). \url{https://www.jmlr.org/papers/v9/vandermaaten08a.html} Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021). Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization. \emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73. \url{https://www.jmlr.org/papers/v22/20-1061.html} } uwot/man/simplicial_set_union.Rd0000644000176200001440000000251614730166740016537 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/uwot.R \name{simplicial_set_union} \alias{simplicial_set_union} \title{Merge Similarity Graph by Simplicial Set Union} \usage{ simplicial_set_union(x, y, n_threads = NULL, verbose = FALSE) } \arguments{ \item{x}{A sparse matrix representing the first similarity graph in the union operation.} \item{y}{A sparse matrix representing the second similarity graph in the union operation.} \item{n_threads}{Number of threads to use when resetting the local metric. Default is half the number of concurrent threads supported by the system.} \item{verbose}{If \code{TRUE}, log progress to the console.} } \value{ A sparse matrix containing the union of \code{x} and \code{y}. 
} \description{ Combine two similarity graphs by treating them as fuzzy topological sets and forming the union. } \examples{ # Form two different "views" of the same data iris30 <- iris[c(1:10, 51:60, 101:110), ] iris_sg12 <- similarity_graph(iris30[, 1:2], n_neighbors = 5) iris_sg34 <- similarity_graph(iris30[, 3:4], n_neighbors = 5) # Combine the two representations into one iris_combined <- simplicial_set_union(iris_sg12, iris_sg34) # Optimize the layout based on the combined view iris_combined_umap <- optimize_graph_layout(iris_combined, n_epochs = 100) } uwot/man/tumap.Rd0000644000176200001440000012136714735021251013452 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/uwot.R \name{tumap} \alias{tumap} \title{Dimensionality Reduction Using t-Distributed UMAP (t-UMAP)} \usage{ tumap( X, n_neighbors = 15, n_components = 2, metric = "euclidean", n_epochs = NULL, learning_rate = 1, scale = FALSE, init = "spectral", init_sdev = NULL, set_op_mix_ratio = 1, local_connectivity = 1, bandwidth = 1, repulsion_strength = 1, negative_sample_rate = 5, nn_method = NULL, n_trees = 50, search_k = 2 * n_neighbors * n_trees, n_threads = NULL, n_sgd_threads = 0, grain_size = 1, y = NULL, target_n_neighbors = n_neighbors, target_metric = "euclidean", target_weight = 0.5, pca = NULL, pca_center = TRUE, pcg_rand = TRUE, fast_sgd = FALSE, ret_model = FALSE, ret_nn = FALSE, ret_extra = c(), tmpdir = tempdir(), verbose = getOption("verbose", TRUE), batch = FALSE, opt_args = NULL, epoch_callback = NULL, pca_method = NULL, binary_edge_weights = FALSE, seed = NULL, nn_args = list(), rng_type = NULL ) } \arguments{ \item{X}{Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}}, \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}. Matrix and data frames should contain one observation per row. Data frames will have any non-numeric columns removed, although factor columns will be used if explicitly included via \code{metric} (see the help for \code{metric} for details). A sparse matrix is interpreted as a distance matrix, and is assumed to be symmetric, so you can also pass in an explicitly upper or lower triangular sparse matrix to save storage. There must be at least \code{n_neighbors} non-zero distances for each row. Both implicit and explicit zero entries are ignored. Set zero distances you want to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}). \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is passed to \code{nn_method}, and \code{init} is not \code{"spca"} or \code{"pca"}.} \item{n_neighbors}{The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation. Larger values result in more global views of the manifold, while smaller values result in more local data being preserved. In general values should be in the range \code{2} to \code{100}.} \item{n_components}{The dimension of the space to embed into. This defaults to \code{2} to provide easy visualization, but can reasonably be set to any integer value in the range \code{2} to \code{100}.} \item{metric}{Type of distance metric to use to find nearest neighbors. 
For \code{nn_method = "annoy"} this can be one of: \itemize{ \item \code{"euclidean"} (the default) \item \code{"cosine"} \item \code{"manhattan"} \item \code{"hamming"} \item \code{"correlation"} (a distance based on the Pearson correlation) \item \code{"categorical"} (see below) } For \code{nn_method = "hnsw"} this can be one of: \itemize{ \item \code{"euclidean"} \item \code{"cosine"} \item \code{"correlation"} } If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is installed and \code{nn_method = "nndescent"} is specified then many more metrics are available, including: \itemize{ \item \code{"braycurtis"} \item \code{"canberra"} \item \code{"chebyshev"} \item \code{"dice"} \item \code{"hamming"} \item \code{"hellinger"} \item \code{"jaccard"} \item \code{"jensenshannon"} \item \code{"kulsinski"} \item \code{"rogerstanimoto"} \item \code{"russellrao"} \item \code{"sokalmichener"} \item \code{"sokalsneath"} \item \code{"spearmanr"} \item \code{"symmetrickl"} \item \code{"tsss"} \item \code{"yule"} } For more details see the package documentation of \code{rnndescent}. For \code{nn_method = "fnn"}, the distance metric is always "euclidean". If \code{X} is a data frame or matrix, then multiple metrics can be specified, by passing a list to this argument, where the name of each item in the list is one of the metric names above. The value of each list item should be a vector giving the names or integer ids of the columns to be included in a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}. Each metric calculation results in a separate fuzzy simplicial set, which are intersected together to produce the final set. Metric names can be repeated. Because non-numeric columns are removed from the data frame, it is safer to use column names than integer ids. Factor columns can also be used by specifying the metric name \code{"categorical"}. Factor columns are treated differently from numeric columns and although multiple factor columns can be specified in a vector, each factor column specified is processed individually. If you specify a non-factor column, it will be coerced to a factor. For a given data block, you may override the \code{pca} and \code{pca_center} arguments for that block, by providing a list with one unnamed item containing the column names or ids, and then any of the \code{pca} or \code{pca_center} overrides as named items, e.g. \code{metric = list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This exists to allow mixed binary and real-valued data to be included and to have PCA applied to both, but with centering applied only to the real-valued data (it is typical not to apply centering to binary data before PCA is applied).} \item{n_epochs}{Number of epochs to use during the optimization of the embedded coordinates. By default, this value is set to \code{500} for datasets containing 10,000 vertices or fewer, and \code{200} otherwise. If \code{n_epochs = 0}, then coordinates determined by \code{"init"} will be returned.} \item{learning_rate}{Initial learning rate used in optimization of the coordinates.} \item{scale}{Scaling to apply to \code{X} if it is a data frame or matrix: \itemize{ \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling. \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to zero mean and variance 1. \item{\code{"maxabs"}} Center each column to mean 0, then divide each element by the maximum absolute value over the entire matrix.
\item{\code{"range"}} Range scale the entire matrix, so the smallest element is 0 and the largest is 1. \item{\code{"colrange"}} Scale each column in the range (0,1). } For t-UMAP, the default is \code{"none"}.} \item{init}{Type of initialization for the coordinates. Options are: \itemize{ \item \code{"spectral"} Spectral embedding using the normalized Laplacian of the fuzzy 1-skeleton, with Gaussian noise added. \item \code{"normlaplacian"}. Spectral embedding using the normalized Laplacian of the fuzzy 1-skeleton, without noise. \item \code{"random"}. Coordinates assigned using a uniform random distribution between -10 and 10. \item \code{"lvrandom"}. Coordinates assigned using a Gaussian distribution with standard deviation 1e-4, as used in LargeVis (Tang et al., 2016) and t-SNE. \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap (Belkin and Niyogi, 2002). \item \code{"pca"}. The first two principal components from PCA of \code{X} if \code{X} is a data frame, and from a 2-dimensional classical MDS if \code{X} is of class \code{"dist"}. \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled so the standard deviation is 1e-4, to give a distribution similar to that used in t-SNE. This is an alias for \code{init = "pca", init_sdev = 1e-4}. \item \code{"agspectral"} An "approximate global" modification of \code{"spectral"} which sets all edges in the graph to a value of 1, and then sets a random number of edges (\code{negative_sample_rate} edges per vertex) to 0.1, to approximate the effect of non-local affinities. \item A matrix of initial coordinates. } For spectral initializations (\code{"spectral"}, \code{"normlaplacian"}, \code{"laplacian"}, \code{"agspectral"}), if more than one connected component is identified, no spectral initialization is attempted. Instead, a PCA-based initialization is attempted. If \code{verbose = TRUE} the number of connected components is logged to the console. The existence of multiple connected components implies that a global view of the data cannot be attained with this initialization. Increasing the value of \code{n_neighbors} may help.} \item{init_sdev}{If non-\code{NULL}, scales each dimension of the initialized coordinates (including any user-supplied matrix) to this standard deviation. By default no scaling is carried out, except when \code{init = "spca"}, in which case the value is \code{0.0001}. Scaling the input may help if the unscaled versions result in initial coordinates with large inter-point distances or outliers. This usually results in small gradients during optimization and very little progress being made to the layout. Shrinking the initial embedding by rescaling can help under these circumstances. Scaling the result of \code{init = "pca"} is usually recommended, and \code{init = "spca"} exists as an alias for \code{init = "pca", init_sdev = 1e-4}, but for the spectral initializations the scaled versions usually aren't necessary unless you are using a large value of \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). For compatibility with recent versions of the Python UMAP package, if you are using \code{init = "spectral"}, then you should also set \code{init_sdev = "range"}, which will range scale each of the columns containing the initial data between 0-10.
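For example, a minimal sketch of this combination (the choice of dataset is an arbitrary illustration): \preformatted{emb <- tumap(iris, init = "spectral", init_sdev = "range")
}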
This is not set by default to maintain backwards compatibility with previous versions of uwot.} \item{set_op_mix_ratio}{Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial set. Both fuzzy set operations use the product t-norm. The value of this parameter should be between \code{0.0} and \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while \code{0.0} will use a pure fuzzy intersection.} \item{local_connectivity}{The local connectivity required -- i.e. the number of nearest neighbors that should be assumed to be connected at a local level. The higher this value the more connected the manifold becomes locally. In practice this should be not more than the local intrinsic dimension of the manifold.} \item{bandwidth}{The effective bandwidth of the kernel if we view the algorithm as similar to Laplacian Eigenmaps. Larger values induce more connectivity and a more global view of the data, smaller values concentrate more locally.} \item{repulsion_strength}{Weighting applied to negative samples in low dimensional embedding optimization. Values higher than one will result in greater weight being given to negative samples.} \item{negative_sample_rate}{The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low dimensional embedding.} \item{nn_method}{Method for finding nearest neighbors. Options are: \itemize{ \item \code{"fnn"}. Use exact nearest neighbors via the \href{https://cran.r-project.org/package=FNN}{FNN} package. \item \code{"annoy"} Use approximate nearest neighbors via the \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package. \item \code{"hnsw"} Use approximate nearest neighbors with the Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin, 2018) via the \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package. \code{RcppHNSW} is not a dependency of this package: this option is only available if you have installed \code{RcppHNSW} yourself. Also, HNSW only supports the following arguments for \code{metric} and \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and \code{"correlation"}. \item \code{"nndescent"} Use approximate nearest neighbors with the Nearest Neighbor Descent method (Dong et al., 2011) via the \href{https://cran.r-project.org/package=rnndescent}{rnndescent} package. \code{rnndescent} is not a dependency of this package: this option is only available if you have installed \code{rnndescent} yourself. } By default, if \code{X} has fewer than 4,096 vertices, the exact nearest neighbors are found. Otherwise, approximate nearest neighbors are used. You may also pass pre-calculated nearest neighbor data to this argument. It must be one of two formats, either a list consisting of two elements: \itemize{ \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix containing the integer indexes of the nearest neighbors in \code{X}. Each vertex is considered to be its own nearest neighbor, i.e. \code{idx[, 1] == 1:n_vertices}. \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix containing the distances of the nearest neighbors. } or a sparse distance matrix of type \code{dgCMatrix}, with dimensions \code{n_vertices x n_vertices}. Distances should be arranged by column, i.e.
a non-zero entry in row \code{j} of the \code{i}th column indicates that the \code{j}th observation in \code{X} is a nearest neighbor of the \code{i}th observation with the distance given by the value of that element. The \code{n_neighbors} parameter is ignored when using precomputed nearest neighbor data. If using the sparse distance matrix input, each column can contain a different number of neighbors.} \item{n_trees}{Number of trees to build when constructing the nearest neighbor index. The more trees specified, the larger the index, but the better the results. With \code{search_k}, determines the accuracy of the Annoy nearest neighbor search. Only used if the \code{nn_method} is \code{"annoy"}. Sensible values are between \code{10} and \code{100}.} \item{search_k}{Number of nodes to search during the neighbor retrieval. The larger \code{search_k}, the more accurate the results, but the longer the search takes. With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor search. Only used if the \code{nn_method} is \code{"annoy"}.} \item{n_threads}{Number of threads to use (except during stochastic gradient descent). Default is half the number of concurrent threads supported by the system. For nearest neighbor search, only applies if \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index will be temporarily written to disk in the location determined by \code{\link[base]{tempfile}}.} \item{n_sgd_threads}{Number of threads to use during stochastic gradient descent. If set to > 1, then be aware that if \code{batch = FALSE}, results will \emph{not} be reproducible, even if \code{set.seed} is called with a fixed seed before running. Set to \code{"auto"} to use the same value as \code{n_threads}.} \item{grain_size}{The minimum amount of work to do on each thread. If this value is set high enough, then fewer than \code{n_threads} or \code{n_sgd_threads} threads will be used for processing, which might give a performance improvement if the overhead of thread management and context switching was outweighing the improvement due to concurrent processing. This should be left at the default (\code{1}), in which case work will be spread evenly over all the threads specified.} \item{y}{Optional target data for supervised dimension reduction. Can be a vector, matrix or data frame. Use the \code{target_metric} parameter to specify the metrics to use, using the same syntax as \code{metric}. Usually either a single numeric or factor column is used, but more complex formats are possible. The following types are allowed: \itemize{ \item Factor columns with the same length as \code{X}. \code{NA} is allowed for any observation with an unknown level, in which case UMAP operates as a form of semi-supervised learning. Each column is treated separately. \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the parameter \code{target_n_neighbors} to set the number of neighbors used with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors, numeric columns are grouped into one block unless \code{target_metric} specifies otherwise. For example, if you wish columns \code{a} and \code{b} to be treated separately, specify \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise, the data will be effectively treated as a matrix with two columns. \item Nearest neighbor data, consisting of a list of two matrices, \code{idx} and \code{dist}. These represent the precalculated nearest neighbor indices and distances, respectively.
This is the same format as that expected for precalculated data in \code{nn_method}. This format assumes that the underlying data was a numeric vector. Any user-supplied value of the \code{target_n_neighbors} parameter is ignored in this case, because the number of columns in the matrices is used for the value. Multiple nearest neighbor data using different metrics can be supplied by passing a list of these lists. } Unlike \code{X}, all factor columns included in \code{y} are automatically used.} \item{target_n_neighbors}{Number of nearest neighbors to use to construct the target simplicial set. Default value is \code{n_neighbors}. Applies only if \code{y} is non-\code{NULL} and \code{numeric}.} \item{target_metric}{The metric used to measure distance for \code{y} if using supervised dimension reduction. Used only if \code{y} is numeric.} \item{target_weight}{Weighting factor between data topology and target topology. A value of 0.0 weights entirely on data, a value of 1.0 weights entirely on target. The default of 0.5 balances the weighting equally between data and target. Only applies if \code{y} is non-\code{NULL}.} \item{pca}{If set to a positive integer value, reduce data to this number of columns using PCA. Not applied if the distance \code{metric} is \code{"hamming"}, or if the dimensions of the data are not larger than the number specified (i.e. the number of rows and columns must both be larger than the value of this parameter). If you have > 100 columns in a data frame or matrix, reducing the number of columns in this way may substantially increase the performance of the nearest neighbor search at the cost of a potential decrease in accuracy. In many t-SNE applications, a value of 50 is recommended, although there's no guarantee that this is appropriate for all settings.} \item{pca_center}{If \code{TRUE}, center the columns of \code{X} before carrying out PCA. For binary data, it's recommended to set this to \code{FALSE}.} \item{pcg_rand}{If \code{TRUE}, use the PCG random number generator (O'Neill, 2014) during optimization. Otherwise, use the faster (but probably less statistically good) Tausworthe "taus88" generator. The default is \code{TRUE}. This parameter has been superseded by \code{rng_type} -- if both are set, \code{rng_type} takes precedence.} \item{fast_sgd}{If \code{TRUE}, then the following combination of parameters is set: \code{pcg_rand = TRUE} and \code{n_sgd_threads = "auto"}. The default is \code{FALSE}. Setting this to \code{TRUE} will speed up the stochastic optimization phase, but will give a potentially less accurate embedding, which will not be exactly reproducible even with a fixed seed. For visualization, \code{fast_sgd = TRUE} will give perfectly good results. For more generic dimensionality reduction, it's safer to leave \code{fast_sgd = FALSE}. If \code{fast_sgd = TRUE}, then user-supplied values of \code{pcg_rand} and \code{n_sgd_threads} are ignored.} \item{ret_model}{If \code{TRUE}, then return extra data that can be used to add new data to an existing embedding via \code{\link{umap_transform}}. The embedded coordinates are returned as the list item \code{embedding}. If \code{FALSE}, just return the coordinates. This parameter can be used in conjunction with \code{ret_nn} and \code{ret_extra}. Note that some settings are incompatible with the production of a UMAP model: external neighbor data (passed via a list to \code{nn_method}), and factor columns that were included via the \code{metric} parameter.
In the latter case, the model produced is based only on the numeric data. A transformation using new data is possible, but the factor columns in the new data are ignored. Note that setting \code{ret_model = TRUE} forces the use of the approximate nearest neighbors method. Because small datasets would otherwise use exact nearest neighbor calculations, setting \code{ret_model = TRUE} means that different results may be returned for small datasets in terms of both the returned nearest neighbors (if requested) and the final embedded coordinates, compared to \code{ret_model = FALSE}, even if the random number seed is fixed. To avoid this, explicitly set \code{nn_method = "annoy"} in the \code{ret_model = FALSE} case.} \item{ret_nn}{If \code{TRUE}, then in addition to the embedding, also return nearest neighbor data that can be used as input to \code{nn_method} to avoid the overhead of repeatedly calculating the nearest neighbors when manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs}, \code{init}). See the "Value" section for the names of the list items. If \code{FALSE}, just return the coordinates. Note that the nearest neighbors could be sensitive to data scaling, so be wary of reusing nearest neighbor data if modifying the \code{scale} parameter. This parameter can be used in conjunction with \code{ret_model} and \code{ret_extra}.} \item{ret_extra}{A vector indicating what extra data to return. May contain any combination of the following strings: \itemize{ \item \code{"model"} Same as setting \code{ret_model = TRUE}. \item \code{"nn"} Same as setting \code{ret_nn = TRUE}. \item \code{"fgraph"} the high dimensional fuzzy graph (i.e. the fuzzy simplicial set of the merged local views of the input data). The graph is returned as a sparse symmetric N x N matrix of class \link[Matrix]{dgCMatrix-class}, where a non-zero entry (i, j) gives the membership strength of the edge connecting vertex i and vertex j. This can be considered analogous to the input probability (or similarity or affinity) used in t-SNE and LargeVis. Note that the graph is further sparsified by removing edges with sufficiently low membership strength that they would not be sampled by the probabilistic edge sampling employed for optimization and therefore the number of non-zero elements in the matrix is dependent on \code{n_epochs}. If you are only interested in the fuzzy input graph (e.g. for clustering), setting \code{n_epochs = 0} will avoid any further sparsifying. Be aware that setting \code{binary_edge_weights = TRUE} will affect this graph (all non-zero edge weights will be 1). \item \code{"sigma"} the normalization value for each observation in the dataset when constructing the smoothed distances to each of its neighbors. This gives some sense of the local density of each observation in the high dimensional space: higher values of \code{sigma} indicate a higher dispersion or lower density. }} \item{tmpdir}{Temporary directory to store nearest neighbor indexes during nearest neighbor search. Default is \code{\link{tempdir}}. The index is only written to disk if \code{n_threads > 1} and \code{nn_method = "annoy"}; otherwise, this parameter is ignored.} \item{verbose}{If \code{TRUE}, log details to the console.} \item{batch}{If \code{TRUE}, then embedding coordinates are updated at the end of each epoch rather than during the epoch. In batch mode, results are reproducible with a fixed random seed even with \code{n_sgd_threads > 1}, at the cost of a slightly higher memory use. 
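For example, a minimal sketch of a reproducible multi-threaded run (the thread count and seed are arbitrary illustrations): \preformatted{emb1 <- tumap(iris, batch = TRUE, n_sgd_threads = 2, seed = 42)
emb2 <- tumap(iris, batch = TRUE, n_sgd_threads = 2, seed = 42)
# on a given installation, these should match
all.equal(emb1, emb2)
}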
You may also have to modify \code{learning_rate} and increase \code{n_epochs}, so whether this provides a speed increase over the single-threaded optimization is likely to be dataset and hardware-dependent.} \item{opt_args}{A list of optimizer parameters, used when \code{batch = TRUE}. The default optimization method used is Adam (Kingma and Ba, 2014). \itemize{ \item \code{method} The optimization method to use. Either \code{"adam"} or \code{"sgd"} (stochastic gradient descent). Default: \code{"adam"}. \item \code{beta1} (Adam only). The weighting parameter for the exponential moving average of the first moment estimator. Effectively the momentum parameter. Should be a floating point value between 0 and 1. Higher values can smooth oscillatory updates in poorly-conditioned situations and may allow for a larger \code{learning_rate} to be specified, but too high can cause divergence. Default: \code{0.5}. \item \code{beta2} (Adam only). The weighting parameter for the exponential moving average of the uncentered second moment estimator. Should be a floating point value between 0 and 1. Controls the degree of adaptivity in the step-size. Higher values put more weight on previous time steps. Default: \code{0.9}. \item \code{eps} (Adam only). Intended to be a small value to prevent division by zero, but in practice can also affect convergence due to its interaction with \code{beta2}. Higher values reduce the effect of the step-size adaptivity and bring the behavior closer to stochastic gradient descent with momentum. Typical values are between 1e-8 and 1e-3. Default: \code{1e-7}. \item \code{alpha} The initial learning rate. Default: the value of the \code{learning_rate} parameter. }} \item{epoch_callback}{A function which will be invoked at the end of every epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where: \itemize{ \item \code{epoch} The current epoch number (between \code{1} and \code{n_epochs}). \item \code{n_epochs} Number of epochs to use during the optimization of the embedded coordinates. \item \code{coords} The embedded coordinates as of the end of the current epoch, as a matrix with dimensions (N, \code{n_components}). }} \item{pca_method}{Method to carry out any PCA dimensionality reduction when the \code{pca} parameter is specified. Allowed values are: \itemize{ \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the \href{https://cran.r-project.org/package=irlba}{irlba} package. \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from the \href{https://cran.r-project.org/package=irlba}{irlba} package. This is likely to give much faster but potentially less accurate results than using \code{"irlba"}. For the purposes of nearest neighbor calculation and coordinates initialization, any loss of accuracy doesn't seem to matter much. \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}} from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr} package. The SVD methods used in \code{bigstatsr} may be faster on systems without access to efficient linear algebra libraries (e.g. Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of uwot: if you choose to use this package for PCA, you \emph{must} install it yourself. \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is likely to be slow for all but the smallest datasets. \item{\code{"auto"}} (the default). 
Uses \code{"irlba"}, unless more than 50\% of the full set of singular vectors would be calculated, in which case \code{"svd"} is used. }} \item{binary_edge_weights}{If \code{TRUE} then edge weights in the input graph are treated as binary (0/1) rather than real valued. This affects the sampling frequency of neighbors and is the strategy used by the PaCMAP method (Wang and co-workers, 2020). Practical (Böhm and co-workers, 2020) and theoretical (Damrich and Hamprecht, 2021) work suggests this has little effect on UMAP's performance.} \item{seed}{Integer seed to use to initialize the random number generator state. Combined with \code{n_sgd_threads = 1} or \code{batch = TRUE}, this should give consistent output across multiple runs on a given installation. Setting this value is equivalent to calling \code{\link[base]{set.seed}}, but it may be more convenient in some situations than having to call a separate function. The default is to not set a seed. If \code{ret_model = TRUE}, the seed will be stored in the output model and then used to set the seed inside \code{\link{umap_transform}}.} \item{nn_args}{A list containing additional arguments to pass to the nearest neighbor method. For \code{nn_method = "annoy"}, you can specify \code{"n_trees"} and \code{"search_k"}, and these will override the \code{n_trees} and \code{search_k} parameters. For \code{nn_method = "hnsw"}, you may specify the following arguments: \itemize{ \item \code{M} The maximum number of neighbors to keep for each vertex. Reasonable values are \code{2} to \code{100}. Higher values give better recall at the cost of more memory. Default value is \code{16}. \item \code{ef_construction} A positive integer specifying the size of the dynamic list used during index construction. A higher value will provide better results at the cost of a longer time to build the index. Default is \code{200}. \item \code{ef} A positive integer specifying the size of the dynamic list used during search. This cannot be smaller than \code{n_neighbors} and cannot be higher than the number of items in the index. Default is \code{10}. } For \code{nn_method = "nndescent"}, you may specify the following arguments: \itemize{ \item \code{n_trees} The number of trees to use in a random projection forest to initialize the search. A larger number will give more accurate results at the cost of a longer computation time. The default of \code{NULL} means that the number is chosen based on the number of observations in \code{X}. \item \code{max_candidates} The number of potential neighbors to explore per iteration. By default, this is set to \code{n_neighbors} or \code{60}, whichever is smaller. A larger number will give more accurate results at the cost of a longer computation time. \item \code{n_iters} The number of iterations to run the search. A larger number will give more accurate results at the cost of a longer computation time. By default, this will be chosen based on the number of observations in \code{X}. You may also need to modify the convergence criterion \code{delta}. \item \code{delta} The minimum relative change in the neighbor graph allowed before early stopping. Should be a value between 0 and 1. The smaller the value, the smaller the amount of progress between iterations is allowed. Default value of \code{0.001} means that at least 0.1\% of the neighbor graph must be updated at each iteration. \item \code{init} How to initialize the nearest neighbor descent. By default this is set to \code{"tree"} and uses a random projection forest.
If you set this to \code{"rand"}, then a random selection is used. Usually this is less accurate than using RP trees, but for high-dimensional cases, there may be little difference in the quality of the initialization and random initialization will be a lot faster. If you set this to \code{"rand"}, then the \code{n_trees} parameter is ignored. \item \code{pruning_degree_multiplier} The maximum number of edges per node to retain in the search graph, relative to \code{n_neighbors}. A larger value will give more accurate results at the cost of a longer computation time. Default is \code{1.5}. This parameter only affects neighbor search when transforming new data with \code{\link{umap_transform}}. \item \code{epsilon} Controls the degree of back-tracking when traversing the search graph. Setting this to \code{0.0} will do a greedy search with no back-tracking. A larger value will give more accurate results at the cost of a longer computation time. Default is \code{0.1}. This parameter only affects neighbor search when transforming new data with \code{\link{umap_transform}}. \item \code{max_search_fraction} Specifies the maximum fraction of the search graph to traverse. By default, this is set to \code{1.0}, so the entire graph (i.e. all items in \code{X}) may be visited. You may want to set this to a smaller value if you have a very large dataset (in conjunction with \code{epsilon}) to avoid an inefficient exhaustive search of the data in \code{X}. This parameter only affects neighbor search when transforming new data with \code{\link{umap_transform}}. }} \item{rng_type}{The type of random number generator to use during optimization. One of: \itemize{ \item{\code{"pcg"}}. Use the PCG random number generator (O'Neill, 2014). \item{\code{"tausworthe"}}. Use the Tausworthe "taus88" generator. \item{\code{"deterministic"}}. Use a deterministic number generator. This isn't actually random, but may provide enough variation in the negative sampling to give a good embedding and can provide a noticeable speed-up. } For backwards compatibility, by default this is unset and the choice of \code{pcg_rand} is used (making "pcg" the effective default).} } \value{ A matrix of optimized coordinates, or: \itemize{ \item if \code{ret_model = TRUE} (or \code{ret_extra} contains \code{"model"}), returns a list containing extra information that can be used to add new data to an existing embedding via \code{\link{umap_transform}}. In this case, the coordinates are available in the list item \code{embedding}. \bold{NOTE}: The contents of the \code{model} list should \emph{not} be considered stable or part of the public API, and are purposely left undocumented. \item if \code{ret_nn = TRUE} (or \code{ret_extra} contains \code{"nn"}), returns the nearest neighbor data as a list called \code{nn}. This contains one list for each \code{metric} calculated, itself containing a matrix \code{idx} with the integer ids of the neighbors; and a matrix \code{dist} with the distances. The \code{nn} list (or a sub-list) can be used as input to the \code{nn_method} parameter. \item if \code{ret_extra} contains \code{"fgraph"}, returns the high dimensional fuzzy graph as a sparse matrix called \code{fgraph}, of type \link[Matrix]{dgCMatrix-class}. \item if \code{ret_extra} contains \code{"sigma"}, returns a vector of the smooth knn distance normalization terms for each observation as \code{"sigma"} and a vector \code{"rho"} containing the largest distance to the locally connected neighbors of each observation. \item if \code{ret_extra} contains \code{"localr"}, returns a vector of the estimated local radii, the sum of \code{"sigma"} and \code{"rho"}. } The returned list contains the combined data from any combination of specifying \code{ret_model}, \code{ret_nn} and \code{ret_extra}. } \description{ A faster (but less flexible) version of the UMAP (McInnes et al., 2018) gradient. For more detail on UMAP, see the \code{\link{umap}} function.
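Equivalently, t-UMAP fixes the output weight function to the Cauchy kernel \deqn{w(d) = \frac{1}{1 + d^2}}{w(d) = 1 / (1 + d^2)} i.e. the UMAP weight function with \code{a = 1, b = 1}.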
} \details{ By setting the UMAP curve parameters \code{a} and \code{b} to \code{1}, you get back the Cauchy distribution as used in t-SNE (van der Maaten and Hinton, 2008) and LargeVis (Tang et al., 2016). It also results in a substantially simplified gradient expression. This can give a speed improvement of around 50\%. } \examples{ iris_tumap <- tumap(iris, n_neighbors = 50, learning_rate = 0.5) } \references{ Belkin, M., & Niyogi, P. (2002). Laplacian eigenmaps and spectral techniques for embedding and clustering. In \emph{Advances in neural information processing systems} (pp. 585-591). \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf} Böhm, J. N., Berens, P., & Kobak, D. (2020). A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum. \emph{arXiv preprint} \emph{arXiv:2007.08902}. \url{https://arxiv.org/abs/2007.08902} Damrich, S., & Hamprecht, F. A. (2021). On UMAP's true loss function. \emph{Advances in Neural Information Processing Systems}, \emph{34}. \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html} Dong, W., Moses, C., & Li, K. (2011, March). Efficient k-nearest neighbor graph construction for generic similarity measures. In \emph{Proceedings of the 20th international conference on World Wide Web} (pp. 577-586). ACM. \doi{10.1145/1963405.1963487}. Kingma, D. P., & Ba, J. (2014). Adam: A method for stochastic optimization. \emph{arXiv preprint} \emph{arXiv}:1412.6980. \url{https://arxiv.org/abs/1412.6980} Malkov, Y. A., & Yashunin, D. A. (2018). Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs. \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836. McInnes, L., Healy, J., & Melville, J. (2018). UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction \emph{arXiv preprint} \emph{arXiv}:1802.03426. \url{https://arxiv.org/abs/1802.03426} O'Neill, M. E. (2014). \emph{PCG: A family of simple fast space-efficient statistically good algorithms for random number generation} (Report No. HMC-CS-2014-0905). Harvey Mudd College. Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). Visualizing large-scale and high-dimensional data. In \emph{Proceedings of the 25th International Conference on World Wide Web} (pp. 287-297). International World Wide Web Conferences Steering Committee. \url{https://arxiv.org/abs/1602.00370} Van der Maaten, L., & Hinton, G. (2008). Visualizing data using t-SNE. \emph{Journal of Machine Learning Research}, \emph{9} (2579-2605). \url{https://www.jmlr.org/papers/v9/vandermaaten08a.html} Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021). Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization. \emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73. 
\url{https://www.jmlr.org/papers/v22/20-1061.html} } uwot/man/simplicial_set_intersect.Rd0000644000176200001440000000327314730166740017410 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/uwot.R \name{simplicial_set_intersect} \alias{simplicial_set_intersect} \title{Merge Similarity Graph by Simplicial Set Intersection} \usage{ simplicial_set_intersect(x, y, weight = 0.5, n_threads = NULL, verbose = FALSE) } \arguments{ \item{x}{A sparse matrix representing the first similarity graph in the intersection operation.} \item{y}{A sparse matrix representing the second similarity graph in the intersection operation.} \item{weight}{A value between \code{0} and \code{1}, controlling the relative influence of \code{x} and \code{y} in the intersection. Default (\code{0.5}) gives equal influence. Values smaller than \code{0.5} put more weight on \code{x}. Values greater than \code{0.5} put more weight on \code{y}.} \item{n_threads}{Number of threads to use when resetting the local metric. Default is half the number of concurrent threads supported by the system.} \item{verbose}{If \code{TRUE}, log progress to the console.} } \value{ A sparse matrix containing the intersection of \code{x} and \code{y}. } \description{ Combine two similarity graphs by treating them as fuzzy topological sets and forming the intersection. } \examples{ # Form two different "views" of the same data iris30 <- iris[c(1:10, 51:60, 101:110), ] iris_sg12 <- similarity_graph(iris30[, 1:2], n_neighbors = 5) iris_sg34 <- similarity_graph(iris30[, 3:4], n_neighbors = 5) # Combine the two representations into one iris_combined <- simplicial_set_intersect(iris_sg12, iris_sg34) # Optimize the layout based on the combined view iris_combined_umap <- optimize_graph_layout(iris_combined, n_epochs = 100) } uwot/man/similarity_graph.Rd0000644000176200001440000006346714730166740015705 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/uwot.R \name{similarity_graph} \alias{similarity_graph} \title{Similarity Graph} \usage{ similarity_graph( X = NULL, n_neighbors = NULL, metric = "euclidean", scale = NULL, set_op_mix_ratio = 1, local_connectivity = 1, nn_method = NULL, n_trees = 50, search_k = 2 * n_neighbors * n_trees, perplexity = 50, method = "umap", y = NULL, target_n_neighbors = n_neighbors, target_metric = "euclidean", target_weight = 0.5, pca = NULL, pca_center = TRUE, ret_extra = c(), n_threads = NULL, grain_size = 1, kernel = "gauss", tmpdir = tempdir(), verbose = getOption("verbose", TRUE), pca_method = NULL, binary_edge_weights = FALSE, nn_args = list() ) } \arguments{ \item{X}{Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}}, \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}. Matrices and data frames should contain one observation per row. Data frames will have any non-numeric columns removed, although factor columns will be used if explicitly included via \code{metric} (see the help for \code{metric} for details). A sparse matrix is interpreted as a distance matrix, and is assumed to be symmetric, so you can also pass in an explicitly upper or lower triangular sparse matrix to save storage. There must be at least \code{n_neighbors} non-zero distances for each row. Both implicit and explicit zero entries are ignored. Set zero distances you want to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}). 
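A minimal sketch of the sparse distance matrix input just described (the ten-row iris subset is an arbitrary illustration; \code{Matrix} is already a dependency of this package): \preformatted{
library(Matrix)
iris10 <- as.matrix(iris[1:10, 1:4])
# a symmetric distance matrix stored sparsely; the zero diagonal is an
# implicit zero and is ignored, leaving 9 non-zero distances per row
d10 <- Matrix(as.matrix(dist(iris10)), sparse = TRUE)
sg <- similarity_graph(d10, n_neighbors = 4)
}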
\code{X} can also be \code{NULL} if pre-computed nearest neighbor data is passed to \code{nn_method}.} \item{n_neighbors}{The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation. Larger values result in more global views of the manifold, while smaller values result in more local data being preserved. In general values should be in the range \code{2} to \code{100}.} \item{metric}{Type of distance metric to use to find nearest neighbors. For \code{nn_method = "annoy"} this can be one of: \itemize{ \item \code{"euclidean"} (the default) \item \code{"cosine"} \item \code{"manhattan"} \item \code{"hamming"} \item \code{"correlation"} (a distance based on the Pearson correlation) \item \code{"categorical"} (see below) } For \code{nn_method = "hnsw"} this can be one of: \itemize{ \item \code{"euclidean"} \item \code{"cosine"} \item \code{"correlation"} } If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is installed and \code{nn_method = "nndescent"} is specified then many more metrics are available, including: \itemize{ \item \code{"braycurtis"} \item \code{"canberra"} \item \code{"chebyshev"} \item \code{"dice"} \item \code{"hamming"} \item \code{"hellinger"} \item \code{"jaccard"} \item \code{"jensenshannon"} \item \code{"kulsinski"} \item \code{"rogerstanimoto"} \item \code{"russellrao"} \item \code{"sokalmichener"} \item \code{"sokalsneath"} \item \code{"spearmanr"} \item \code{"symmetrickl"} \item \code{"tsss"} \item \code{"yule"} } For more details see the package documentation of \code{rnndescent}. For \code{nn_method = "fnn"}, the distance metric is always "euclidean". If \code{X} is a data frame or matrix, then multiple metrics can be specified, by passing a list to this argument, where the name of each item in the list is one of the metric names above. The value of each list item should be a vector giving the names or integer ids of the columns to be included in a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}. Each metric calculation results in a separate fuzzy simplicial set, which are intersected together to produce the final set. Metric names can be repeated. Because non-numeric columns are removed from the data frame, it is safer to use column names than integer ids. Factor columns can also be used by specifying the metric name \code{"categorical"}. Factor columns are treated differently from numeric columns and although multiple factor columns can be specified in a vector, each factor column specified is processed individually. If you specify a non-factor column, it will be coerced to a factor. For a given data block, you may override the \code{pca} and \code{pca_center} arguments for that block, by providing a list with one unnamed item containing the column names or ids, and then any of the \code{pca} or \code{pca_center} overrides as named items, e.g. \code{metric = list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This exists to allow mixed binary and real-valued data to be included and to have PCA applied to both, but with centering applied only to the real-valued data (it is typical not to apply centering to binary data before PCA is applied).} \item{scale}{Scaling to apply to \code{X} if it is a data frame or matrix: \itemize{ \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling. \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to zero mean and variance 1. 
\item{\code{"maxabs"}} Center each column to mean 0, then divide each element by the maximum absolute value over the entire matrix. \item{\code{"range"}} Range scale the entire matrix, so the smallest element is 0 and the largest is 1. \item{\code{"colrange"}} Scale each column in the range (0,1). } For \code{method} \code{"umap"}, the default is \code{"none"}. For \code{"largevis"}, the default is \code{"maxabs"}.} \item{set_op_mix_ratio}{Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial sets. Both fuzzy set operations use the product t-norm. The value of this parameter should be between \code{0.0} and \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while \code{0.0} will use a pure fuzzy intersection. Ignored if \code{method = "largevis"}} \item{local_connectivity}{The local connectivity required -- i.e. the number of nearest neighbors that should be assumed to be connected at a local level. The higher this value the more connected the manifold becomes locally. In practice this should be not more than the local intrinsic dimension of the manifold. Ignored if \code{method = "largevis"}.} \item{nn_method}{Method for finding nearest neighbors. Options are: \itemize{ \item \code{"fnn"}. Use exact nearest neighbors via the \href{https://cran.r-project.org/package=FNN}{FNN} package. \item \code{"annoy"} Use approximate nearest neighbors via the \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package. \item \code{"hnsw"} Use approximate nearest neighbors with the Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin, 2018) via the \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package. \code{RcppHNSW} is not a dependency of this package: this option is only available if you have installed \code{RcppHNSW} yourself. Also, HNSW only supports the following arguments for \code{metric} and \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and \code{"correlation"}. \item \code{"nndescent"} Use approximate nearest neighbors with the Nearest Neighbor Descent method (Dong et al., 2011) via the \href{https://cran.r-project.org/package=rnndescent}{rnndescent} package. \code{rnndescent} is not a dependency of this package: this option is only available if you have installed \code{rnndescent} yourself. } By default, if \code{X} has less than 4,096 vertices, the exact nearest neighbors are found. Otherwise, approximate nearest neighbors are used. You may also pass pre-calculated nearest neighbor data to this argument. It must be one of two formats, either a list consisting of two elements: \itemize{ \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix containing the integer indexes of the nearest neighbors in \code{X}. Each vertex is considered to be its own nearest neighbor, i.e. \code{idx[, 1] == 1:n_vertices}. \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix containing the distances of the nearest neighbors. } or a sparse distance matrix of type \code{dgCMatrix}, with dimensions \code{n_vertices x n_vertices}. Distances should be arranged by column, i.e. a non-zero entry in row \code{j} of the \code{i}th column indicates that the \code{j}th observation in \code{X} is a nearest neighbor of the \code{i}th observation with the distance given by the value of that element. The \code{n_neighbors} parameter is ignored when using precomputed nearest neighbor data. 
If using the sparse distance matrix input, each column can contain a different number of neighbors.} \item{n_trees}{Number of trees to build when constructing the nearest neighbor index. The more trees specified, the larger the index, but the better the results. With \code{search_k}, determines the accuracy of the Annoy nearest neighbor search. Only used if the \code{nn_method} is \code{"annoy"}. Sensible values are between \code{10} and \code{100}.} \item{search_k}{Number of nodes to search during the neighbor retrieval. The larger \code{search_k}, the more accurate the results, but the longer the search takes. With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor search. Only used if the \code{nn_method} is \code{"annoy"}.} \item{perplexity}{Used only if \code{method = "largevis"}. Controls the size of the local neighborhood used for manifold approximation. Should be a value between 1 and one less than the number of items in \code{X}. If specified, you should \emph{not} specify a value for \code{n_neighbors} unless you know what you are doing.} \item{method}{How to generate the similarities between items. One of: \itemize{ \item \code{"umap"} The UMAP method of McInnes et al. (2018). \item \code{"largevis"} The LargeVis method of Tang et al. (2016). }} \item{y}{Optional target data to add supervised or semi-supervised weighting to the similarity graph. Can be a vector, matrix or data frame. Use the \code{target_metric} parameter to specify the metrics to use, using the same syntax as \code{metric}. Usually either a single numeric or factor column is used, but more complex formats are possible. The following types are allowed: \itemize{ \item Factor columns with the same length as \code{X}. \code{NA} is allowed for any observation with an unknown level, in which case UMAP operates as a form of semi-supervised learning. Each column is treated separately. \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the parameter \code{target_n_neighbors} to set the number of neighbors used with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors, numeric columns are grouped into one block unless \code{target_metric} specifies otherwise. For example, if you wish columns \code{a} and \code{b} to be treated separately, specify \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise, the data will be effectively treated as a matrix with two columns. \item Nearest neighbor data, consisting of a list of two matrices, \code{idx} and \code{dist}. These represent the precalculated nearest neighbor indices and distances, respectively. This is the same format as that expected for precalculated data in \code{nn_method}. This format assumes that the underlying data was a numeric vector. Any user-supplied value of the \code{target_n_neighbors} parameter is ignored in this case, because the number of columns in the matrices is used for the value. Multiple nearest neighbor data using different metrics can be supplied by passing a list of these lists. } Unlike \code{X}, all factor columns included in \code{y} are automatically used. This parameter is ignored if \code{method = "largevis"}.} \item{target_n_neighbors}{Number of nearest neighbors to use to construct the target simplicial set. Default value is \code{n_neighbors}. Applies only if \code{y} is non-\code{NULL} and \code{numeric}. This parameter is ignored if \code{method = "largevis"}.} \item{target_metric}{The metric used to measure distance for \code{y} if using supervised dimension reduction. 
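A minimal sketch of the semi-supervised weighting described for \code{y} (the rows set to \code{NA} are an arbitrary illustration): \preformatted{
iris30 <- iris[c(1:10, 51:60, 101:110), ]
y <- iris30$Species
y[c(3, 17, 25)] <- NA   # unknown labels are allowed for a factor y
sg_sup <- similarity_graph(iris30[, 1:4], n_neighbors = 5,
                           y = y, target_weight = 0.7)
}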
Used only if \code{y} is numeric. This parameter is ignored if \code{method = "largevis"}.} \item{target_weight}{Weighting factor between data topology and target topology. A value of 0.0 weights entirely on data, a value of 1.0 weights entirely on target. The default of 0.5 balances the weighting equally between data and target. Only applies if \code{y} is non-\code{NULL}. This parameter is ignored if \code{method = "largevis"}.} \item{pca}{If set to a positive integer value, reduce data to this number of columns using PCA. Not applied if the distance \code{metric} is \code{"hamming"}, or if the dimensions of the data are not larger than the number specified (i.e. the number of rows and columns must both be larger than the value of this parameter). If you have > 100 columns in a data frame or matrix, reducing the number of columns in this way may substantially increase the performance of the nearest neighbor search at the cost of a potential decrease in accuracy. In many t-SNE applications, a value of 50 is recommended, although there's no guarantee that this is appropriate for all settings.} \item{pca_center}{If \code{TRUE}, center the columns of \code{X} before carrying out PCA. For binary data, it's recommended to set this to \code{FALSE}.} \item{ret_extra}{A vector indicating what extra data to return. May contain any combination of the following strings: \itemize{ \item \code{"nn"} nearest neighbor data that can be used as input to \code{nn_method} to avoid the overhead of repeatedly calculating the nearest neighbors when manipulating unrelated parameters. See the "Value" section for the names of the list items. Note that the nearest neighbors could be sensitive to data scaling, so be wary of reusing nearest neighbor data if modifying the \code{scale} parameter. \item \code{"sigma"} the normalization value for each observation in the dataset when constructing the smoothed distances to each of its neighbors. This gives some sense of the local density of each observation in the high dimensional space: higher values of \code{sigma} indicate a higher dispersion or lower density. }} \item{n_threads}{Number of threads to use. Default is half the number of concurrent threads supported by the system. For nearest neighbor search, only applies if \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index will be temporarily written to disk in the location determined by \code{\link[base]{tempfile}}.} \item{grain_size}{The minimum amount of work to do on each thread. If this value is set high enough, then less than \code{n_threads} will be used for processing, which might give a performance improvement if the overhead of thread management and context switching was outweighing the improvement due to concurrent processing. This should be left at default (\code{1}) and work will be spread evenly over all the threads specified.} \item{kernel}{Used only if \code{method = "largevis"}. Type of kernel function to create input similarities. Can be one of \code{"gauss"} (the default) or \code{"knn"}. \code{"gauss"} uses the usual Gaussian weighted similarities. \code{"knn"} assigns equal similarities to every edge in the nearest neighbor graph, and zero otherwise, using \code{perplexity} nearest neighbors. The \code{n_neighbors} parameter is ignored in this case.} \item{tmpdir}{Temporary directory to store nearest neighbor indexes during nearest neighbor search. Default is \code{\link{tempdir}}. 
The index is only written to disk if \code{n_threads > 1} and \code{nn_method = "annoy"}; otherwise, this parameter is ignored.} \item{verbose}{If \code{TRUE}, log details to the console.} \item{pca_method}{Method to carry out any PCA dimensionality reduction when the \code{pca} parameter is specified. Allowed values are: \itemize{ \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the \href{https://cran.r-project.org/package=irlba}{irlba} package. \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from the \href{https://cran.r-project.org/package=irlba}{irlba} package. This is likely to give much faster but potentially less accurate results than using \code{"irlba"}. For the purposes of nearest neighbor calculation and coordinates initialization, any loss of accuracy doesn't seem to matter much. \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}} from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr} package. The SVD methods used in \code{bigstatsr} may be faster on systems without access to efficient linear algebra libraries (e.g. Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of uwot: if you choose to use this package for PCA, you \emph{must} install it yourself. \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is likely to be slow for all but the smallest datasets. \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than 50\% of the full set of singular vectors would be calculated, in which case \code{"svd"} is used. }} \item{binary_edge_weights}{If \code{TRUE} then edge weights of the returned graph are binary (0/1) rather than reflecting the degree of similarity.} \item{nn_args}{A list containing additional arguments to pass to the nearest neighbor method. For \code{nn_method = "annoy"}, you can specify \code{"n_trees"} and \code{"search_k"}, and these will override the \code{n_trees} and \code{search_k} parameters. For \code{nn_method = "hnsw"}, you may specify the following arguments: \itemize{ \item \code{M} The maximum number of neighbors to keep for each vertex. Reasonable values are \code{2} to \code{100}. Higher values give better recall at the cost of more memory. Default value is \code{16}. \item \code{ef_construction} A positive integer specifying the size of the dynamic list used during index construction. A higher value will provide better results at the cost of a longer time to build the index. Default is \code{200}. \item \code{ef} A positive integer specifying the size of the dynamic list used during search. This cannot be smaller than \code{n_neighbors} and cannot be higher than the number of items in the index. Default is \code{10}. } For \code{nn_method = "nndescent"}, you may specify the following arguments: \itemize{ \item \code{n_trees} The number of trees to use in a random projection forest to initialize the search. A larger number will give more accurate results at the cost of a longer computation time. The default of \code{NULL} means that the number is chosen based on the number of observations in \code{X}. \item \code{max_candidates} The number of potential neighbors to explore per iteration. By default, this is set to \code{n_neighbors} or \code{60}, whichever is smaller. A larger number will give more accurate results at the cost of a longer computation time. \item \code{n_iters} The number of iterations to run the search. A larger number will give more accurate results at the cost of a longer computation time. 
By default, this will be chosen based on the number of observations in \code{X}. You may also need to modify the convergence criterion \code{delta}. \item \code{delta} The minimum relative change in the neighbor graph allowed before early stopping. Should be a value between 0 and 1. The smaller the value, the smaller the change between iterations needs to be before the search stops early (i.e. the search runs for longer). Default value of \code{0.001} means that at least 0.1\% of the neighbor graph must be updated at each iteration. \item \code{init} How to initialize the nearest neighbor descent. By default this is set to \code{"tree"} and uses a random projection forest. If you set this to \code{"rand"}, then a random selection is used. Usually this is less accurate than using RP trees, but for high-dimensional cases, there may be little difference in the quality of the initialization and random initialization will be a lot faster. If you set this to \code{"rand"}, then the \code{n_trees} parameter is ignored. \item \code{pruning_degree_multiplier} The maximum number of edges per node to retain in the search graph, relative to \code{n_neighbors}. A larger value will give more accurate results at the cost of a longer computation time. Default is \code{1.5}. This parameter only affects neighbor search when transforming new data with \code{\link{umap_transform}}. \item \code{epsilon} Controls the degree of back-tracking when traversing the search graph. Setting this to \code{0.0} will do a greedy search with no back-tracking. A larger value will give more accurate results at the cost of a longer computation time. Default is \code{0.1}. This parameter only affects neighbor search when transforming new data with \code{\link{umap_transform}}. \item \code{max_search_fraction} Specifies the maximum fraction of the search graph to traverse. By default, this is set to \code{1.0}, so the entire graph (i.e. all items in \code{X}) may be visited. You may want to set this to a smaller value if you have a very large dataset (in conjunction with \code{epsilon}) to avoid an inefficient exhaustive search of the data in \code{X}. This parameter only affects neighbor search when transforming new data with \code{\link{umap_transform}}. }} } \value{ A sparse symmetrized matrix of the similarities between the items in \code{X}, or, if \code{nn_method} contains pre-computed nearest neighbor data, the items in \code{nn_method}. Because of the symmetrization, there may be more non-zero items in each column than the specified value of \code{n_neighbors} (or pre-computed neighbors in \code{nn_method}). If \code{ret_extra} is specified then the return value will be a list containing: \itemize{ \item \code{similarity_graph} the similarity graph as a sparse matrix as described above. \item \code{nn} (if \code{ret_extra} contained \code{"nn"}) the nearest neighbor data as a list called \code{nn}. This contains one list for each \code{metric} calculated, itself containing a matrix \code{idx} with the integer ids of the neighbors; and a matrix \code{dist} with the distances. The \code{nn} list (or a sub-list) can be used as input to the \code{nn_method} parameter. \item \code{sigma} (if \code{ret_extra} contains \code{"sigma"}), a vector of calibrated parameters, one for each item in the input data, reflecting the local data density for that item. The exact definition of the values depends on the choice of the \code{method} parameter. 
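A minimal sketch of retrieving these per-observation values (the names follow the list items described here): \preformatted{
iris30 <- iris[c(1:10, 51:60, 101:110), ]
res <- similarity_graph(iris30[, 1:4], n_neighbors = 5,
                        ret_extra = c("sigma", "localr"))
# res$similarity_graph is the sparse graph; sigma, rho and localr are
# vectors with one entry per observation
summary(res$localr)
}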
\item \code{rho} (if \code{ret_extra} contains \code{"sigma"}), a vector containing the largest distance to the locally connected neighbors of each item in the input data. This will exist only if \code{method = "umap"}. \item \code{localr} (if \code{ret_extra} contains \code{"localr"}) a vector of the estimated local radii, the sum of \code{"sigma"} and \code{"rho"}. This will exist only if \code{method = "umap"}. } } \description{ Create a graph (as a sparse symmetric weighted adjacency matrix) representing the similarities between items in a data set. No dimensionality reduction is carried out. By default, the similarities are calculated using the merged fuzzy simplicial set approach in the Uniform Manifold Approximation and Projection (UMAP) method (McInnes et al., 2018), but the approach from LargeVis (Tang et al., 2016) can also be used. } \details{ This is equivalent to running \code{\link{umap}} with the \code{ret_extra = c("fgraph")} parameter, but without the overhead of calculating (or returning) the optimized low-dimensional coordinates. } \examples{ iris30 <- iris[c(1:10, 51:60, 101:110), ] # return a 30 x 30 sparse matrix with similarity data based on 10 nearest # neighbors per item iris30_sim_graph <- similarity_graph(iris30, n_neighbors = 10) # Default is to use the UMAP method of calculating similarities, but LargeVis # is also available: for that method, use perplexity instead of n_neighbors # to control neighborhood size. Use ret_extra = "nn" to return nearest # neighbor data as well as the similarity graph. Return value is a list # containing 'similarity_graph' and 'nn' items. iris30_lv_graph <- similarity_graph(iris30, perplexity = 10, method = "largevis", ret_extra = "nn" ) # If you have the neighbor information you don't need the original data iris30_lv_graph_nn <- similarity_graph( nn_method = iris30_lv_graph$nn, perplexity = 10, method = "largevis" ) all(iris30_lv_graph_nn == iris30_lv_graph$similarity_graph) } \references{ Dong, W., Moses, C., & Li, K. (2011, March). Efficient k-nearest neighbor graph construction for generic similarity measures. In \emph{Proceedings of the 20th international conference on World Wide Web} (pp. 577-586). ACM. \doi{10.1145/1963405.1963487}. Malkov, Y. A., & Yashunin, D. A. (2018). Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs. \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836. McInnes, L., Healy, J., & Melville, J. (2018). UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction \emph{arXiv preprint} \emph{arXiv}:1802.03426. \url{https://arxiv.org/abs/1802.03426} Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). Visualizing large-scale and high-dimensional data. In \emph{Proceedings of the 25th International Conference on World Wide Web} (pp. 287-297). International World Wide Web Conferences Steering Committee. 
\url{https://arxiv.org/abs/1602.00370} } uwot/man/optimize_graph_layout.Rd0000644000176200001440000003621614735021202016734 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/uwot.R \name{optimize_graph_layout} \alias{optimize_graph_layout} \title{Optimize Graph Layout} \usage{ optimize_graph_layout( graph, X = NULL, n_components = 2, n_epochs = NULL, learning_rate = 1, init = "spectral", init_sdev = NULL, spread = 1, min_dist = 0.01, repulsion_strength = 1, negative_sample_rate = 5, a = NULL, b = NULL, method = "umap", approx_pow = FALSE, pcg_rand = TRUE, fast_sgd = FALSE, n_sgd_threads = 0, grain_size = 1, verbose = getOption("verbose", TRUE), batch = FALSE, opt_args = NULL, epoch_callback = NULL, pca_method = NULL, binary_edge_weights = FALSE, rng_type = NULL ) } \arguments{ \item{graph}{A sparse, symmetric N x N weighted adjacency matrix representing a graph. Non-zero entries indicate an edge between two nodes with a given edge weight. There can be a varying number of non-zero entries in each row/column.} \item{X}{Optional input data. Used only for PCA-based initialization.} \item{n_components}{The dimension of the space to embed into. This defaults to \code{2} to provide easy visualization, but can reasonably be set to any integer value in the range \code{2} to \code{100}.} \item{n_epochs}{Number of epochs to use during the optimization of the embedded coordinates. By default, this value is set to \code{500} for datasets containing 10,000 vertices or fewer, and \code{200} otherwise. If \code{n_epochs = 0}, then coordinates determined by \code{"init"} will be returned.} \item{learning_rate}{Initial learning rate used in optimization of the coordinates.} \item{init}{Type of initialization for the coordinates. Options are: \itemize{ \item \code{"spectral"} Spectral embedding using the normalized Laplacian of the fuzzy 1-skeleton, with Gaussian noise added. \item \code{"normlaplacian"}. Spectral embedding using the normalized Laplacian of the fuzzy 1-skeleton, without noise. \item \code{"random"}. Coordinates assigned using a uniform random distribution between -10 and 10. \item \code{"lvrandom"}. Coordinates assigned using a Gaussian distribution with standard deviation 1e-4, as used in LargeVis (Tang et al., 2016) and t-SNE. \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap. \item \code{"pca"}. The first two principal components from PCA of \code{X} if \code{X} is a data frame, and from a 2-dimensional classical MDS if \code{X} is of class \code{"dist"}. \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled so the standard deviation is 1e-4, to give a distribution similar to that used in t-SNE. This is an alias for \code{init = "pca", init_sdev = 1e-4}. \item \code{"agspectral"} An "approximate global" modification of \code{"spectral"} which sets all edges in the graph to a value of 1, and then sets a random number of edges (\code{negative_sample_rate} edges per vertex) to 0.1, to approximate the effect of non-local affinities. \item A matrix of initial coordinates. } For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"}, \code{"laplacian"}, \code{"agspectral"}), if more than one connected component is identified, no spectral initialization is attempted. Instead, a PCA-based initialization is attempted. If \code{verbose = TRUE} the number of connected components is logged to the console. 
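A minimal sketch of supplying your own initial coordinates (here from \code{prcomp}; any N x \code{n_components} matrix can be used): \preformatted{
iris30 <- iris[c(1:10, 51:60, 101:110), ]
sg <- similarity_graph(iris30[, 1:4], n_neighbors = 5)
init_coords <- prcomp(iris30[, 1:4], rank. = 2)$x
emb <- optimize_graph_layout(sg, init = init_coords, init_sdev = 1e-4)
}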
The existence of multiple connected components implies that a global view of the data cannot be attained with this initialization. Increasing the value of \code{n_neighbors} may help.} \item{init_sdev}{If non-\code{NULL}, scales each dimension of the initialized coordinates (including any user-supplied matrix) to this standard deviation. By default no scaling is carried out, except when \code{init = "spca"}, in which case the value is \code{0.0001}. Scaling the input may help if the unscaled versions result in initial coordinates with large inter-point distances or outliers. This usually results in small gradients during optimization and very little progress being made to the layout. Shrinking the initial embedding by rescaling can help under these circumstances. Scaling the result of \code{init = "pca"} is usually recommended, hence \code{init = "spca"} is provided as an alias for \code{init = "pca", init_sdev = 1e-4}; but for the spectral initializations the scaled versions usually aren't necessary unless you are using a large value of \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). For compatibility with recent versions of the Python UMAP package, if you are using \code{init = "spectral"}, then you should also set \code{init_sdev = "range"}, which will range scale each of the columns containing the initial data between 0-10. This is not set by default to maintain backwards compatibility with previous versions of uwot.} \item{spread}{The effective scale of embedded points. In combination with \code{min_dist}, this determines how clustered/clumped the embedded points are.} \item{min_dist}{The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together, while larger values will result in a more even dispersal of points. The value should be set relative to the \code{spread} value, which determines the scale at which embedded points will be spread out.} \item{repulsion_strength}{Weighting applied to negative samples in low dimensional embedding optimization. Values higher than one will result in greater weight being given to negative samples.} \item{negative_sample_rate}{The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low dimensional embedding.} \item{a}{More specific parameters controlling the embedding. If \code{NULL} these values are set automatically as determined by \code{min_dist} and \code{spread}.} \item{b}{More specific parameters controlling the embedding. If \code{NULL} these values are set automatically as determined by \code{min_dist} and \code{spread}.} \item{method}{Cost function to optimize. One of: \itemize{ \item{\code{"umap"}}. The UMAP method of McInnes and co-workers (2018). \item{\code{"tumap"}}. UMAP with the \code{a} and \code{b} parameters fixed to 1. \item{\code{"largevis"}}. The LargeVis method of Tang and co-workers (2016). }} \item{approx_pow}{If \code{TRUE}, use an approximation to the power function in the UMAP gradient, from \url{https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/}.} \item{pcg_rand}{If \code{TRUE}, use the PCG random number generator (O'Neill, 2014) during optimization. Otherwise, use the faster (but probably less statistically good) Tausworthe "taus88" generator. The default is \code{TRUE}. 
This parameter has been superseded by \code{rng_type} -- if both are set, \code{rng_type} takes precedence.} \item{fast_sgd}{If \code{TRUE}, then the following combination of parameters is set: \code{pcg_rand = FALSE}, \code{n_sgd_threads = "auto"} and \code{approx_pow = TRUE}. The default is \code{FALSE}. Setting this to \code{TRUE} will speed up the stochastic optimization phase, but give a potentially less accurate embedding, one which will not be exactly reproducible even with a fixed seed. For visualization, \code{fast_sgd = TRUE} will give perfectly good results. For more generic dimensionality reduction, it's safer to leave \code{fast_sgd = FALSE}. If \code{fast_sgd = TRUE}, then user-supplied values of \code{pcg_rand}, \code{n_sgd_threads}, and \code{approx_pow} are ignored.} \item{n_sgd_threads}{Number of threads to use during stochastic gradient descent. If set to > 1, then be aware that if \code{batch = FALSE}, results will \emph{not} be reproducible, even if \code{set.seed} is called with a fixed seed before running. If set to \code{"auto"} then half the number of concurrent threads supported by the system will be used.} \item{grain_size}{The minimum amount of work to do on each thread. If this value is set high enough, then less than \code{n_threads} or \code{n_sgd_threads} will be used for processing, which might give a performance improvement if the overhead of thread management and context switching was outweighing the improvement due to concurrent processing. This should be left at default (\code{1}) and work will be spread evenly over all the threads specified.} \item{verbose}{If \code{TRUE}, log details to the console.} \item{batch}{If \code{TRUE}, then embedding coordinates are updated at the end of each epoch rather than during the epoch. In batch mode, results are reproducible with a fixed random seed even with \code{n_sgd_threads > 1}, at the cost of a slightly higher memory use. You may also have to modify \code{learning_rate} and increase \code{n_epochs}, so whether this provides a speed increase over the single-threaded optimization is likely to be dataset and hardware-dependent.} \item{opt_args}{A list of optimizer parameters, used when \code{batch = TRUE}. The default optimization method used is Adam (Kingma and Ba, 2014). \itemize{ \item \code{method} The optimization method to use. Either \code{"adam"} or \code{"sgd"} (stochastic gradient descent). Default: \code{"adam"}. \item \code{beta1} (Adam only). The weighting parameter for the exponential moving average of the first moment estimator. Effectively the momentum parameter. Should be a floating point value between 0 and 1. Higher values can smooth oscillatory updates in poorly-conditioned situations and may allow for a larger \code{learning_rate} to be specified, but too high can cause divergence. Default: \code{0.5}. \item \code{beta2} (Adam only). The weighting parameter for the exponential moving average of the uncentered second moment estimator. Should be a floating point value between 0 and 1. Controls the degree of adaptivity in the step-size. Higher values put more weight on previous time steps. Default: \code{0.9}. \item \code{eps} (Adam only). Intended to be a small value to prevent division by zero, but in practice can also affect convergence due to its interaction with \code{beta2}. Higher values reduce the effect of the step-size adaptivity and bring the behavior closer to stochastic gradient descent with momentum. Typical values are between 1e-8 and 1e-3. Default: \code{1e-7}. 
\item \code{alpha} The initial learning rate. Default: the value of the \code{learning_rate} parameter. }} \item{epoch_callback}{A function which will be invoked at the end of every epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where: \itemize{ \item \code{epoch} The current epoch number (between \code{1} and \code{n_epochs}). \item \code{n_epochs} Number of epochs to use during the optimization of the embedded coordinates. \item \code{coords} The embedded coordinates as of the end of the current epoch, as a matrix with dimensions (N, \code{n_components}). }} \item{pca_method}{Method to carry out any PCA dimensionality reduction when the \code{pca} parameter is specified. Allowed values are: \itemize{ \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the \href{https://cran.r-project.org/package=irlba}{irlba} package. \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from the \href{https://cran.r-project.org/package=irlba}{irlba} package. This is likely to give much faster but potentially less accurate results than using \code{"irlba"}. For the purposes of nearest neighbor calculation and coordinates initialization, any loss of accuracy doesn't seem to matter much. \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}} from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr} package. The SVD methods used in \code{bigstatsr} may be faster on systems without access to efficient linear algebra libraries (e.g. Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of uwot: if you choose to use this package for PCA, you \emph{must} install it yourself. \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is likely to be slow for all but the smallest datasets. \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than 50\% of the full set of singular vectors would be calculated, in which case \code{"svd"} is used. }} \item{binary_edge_weights}{If \code{TRUE} then edge weights in the input graph are treated as binary (0/1) rather than real valued.} \item{rng_type}{The type of random number generator to use during optimization. One of: \itemize{ \item{\code{"pcg"}}. Use the PCG random number generator (O'Neill, 2014). \item{\code{"tausworthe"}}. Use the Tausworthe "taus88" generator. \item{\code{"deterministic"}}. Use a deterministic number generator. This isn't actually random, but may provide enough variation in the negative sampling to give a good embedding and can provide a noticeable speed-up. } For backwards compatibility, by default this is unset and the choice of \code{pcg_rand} is used (making "pcg" the effective default).} } \value{ A matrix of optimized coordinates. } \description{ Carry out dimensionality reduction on an input graph, where the distances in the low dimensional space attempt to reproduce the neighbor relations in the input data. By default, the cost function used to optimize the output coordinates is that of the Uniform Manifold Approximation and Projection (UMAP) method (McInnes et al., 2018), but the approach from LargeVis (Tang et al., 2016) can also be used. This function can be used to produce a low dimensional representation of the graph produced by \code{\link{similarity_graph}}. 
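A minimal sketch of the batch-mode optimization described under \code{batch} and \code{opt_args} (the parameter values are arbitrary illustrations): \preformatted{
iris30 <- iris[c(1:10, 51:60, 101:110), ]
sg <- similarity_graph(iris30[, 1:4], n_neighbors = 5)
set.seed(42)
# reproducible even with multiple SGD threads because batch = TRUE
emb <- optimize_graph_layout(sg, batch = TRUE, n_sgd_threads = 2,
                             opt_args = list(beta1 = 0.8, alpha = 1))
}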
} \examples{ iris30 <- iris[c(1:10, 51:60, 101:110), ] # return a 30 x 30 sparse matrix with similarity data based on 10 nearest # neighbors per item iris30_sim_graph <- similarity_graph(iris30, n_neighbors = 10) # produce 2D coordinates replicating the neighbor relations in the similarity # graph set.seed(42) iris30_opt <- optimize_graph_layout(iris30_sim_graph, X = iris30) # the above two steps are the same as: # set.seed(42); iris_umap <- umap(iris30, n_neighbors = 10) } \references{ Kingma, D. P., & Ba, J. (2014). Adam: A method for stochastic optimization. \emph{arXiv preprint} \emph{arXiv}:1412.6980. \url{https://arxiv.org/abs/1412.6980} McInnes, L., Healy, J., & Melville, J. (2018). UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction \emph{arXiv preprint} \emph{arXiv}:1802.03426. \url{https://arxiv.org/abs/1802.03426} O'Neill, M. E. (2014). \emph{PCG: A family of simple fast space-efficient statistically good algorithms for random number generation} (Report No. HMC-CS-2014-0905). Harvey Mudd College. Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). Visualizing large-scale and high-dimensional data. In \emph{Proceedings of the 25th International Conference on World Wide Web} (pp. 287-297). International World Wide Web Conferences Steering Committee. \url{https://arxiv.org/abs/1602.00370} } uwot/man/umap.Rd0000644000176200001440000012076414735021202013260 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/uwot.R \name{umap} \alias{umap} \title{Dimensionality Reduction with UMAP} \usage{ umap( X, n_neighbors = 15, n_components = 2, metric = "euclidean", n_epochs = NULL, learning_rate = 1, scale = FALSE, init = "spectral", init_sdev = NULL, spread = 1, min_dist = 0.01, set_op_mix_ratio = 1, local_connectivity = 1, bandwidth = 1, repulsion_strength = 1, negative_sample_rate = 5, a = NULL, b = NULL, nn_method = NULL, n_trees = 50, search_k = 2 * n_neighbors * n_trees, approx_pow = FALSE, y = NULL, target_n_neighbors = n_neighbors, target_metric = "euclidean", target_weight = 0.5, pca = NULL, pca_center = TRUE, pcg_rand = TRUE, fast_sgd = FALSE, ret_model = FALSE, ret_nn = FALSE, ret_extra = c(), n_threads = NULL, n_sgd_threads = 0, grain_size = 1, tmpdir = tempdir(), verbose = getOption("verbose", TRUE), batch = FALSE, opt_args = NULL, epoch_callback = NULL, pca_method = NULL, binary_edge_weights = FALSE, dens_scale = NULL, seed = NULL, nn_args = list(), rng_type = NULL ) } \arguments{ \item{X}{Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}}, \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}. Matrices and data frames should contain one observation per row. Data frames will have any non-numeric columns removed, although factor columns will be used if explicitly included via \code{metric} (see the help for \code{metric} for details). A sparse matrix is interpreted as a distance matrix, and is assumed to be symmetric, so you can also pass in an explicitly upper or lower triangular sparse matrix to save storage. There must be at least \code{n_neighbors} non-zero distances for each row. Both implicit and explicit zero entries are ignored. Set zero distances you want to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}). 
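A minimal sketch of the \code{\link[stats]{dist}} input option (the subset is chosen arbitrarily): \preformatted{
iris30 <- iris[c(1:10, 51:60, 101:110), ]
emb <- umap(dist(iris30[, 1:4]), n_neighbors = 5)
}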
\code{X} can also be \code{NULL} if pre-computed nearest neighbor data is passed to \code{nn_method}, and \code{init} is not \code{"spca"} or \code{"pca"}.} \item{n_neighbors}{The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation. Larger values result in more global views of the manifold, while smaller values result in more local data being preserved. In general values should be in the range \code{2} to \code{100}.} \item{n_components}{The dimension of the space to embed into. This defaults to \code{2} to provide easy visualization, but can reasonably be set to any integer value in the range \code{2} to \code{100}.} \item{metric}{Type of distance metric to use to find nearest neighbors. For \code{nn_method = "annoy"} this can be one of: \itemize{ \item \code{"euclidean"} (the default) \item \code{"cosine"} \item \code{"manhattan"} \item \code{"hamming"} \item \code{"correlation"} (a distance based on the Pearson correlation) \item \code{"categorical"} (see below) } For \code{nn_method = "hnsw"} this can be one of: \itemize{ \item \code{"euclidean"} \item \code{"cosine"} \item \code{"correlation"} } If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is installed and \code{nn_method = "nndescent"} is specified then many more metrics are available, including: \itemize{ \item \code{"braycurtis"} \item \code{"canberra"} \item \code{"chebyshev"} \item \code{"dice"} \item \code{"hamming"} \item \code{"hellinger"} \item \code{"jaccard"} \item \code{"jensenshannon"} \item \code{"kulsinski"} \item \code{"rogerstanimoto"} \item \code{"russellrao"} \item \code{"sokalmichener"} \item \code{"sokalsneath"} \item \code{"spearmanr"} \item \code{"symmetrickl"} \item \code{"tsss"} \item \code{"yule"} } For more details see the package documentation of \code{rnndescent}. For \code{nn_method = "fnn"}, the distance metric is always "euclidean". If \code{X} is a data frame or matrix, then multiple metrics can be specified, by passing a list to this argument, where the name of each item in the list is one of the metric names above. The value of each list item should be a vector giving the names or integer ids of the columns to be included in a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}. Each metric calculation results in a separate fuzzy simplicial set, which are intersected together to produce the final set. Metric names can be repeated. Because non-numeric columns are removed from the data frame, it is safer to use column names than integer ids. Factor columns can also be used by specifying the metric name \code{"categorical"}. Factor columns are treated differently from numeric columns and although multiple factor columns can be specified in a vector, each factor column specified is processed individually. If you specify a non-factor column, it will be coerced to a factor. For a given data block, you may override the \code{pca} and \code{pca_center} arguments for that block, by providing a list with one unnamed item containing the column names or ids, and then any of the \code{pca} or \code{pca_center} overrides as named items, e.g. \code{metric = list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. 
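A minimal sketch of the multiple-metric syntax just described (the column groupings are arbitrary; names are safer than integer ids): \preformatted{
emb <- umap(iris, n_neighbors = 15,
            metric = list(euclidean = c("Sepal.Length", "Sepal.Width"),
                          manhattan = c("Petal.Length", "Petal.Width"),
                          categorical = "Species"))
}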
This exists to allow mixed binary and real-valued data to be included and to have PCA applied to both, but with centering applied only to the real-valued data (it is typical not to apply centering to binary data before PCA is applied).} \item{n_epochs}{Number of epochs to use during the optimization of the embedded coordinates. By default, this value is set to \code{500} for datasets containing 10,000 vertices or fewer, and \code{200} otherwise. If \code{n_epochs = 0}, then coordinates determined by \code{"init"} will be returned.} \item{learning_rate}{Initial learning rate used in optimization of the coordinates.} \item{scale}{Scaling to apply to \code{X} if it is a data frame or matrix: \itemize{ \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling. \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to zero mean and variance 1. \item{\code{"maxabs"}} Center each column to mean 0, then divide each element by the maximum absolute value over the entire matrix. \item{\code{"range"}} Range scale the entire matrix, so the smallest element is 0 and the largest is 1. \item{\code{"colrange"}} Scale each column in the range (0,1). } For UMAP, the default is \code{"none"}.} \item{init}{Type of initialization for the coordinates. Options are: \itemize{ \item \code{"spectral"} Spectral embedding using the normalized Laplacian of the fuzzy 1-skeleton, with Gaussian noise added. \item \code{"normlaplacian"}. Spectral embedding using the normalized Laplacian of the fuzzy 1-skeleton, without noise. \item \code{"random"}. Coordinates assigned using a uniform random distribution between -10 and 10. \item \code{"lvrandom"}. Coordinates assigned using a Gaussian distribution with standard deviation 1e-4, as used in LargeVis (Tang et al., 2016) and t-SNE. \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap (Belkin and Niyogi, 2002). \item \code{"pca"}. The first two principal components from PCA of \code{X} if \code{X} is a data frame, and from a 2-dimensional classical MDS if \code{X} is of class \code{"dist"}. \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled so the standard deviation is 1e-4, to give a distribution similar to that used in t-SNE. This is an alias for \code{init = "pca", init_sdev = 1e-4}. \item \code{"agspectral"} An "approximate global" modification of \code{"spectral"} which sets all edges in the graph to a value of 1, and then sets a random number of edges (\code{negative_sample_rate} edges per vertex) to 0.1, to approximate the effect of non-local affinities. \item A matrix of initial coordinates. } For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"}, \code{"laplacian"}, \code{"agspectral"}), if more than one connected component is identified, no spectral initialization is attempted. Instead, a PCA-based initialization is attempted. If \code{verbose = TRUE} the number of connected components is logged to the console. The existence of multiple connected components implies that a global view of the data cannot be attained with this initialization. Increasing the value of \code{n_neighbors} may help.} \item{init_sdev}{If non-\code{NULL}, scales each dimension of the initialized coordinates (including any user-supplied matrix) to this standard deviation. By default no scaling is carried out, except when \code{init = "spca"}, in which case the value is \code{0.0001}. Scaling the input may help if the unscaled versions result in initial coordinates with large inter-point distances or outliers. 
This usually results in small gradients during optimization and very little progress being made to the layout. Shrinking the initial embedding by rescaling can help under these circumstances. Scaling the result of \code{init = "pca"} is usually recommended, hence \code{init = "spca"} is provided as an alias for \code{init = "pca", init_sdev = 1e-4}; but for the spectral initializations the scaled versions usually aren't necessary unless you are using a large value of \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). For compatibility with recent versions of the Python UMAP package, if you are using \code{init = "spectral"}, then you should also set \code{init_sdev = "range"}, which will range scale each of the columns containing the initial data between 0-10. This is not set by default to maintain backwards compatibility with previous versions of uwot.} \item{spread}{The effective scale of embedded points. In combination with \code{min_dist}, this determines how clustered/clumped the embedded points are.} \item{min_dist}{The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together, while larger values will result in a more even dispersal of points. The value should be set relative to the \code{spread} value, which determines the scale at which embedded points will be spread out.} \item{set_op_mix_ratio}{Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial set. Both fuzzy set operations use the product t-norm. The value of this parameter should be between \code{0.0} and \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while \code{0.0} will use a pure fuzzy intersection.} \item{local_connectivity}{The local connectivity required -- i.e. the number of nearest neighbors that should be assumed to be connected at a local level. The higher this value the more connected the manifold becomes locally. In practice this should be not more than the local intrinsic dimension of the manifold.} \item{bandwidth}{The effective bandwidth of the kernel if we view the algorithm as similar to Laplacian Eigenmaps. Larger values induce more connectivity and a more global view of the data, smaller values concentrate more locally.} \item{repulsion_strength}{Weighting applied to negative samples in low dimensional embedding optimization. Values higher than one will result in greater weight being given to negative samples.} \item{negative_sample_rate}{The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low dimensional embedding.} \item{a}{More specific parameters controlling the embedding. If \code{NULL} these values are set automatically as determined by \code{min_dist} and \code{spread}.} \item{b}{More specific parameters controlling the embedding. If \code{NULL} these values are set automatically as determined by \code{min_dist} and \code{spread}.} \item{nn_method}{Method for finding nearest neighbors. Options are: \itemize{ \item \code{"fnn"}. Use exact nearest neighbors via the \href{https://cran.r-project.org/package=FNN}{FNN} package. \item \code{"annoy"} Use approximate nearest neighbors via the \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package. 
\item \code{"hnsw"} Use approximate nearest neighbors with the Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin, 2018) via the \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package. \code{RcppHNSW} is not a dependency of this package: this option is only available if you have installed \code{RcppHNSW} yourself. Also, HNSW only supports the following arguments for \code{metric} and \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and \code{"correlation"}. \item \code{"nndescent"} Use approximate nearest neighbors with the Nearest Neighbor Descent method (Dong et al., 2011) via the \href{https://cran.r-project.org/package=rnndescent}{rnndescent} package. \code{rnndescent} is not a dependency of this package: this option is only available if you have installed \code{rnndescent} yourself. } By default, if \code{X} has less than 4,096 vertices, the exact nearest neighbors are found. Otherwise, approximate nearest neighbors are used. You may also pass pre-calculated nearest neighbor data to this argument. It must be one of two formats, either a list consisting of two elements: \itemize{ \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix containing the integer indexes of the nearest neighbors in \code{X}. Each vertex is considered to be its own nearest neighbor, i.e. \code{idx[, 1] == 1:n_vertices}. \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix containing the distances of the nearest neighbors. } or a sparse distance matrix of type \code{dgCMatrix}, with dimensions \code{n_vertices x n_vertices}. Distances should be arranged by column, i.e. a non-zero entry in row \code{j} of the \code{i}th column indicates that the \code{j}th observation in \code{X} is a nearest neighbor of the \code{i}th observation with the distance given by the value of that element. The \code{n_neighbors} parameter is ignored when using precomputed nearest neighbor data. If using the sparse distance matrix input, each column can contain a different number of neighbors.} \item{n_trees}{Number of trees to build when constructing the nearest neighbor index. The more trees specified, the larger the index, but the better the results. With \code{search_k}, determines the accuracy of the Annoy nearest neighbor search. Only used if the \code{nn_method} is \code{"annoy"}. Sensible values are between \code{10} to \code{100}.} \item{search_k}{Number of nodes to search during the neighbor retrieval. The larger k, the more the accurate results, but the longer the search takes. With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor search. Only used if the \code{nn_method} is \code{"annoy"}.} \item{approx_pow}{If \code{TRUE}, use an approximation to the power function in the UMAP gradient, from \url{https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/}. Ignored if \code{dens_scale} is non-\code{NULL}.} \item{y}{Optional target data for supervised dimension reduction. Can be a vector, matrix or data frame. Use the \code{target_metric} parameter to specify the metrics to use, using the same syntax as \code{metric}. Usually either a single numeric or factor column is used, but more complex formats are possible. The following types are allowed: \itemize{ \item Factor columns with the same length as \code{X}. \code{NA} is allowed for any observation with an unknown level, in which case UMAP operates as a form of semi-supervised learning. Each column is treated separately. \item Numeric data. 
\code{NA} is \emph{not} allowed in this case. Use the parameter \code{target_n_neighbors} to set the number of neighbors used with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors, numeric columns are grouped into one block unless \code{target_metric} specifies otherwise. For example, if you wish columns \code{a} and \code{b} to be treated separately, specify \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise, the data will be effectively treated as a matrix with two columns. \item Nearest neighbor data, consisting of a list of two matrices, \code{idx} and \code{dist}. These represent the precalculated nearest neighbor indices and distances, respectively. This is the same format as that expected for precalculated data in \code{nn_method}. This format assumes that the underlying data was a numeric vector. Any user-supplied value of the \code{target_n_neighbors} parameter is ignored in this case, because the number of columns in the matrices is used for the value. Multiple nearest neighbor data using different metrics can be supplied by passing a list of these lists. } Unlike \code{X}, all factor columns included in \code{y} are automatically used.} \item{target_n_neighbors}{Number of nearest neighbors to use to construct the target simplicial set. Default value is \code{n_neighbors}. Applies only if \code{y} is non-\code{NULL} and \code{numeric}.} \item{target_metric}{The metric used to measure distance for \code{y} if using supervised dimension reduction. Used only if \code{y} is numeric.} \item{target_weight}{Weighting factor between data topology and target topology. A value of 0.0 weights entirely on data, a value of 1.0 weights entirely on target. The default of 0.5 balances the weighting equally between data and target. Only applies if \code{y} is non-\code{NULL}.} \item{pca}{If set to a positive integer value, reduce data to this number of columns using PCA. This is not applied if the distance \code{metric} is \code{"hamming"}, or if the dimensions of the data are not larger than the number specified (i.e. the number of rows and columns must both be larger than the value of this parameter for PCA to be applied). If you have > 100 columns in a data frame or matrix, reducing the number of columns in this way may substantially increase the performance of the nearest neighbor search at the cost of a potential decrease in accuracy. In many t-SNE applications, a value of 50 is recommended, although there's no guarantee that this is appropriate for all settings.} \item{pca_center}{If \code{TRUE}, center the columns of \code{X} before carrying out PCA. For binary data, it's recommended to set this to \code{FALSE}.} \item{pcg_rand}{If \code{TRUE}, use the PCG random number generator (O'Neill, 2014) during optimization. Otherwise, use the faster (but probably less statistically good) Tausworthe "taus88" generator. The default is \code{TRUE}. This parameter has been superseded by \code{rng_type} -- if both are set, \code{rng_type} takes precedence.} \item{fast_sgd}{If \code{TRUE}, then the following combination of parameters is set: \code{pcg_rand = TRUE}, \code{n_sgd_threads = "auto"} and \code{approx_pow = TRUE}. The default is \code{FALSE}. Setting this to \code{TRUE} will speed up the stochastic optimization phase, but give a potentially less accurate embedding, one which will not be exactly reproducible even with a fixed seed. For visualization, \code{fast_sgd = TRUE} will give perfectly good results. For more generic dimensionality reduction, it's safer to leave \code{fast_sgd = FALSE}.
If \code{fast_sgd = TRUE}, then user-supplied values of \code{pcg_rand}, \code{n_sgd_threads}, and \code{approx_pow} are ignored.} \item{ret_model}{If \code{TRUE}, then return extra data that can be used to add new data to an existing embedding via \code{\link{umap_transform}}. The embedded coordinates are returned as the list item \code{embedding}. If \code{FALSE}, just return the coordinates. This parameter can be used in conjunction with \code{ret_nn} and \code{ret_extra}. Note that some settings are incompatible with the production of a UMAP model: external neighbor data (passed via a list to \code{nn_method}), and factor columns that were included via the \code{metric} parameter. In the latter case, the model produced is based only on the numeric data. A transformation using new data is possible, but the factor columns in the new data are ignored. Note that setting \code{ret_model = TRUE} forces the use of the approximate nearest neighbors method. Because small datasets would otherwise use exact nearest neighbor calculations, setting \code{ret_model = TRUE} means that different results may be returned for small datasets in terms of both the returned nearest neighbors (if requested) and the final embedded coordinates, compared to \code{ret_model = FALSE}, even if the random number seed is fixed. To avoid this, explicitly set \code{nn_method = "annoy"} in the \code{ret_model = FALSE} case.} \item{ret_nn}{If \code{TRUE}, then in addition to the embedding, also return nearest neighbor data that can be used as input to \code{nn_method} to avoid the overhead of repeatedly calculating the nearest neighbors when manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs}, \code{init}). See the "Value" section for the names of the list items. If \code{FALSE}, just return the coordinates. Note that the nearest neighbors could be sensitive to data scaling, so be wary of reusing nearest neighbor data if modifying the \code{scale} parameter. This parameter can be used in conjunction with \code{ret_model} and \code{ret_extra}.} \item{ret_extra}{A vector indicating what extra data to return. May contain any combination of the following strings: \itemize{ \item \code{"model"} Same as setting \code{ret_model = TRUE}. \item \code{"nn"} Same as setting \code{ret_nn = TRUE}. \item \code{"fgraph"} the high dimensional fuzzy graph (i.e. the fuzzy simplicial set of the merged local views of the input data). The graph is returned as a sparse symmetric N x N matrix of class \link[Matrix]{dgCMatrix-class}, where a non-zero entry (i, j) gives the membership strength of the edge connecting vertex i and vertex j. This can be considered analogous to the input probability (or similarity or affinity) used in t-SNE and LargeVis. Note that the graph is further sparsified by removing edges with sufficiently low membership strength that they would not be sampled by the probabilistic edge sampling employed for optimization and therefore the number of non-zero elements in the matrix is dependent on \code{n_epochs}. If you are only interested in the fuzzy input graph (e.g. for clustering), setting \code{n_epochs = 0} will avoid any further sparsifying. Be aware that setting \code{binary_edge_weights = TRUE} will affect this graph (all non-zero edge weights will be 1). \item \code{"sigma"} the normalization value for each observation in the dataset when constructing the smoothed distances to each of its neighbors.
This gives some sense of the local density of each observation in the high dimensional space: higher values of \code{sigma} indicate a higher dispersion or lower density. }} \item{n_threads}{Number of threads to use (except during stochastic gradient descent). Default is half the number of concurrent threads supported by the system. For nearest neighbor search, only applies if \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index will be temporarily written to disk in the location determined by \code{\link[base]{tempfile}}.} \item{n_sgd_threads}{Number of threads to use during stochastic gradient descent. If set to > 1, then be aware that if \code{batch = FALSE}, results will \emph{not} be reproducible, even if \code{set.seed} is called with a fixed seed before running. Set to \code{"auto"} to use the same value as \code{n_threads}.} \item{grain_size}{The minimum amount of work to do on each thread. If this value is set high enough, then fewer threads than \code{n_threads} or \code{n_sgd_threads} will be used for processing, which might give a performance improvement if the overhead of thread management and context switching was outweighing the improvement due to concurrent processing. This should be left at the default (\code{1}), in which case work will be spread evenly over all the threads specified.} \item{tmpdir}{Temporary directory to store nearest neighbor indexes during nearest neighbor search. Default is \code{\link{tempdir}}. The index is only written to disk if \code{n_threads > 1} and \code{nn_method = "annoy"}; otherwise, this parameter is ignored.} \item{verbose}{If \code{TRUE}, log details to the console.} \item{batch}{If \code{TRUE}, then embedding coordinates are updated at the end of each epoch rather than during the epoch. In batch mode, results are reproducible with a fixed random seed even with \code{n_sgd_threads > 1}, at the cost of a slightly higher memory use. You may also have to modify \code{learning_rate} and increase \code{n_epochs}, so whether this provides a speed increase over the single-threaded optimization is likely to be dataset and hardware-dependent.} \item{opt_args}{A list of optimizer parameters, used when \code{batch = TRUE}. The default optimization method used is Adam (Kingma and Ba, 2014). \itemize{ \item \code{method} The optimization method to use. Either \code{"adam"} or \code{"sgd"} (stochastic gradient descent). Default: \code{"adam"}. \item \code{beta1} (Adam only). The weighting parameter for the exponential moving average of the first moment estimator. Effectively the momentum parameter. Should be a floating point value between 0 and 1. Higher values can smooth oscillatory updates in poorly-conditioned situations and may allow for a larger \code{learning_rate} to be specified, but too high a value can cause divergence. Default: \code{0.5}. \item \code{beta2} (Adam only). The weighting parameter for the exponential moving average of the uncentered second moment estimator. Should be a floating point value between 0 and 1. Controls the degree of adaptivity in the step-size. Higher values put more weight on previous time steps. Default: \code{0.9}. \item \code{eps} (Adam only). Intended to be a small value to prevent division by zero, but in practice can also affect convergence due to its interaction with \code{beta2}. Higher values reduce the effect of the step-size adaptivity and bring the behavior closer to stochastic gradient descent with momentum. Typical values are between 1e-8 and 1e-3. Default: \code{1e-7}.
\item \code{alpha} The initial learning rate. Default: the value of the \code{learning_rate} parameter. }} \item{epoch_callback}{A function which will be invoked at the end of every epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where: \itemize{ \item \code{epoch} The current epoch number (between \code{1} and \code{n_epochs}). \item \code{n_epochs} Number of epochs to use during the optimization of the embedded coordinates. \item \code{coords} The embedded coordinates as of the end of the current epoch, as a matrix with dimensions (N, \code{n_components}). }} \item{pca_method}{Method to carry out any PCA dimensionality reduction when the \code{pca} parameter is specified. Allowed values are: \itemize{ \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the \href{https://cran.r-project.org/package=irlba}{irlba} package. \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from the \href{https://cran.r-project.org/package=irlba}{irlba} package. This is likely to give much faster but potentially less accurate results than using \code{"irlba"}. For the purposes of nearest neighbor calculation and coordinate initialization, any loss of accuracy doesn't seem to matter much. \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}} from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr} package. The SVD methods used in \code{bigstatsr} may be faster on systems without access to efficient linear algebra libraries (e.g. Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of uwot: if you choose to use this package for PCA, you \emph{must} install it yourself. \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is likely to be slow for all but the smallest datasets. \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than 50\% of the full set of singular vectors would be calculated, in which case \code{"svd"} is used. }} \item{binary_edge_weights}{If \code{TRUE} then edge weights in the input graph are treated as binary (0/1) rather than real valued. This affects the sampling frequency of neighbors and is the strategy used by the PaCMAP method (Wang and co-workers, 2021). Practical (Böhm and co-workers, 2020) and theoretical (Damrich and Hamprecht, 2021) work suggests this has little effect on UMAP's performance.} \item{dens_scale}{A value between 0 and 1. If > 0 then the output attempts to preserve relative local density around each observation. This uses an approximation to the densMAP method (Narayan and co-workers, 2021). The larger the value of \code{dens_scale}, the greater the range of output densities that will be used to map the input densities. This option is ignored if using multiple \code{metric} blocks.} \item{seed}{Integer seed to use to initialize the random number generator state. Combined with \code{n_sgd_threads = 1} or \code{batch = TRUE}, this should give consistent output across multiple runs on a given installation. Setting this value is equivalent to calling \code{\link[base]{set.seed}}, but it may be more convenient in some situations than having to call a separate function. The default is to not set a seed. If \code{ret_model = TRUE}, the seed will be stored in the output model and then used to set the seed inside \code{\link{umap_transform}}.} \item{nn_args}{A list containing additional arguments to pass to the nearest neighbor method.
For \code{nn_method = "annoy"}, you can specify \code{"n_trees"} and \code{"search_k"}, and these will override the \code{n_trees} and \code{search_k} parameters. For \code{nn_method = "hnsw"}, you may specify the following arguments: \itemize{ \item \code{M} The maximum number of neighbors to keep for each vertex. Reasonable values are \code{2} to \code{100}. Higher values give better recall at the cost of more memory. Default value is \code{16}. \item \code{ef_construction} A positive integer specifying the size of the dynamic list used during index construction. A higher value will provide better results at the cost of a longer time to build the index. Default is \code{200}. \item \code{ef} A positive integer specifying the size of the dynamic list used during search. This cannot be smaller than \code{n_neighbors} and cannot be higher than the number of items in the index. Default is \code{10}. } For \code{nn_method = "nndescent"}, you may specify the following arguments: \itemize{ \item \code{n_trees} The number of trees to use in a random projection forest to initialize the search. A larger number will give more accurate results at the cost of a longer computation time. The default of \code{NULL} means that the number is chosen based on the number of observations in \code{X}. \item \code{max_candidates} The number of potential neighbors to explore per iteration. By default, this is set to \code{n_neighbors} or \code{60}, whichever is smaller. A larger number will give more accurate results at the cost of a longer computation time. \item \code{n_iters} The number of iterations to run the search. A larger number will give more accurate results at the cost of a longer computation time. By default, this will be chosen based on the number of observations in \code{X}. You may also need to modify the convergence criterion \code{delta}. \item \code{delta} The minimum relative change in the neighbor graph allowed before early stopping. Should be a value between 0 and 1. The smaller the value, the less progress between iterations is required for the search to continue. The default value of \code{0.001} means that at least 0.1\% of the neighbor graph must be updated at each iteration, or the search will stop early. \item \code{init} How to initialize the nearest neighbor descent. By default this is set to \code{"tree"} and uses a random projection forest. If you set this to \code{"rand"}, then a random selection is used. Usually this is less accurate than using RP trees, but for high-dimensional cases, there may be little difference in the quality of the initialization and random initialization will be a lot faster. If you set this to \code{"rand"}, then the \code{n_trees} parameter is ignored. \item \code{pruning_degree_multiplier} The maximum number of edges per node to retain in the search graph, relative to \code{n_neighbors}. A larger value will give more accurate results at the cost of a longer computation time. Default is \code{1.5}. This parameter only affects neighbor search when transforming new data with \code{\link{umap_transform}}. \item \code{epsilon} Controls the degree of back-tracking when traversing the search graph. Setting this to \code{0.0} will do a greedy search with no back-tracking. A larger value will give more accurate results at the cost of a longer computation time. Default is \code{0.1}. This parameter only affects neighbor search when transforming new data with \code{\link{umap_transform}}. \item \code{max_search_fraction} Specifies the maximum fraction of the search graph to traverse.
By default, this is set to \code{1.0}, so the entire graph (i.e. all items in \code{X}) may be visited. You may want to set this to a smaller value if you have a very large dataset (in conjunction with \code{epsilon}) to avoid an inefficient exhaustive search of the data in \code{X}. This parameter only affects neighbor search when transforming new data with \code{\link{umap_transform}}. }} \item{rng_type}{The type of random number generator to use during optimization. One of: \itemize{ \item{\code{"pcg"}}. Use the PCG random number generator (O'Neill, 2014). \item{\code{"tausworthe"}}. Use the Tausworthe "taus88" generator. \item{\code{"deterministic"}}. Use a deterministic number generator. This isn't actually random, but may provide enough variation in the negative sampling to give a good embedding and can provide a noticeable speed-up. } For backwards compatibility, by default this is unset and the choice of \code{pcg_rand} is used (making "pcg" the effective default).} } \value{ A matrix of optimized coordinates, or: \itemize{ \item if \code{ret_model = TRUE} (or \code{ret_extra} contains \code{"model"}), returns a list containing extra information that can be used to add new data to an existing embedding via \code{\link{umap_transform}}. In this case, the coordinates are available in the list item \code{embedding}. \bold{NOTE}: The contents of the \code{model} list should \emph{not} be considered stable or part of the public API, and are purposely left undocumented. \item if \code{ret_nn = TRUE} (or \code{ret_extra} contains \code{"nn"}), returns the nearest neighbor data as a list called \code{nn}. This contains one list for each \code{metric} calculated, itself containing a matrix \code{idx} with the integer ids of the neighbors; and a matrix \code{dist} with the distances. The \code{nn} list (or a sub-list) can be used as input to the \code{nn_method} parameter. \item if \code{ret_extra} contains \code{"fgraph"}, returns the high dimensional fuzzy graph as a sparse matrix called \code{fgraph}, of type \link[Matrix]{dgCMatrix-class}. \item if \code{ret_extra} contains \code{"sigma"}, returns a vector of the smooth knn distance normalization terms for each observation as \code{"sigma"} and a vector \code{"rho"} containing the largest distance to the locally connected neighbors of each observation. \item if \code{ret_extra} contains \code{"localr"}, returns a vector of the estimated local radii, the sum of \code{"sigma"} and \code{"rho"}. } The returned list contains the combined data from any combination of specifying \code{ret_model}, \code{ret_nn} and \code{ret_extra}. } \description{ Carry out dimensionality reduction of a dataset using the Uniform Manifold Approximation and Projection (UMAP) method (McInnes et al., 2018). Some of the following help text is lifted verbatim from the Python reference implementation at \url{https://github.com/lmcinnes/umap}. 
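As a concrete illustration of how the \code{ret_extra}, \code{n_epochs} and \code{seed} parameters described above interact, here is a minimal sketch; it is not part of the package's own examples and the variable names are illustrative:

library(uwot)
# n_epochs = 0 skips layout optimization and the epoch-dependent
# sparsification of the fuzzy graph described under ret_extra;
# seed makes the run repeatable on a given installation
res <- umap(iris[, 1:4], n_neighbors = 15, n_epochs = 0,
            ret_extra = c("fgraph", "sigma"), seed = 42)
fgraph <- res$fgraph # sparse symmetric dgCMatrix of membership strengths
sigma <- res$sigma   # per-observation smooth knn normalization terms
rho <- res$rho       # largest distance to the locally connected neighbors

The resulting \code{fgraph} can then be handed to a graph clustering method, the use case mentioned under \code{ret_extra}.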
} \examples{ iris30 <- iris[c(1:10, 51:60, 101:110), ] # Non-numeric columns are automatically removed so you can pass data frames # directly in a lot of cases without pre-processing iris_umap <- umap(iris30, n_neighbors = 5, learning_rate = 0.5, init = "random", n_epochs = 20) # Faster approximation to the gradient and return nearest neighbors iris_umap <- umap(iris30, n_neighbors = 5, approx_pow = TRUE, ret_nn = TRUE, n_epochs = 20) # Can specify min_dist and spread parameters to control separation and size # of clusters and reuse nearest neighbors for efficiency nn <- iris_umap$nn iris_umap <- umap(iris30, n_neighbors = 5, min_dist = 1, spread = 5, nn_method = nn, n_epochs = 20) # Supervised dimension reduction using the 'Species' factor column iris_sumap <- umap(iris30, n_neighbors = 5, min_dist = 0.001, y = iris30$Species, target_weight = 0.5, n_epochs = 20 ) # Calculate Petal and Sepal neighbors separately (uses intersection of the resulting sets): iris_umap <- umap(iris30, metric = list( "euclidean" = c("Sepal.Length", "Sepal.Width"), "euclidean" = c("Petal.Length", "Petal.Width") )) } \references{ Belkin, M., & Niyogi, P. (2002). Laplacian eigenmaps and spectral techniques for embedding and clustering. In \emph{Advances in neural information processing systems} (pp. 585-591). \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf} Böhm, J. N., Berens, P., & Kobak, D. (2020). A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum. \emph{arXiv preprint} \emph{arXiv:2007.08902}. \url{https://arxiv.org/abs/2007.08902} Damrich, S., & Hamprecht, F. A. (2021). On UMAP's true loss function. \emph{Advances in Neural Information Processing Systems}, \emph{34}. \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html} Dong, W., Moses, C., & Li, K. (2011, March). Efficient k-nearest neighbor graph construction for generic similarity measures. In \emph{Proceedings of the 20th international conference on World Wide Web} (pp. 577-586). ACM. \doi{10.1145/1963405.1963487}. Kingma, D. P., & Ba, J. (2014). Adam: A method for stochastic optimization. \emph{arXiv preprint} \emph{arXiv}:1412.6980. \url{https://arxiv.org/abs/1412.6980} Malkov, Y. A., & Yashunin, D. A. (2018). Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs. \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836. McInnes, L., Healy, J., & Melville, J. (2018). UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction \emph{arXiv preprint} \emph{arXiv}:1802.03426. \url{https://arxiv.org/abs/1802.03426} Narayan, A., Berger, B., & Cho, H. (2021). Assessing single-cell transcriptomic variability through density-preserving data visualization. \emph{Nature biotechnology}, \emph{39}(6), 765-774. \doi{10.1038/s41587-020-00801-7} O'Neill, M. E. (2014). \emph{PCG: A family of simple fast space-efficient statistically good algorithms for random number generation} (Report No. HMC-CS-2014-0905). Harvey Mudd College. Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). Visualizing large-scale and high-dimensional data. In \emph{Proceedings of the 25th International Conference on World Wide Web} (pp. 287-297). International World Wide Web Conferences Steering Committee. \url{https://arxiv.org/abs/1602.00370} Van der Maaten, L., & Hinton, G. (2008). Visualizing data using t-SNE. 
\emph{Journal of Machine Learning Research}, \emph{9}, 2579-2605. \url{https://www.jmlr.org/papers/v9/vandermaaten08a.html} Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021). Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization. \emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73. \url{https://www.jmlr.org/papers/v22/20-1061.html} } uwot/man/load_uwot.Rd0000644000176200001440000000415514730166740014324 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/uwot.R \name{load_uwot} \alias{load_uwot} \title{Save or Load a Model} \usage{ load_uwot(file, verbose = FALSE) } \arguments{ \item{file}{name of the file where the model is to be saved or read from.} \item{verbose}{if \code{TRUE}, log information to the console.} } \value{ The model saved at \code{file}, for use with \code{\link{umap_transform}}. Additionally, it contains an extra item: \code{mod_dir}, which contains the path to the temporary working directory used during loading of the model. This directory cannot be removed until this model has been unloaded by using \code{\link{unload_uwot}}. } \description{ Functions to write a UMAP model to a file, and to restore it. } \examples{ library(RSpectra) iris_train <- iris[c(1:10, 51:60), ] iris_test <- iris[100:110, ] # create model model <- umap(iris_train, ret_model = TRUE, n_epochs = 20) # save without unloading: this leaves behind a temporary working directory model_file <- tempfile("iris_umap") model <- save_uwot(model, file = model_file) # The model can continue to be used test_embedding <- umap_transform(iris_test, model) # To manually unload the model from memory when finished and to clean up # the working directory (this doesn't touch your model file) unload_uwot(model) # At this point, model cannot be used with umap_transform, this would fail: # test_embedding2 <- umap_transform(iris_test, model) # restore the model: this also creates a temporary working directory model2 <- load_uwot(file = model_file) test_embedding2 <- umap_transform(iris_test, model2) # Unload and clean up the loaded model temp directory unload_uwot(model2) # clean up the model file unlink(model_file) # save with unloading: this deletes the temporary working directory but # doesn't allow the model to be re-used model3 <- umap(iris_train, ret_model = TRUE, n_epochs = 20) model_file3 <- tempfile("iris_umap") model3 <- save_uwot(model3, file = model_file3, unload = TRUE) } \seealso{ \code{\link{save_uwot}}, \code{\link{unload_uwot}} } uwot/man/figures/0000755000176200001440000000000014730166740013477 5ustar liggesusersuwot/man/figures/mnist-r.png0000644000176200001440000016445114730166740015605 0ustar liggesusers [binary PNG image data omitted]
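Supplementing the \code{load_uwot} example above, a small sketch (not part of the original examples) of inspecting the \code{mod_dir} item documented in that function's Value section; \code{model_file} and \code{iris_test} are the objects created in the example:

# mod_dir holds the temporary working directory created during loading;
# it cannot be removed until the model is unloaded
model2 <- load_uwot(file = model_file)
print(model2$mod_dir)
test_embedding2 <- umap_transform(iris_test, model2)
unload_uwot(model2) # allows mod_dir to be cleaned up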
Hx""^*("" Hx""^*("" Hx""^*(L?qV/>6% DZo| Gk2#|5"ÑGkt: D7 l 7;ȆZ^8{D TA fGN@;Al, đԝ"=UPd#9~x/LA͐"r[Kq'&ȯG &_(u9ұC8oC&_#'n6A;ArS# 1}; otherwise, this parameter is ignored.} \item{n_epochs}{Number of epochs to use during the optimization of the embedded coordinates. A value between \code{30 - 100} is a reasonable trade off between speed and thoroughness. By default, this value is set to one third the number of epochs used to build the \code{model}.} \item{n_threads}{Number of threads to use, (except during stochastic gradient descent). Default is half the number of concurrent threads supported by the system.} \item{n_sgd_threads}{Number of threads to use during stochastic gradient descent. If set to > 1, then be aware that if \code{batch = FALSE}, results will \emph{not} be reproducible, even if \code{set.seed} is called with a fixed seed before running. Set to \code{"auto"} to use the same value as \code{n_threads}.} \item{grain_size}{Minimum batch size for multithreading. If the number of items to process in a thread falls below this number, then no threads will be used. Used in conjunction with \code{n_threads} and \code{n_sgd_threads}.} \item{verbose}{If \code{TRUE}, log details to the console.} \item{init}{how to initialize the transformed coordinates. One of: \itemize{ \item \code{"weighted"} (The default). Use a weighted average of the coordinates of the nearest neighbors from the original embedding in \code{model}, where the weights used are the edge weights from the UMAP smoothed knn distances. Equivalent to \code{init_weighted = TRUE}. \item \code{"average"}. Use the mean average of the coordinates of the nearest neighbors from the original embedding in \code{model}. Equivalent to \code{init_weighted = FALSE}. \item A matrix of user-specified input coordinates, which must have dimensions the same as \code{(nrow(X), ncol(model$embedding))}. } This parameter should be used in preference to \code{init_weighted}.} \item{batch}{If \code{TRUE}, then embedding coordinates are updated at the end of each epoch rather than during the epoch. In batch mode, results are reproducible with a fixed random seed even with \code{n_sgd_threads > 1}, at the cost of a slightly higher memory use. You may also have to modify \code{learning_rate} and increase \code{n_epochs}, so whether this provides a speed increase over the single-threaded optimization is likely to be dataset and hardware-dependent. If \code{NULL}, the transform will use the value provided in the \code{model}, if available. Default: \code{FALSE}.} \item{learning_rate}{Initial learning rate used in optimization of the coordinates. This overrides the value associated with the \code{model}. This should be left unspecified under most circumstances.} \item{opt_args}{A list of optimizer parameters, used when \code{batch = TRUE}. The default optimization method used is Adam (Kingma and Ba, 2014). \itemize{ \item \code{method} The optimization method to use. Either \code{"adam"} or \code{"sgd"} (stochastic gradient descent). Default: \code{"adam"}. \item \code{beta1} (Adam only). The weighting parameter for the exponential moving average of the first moment estimator. Effectively the momentum parameter. Should be a floating point value between 0 and 1. Higher values can smooth oscillatory updates in poorly-conditioned situations and may allow for a larger \code{learning_rate} to be specified, but too high can cause divergence. Default: \code{0.5}. \item \code{beta2} (Adam only). 
The weighting parameter for the exponential moving average of the uncentered second moment estimator. Should be a floating point value between 0 and 1. Controls the degree of adaptivity in the step-size. Higher values put more weight on previous time steps. Default: \code{0.9}. \item \code{eps} (Adam only). Intended to be a small value to prevent division by zero, but in practice can also affect convergence due to its interaction with \code{beta2}. Higher values reduce the effect of the step-size adaptivity and bring the behavior closer to stochastic gradient descent with momentum. Typical values are between 1e-8 and 1e-3. Default: \code{1e-7}. \item \code{alpha} The initial learning rate. Default: the value of the \code{learning_rate} parameter. } If \code{NULL}, the transform will use the value provided in the \code{model}, if available.} \item{epoch_callback}{A function which will be invoked at the end of every epoch. Its signature should be: \code{(epoch, n_epochs, coords, fixed_coords)}, where: \itemize{ \item \code{epoch} The current epoch number (between \code{1} and \code{n_epochs}). \item \code{n_epochs} Number of epochs to use during the optimization of the embedded coordinates. \item \code{coords} The embedded coordinates as of the end of the current epoch, as a matrix with dimensions (N, \code{n_components}). \item \code{fixed_coords} The originally embedded coordinates from the \code{model}. These are fixed and do not change. A matrix with dimensions (Nmodel, \code{n_components}) where \code{Nmodel} is the number of observations in the original data. }} \item{ret_extra}{A vector indicating what extra data to return. May contain any combination of the following strings: \itemize{ \item \code{"fgraph"} the high dimensional fuzzy graph (i.e. the fuzzy simplicial set of the merged local views of the input data). The graph is returned as a sparse matrix of class \link[Matrix]{dgCMatrix-class} with dimensions \code{NX} x \code{Nmodel}, where \code{NX} is the number of items in the data to transform in \code{X}, and \code{Nmodel} is the number of items in the data used to build the UMAP \code{model}. A non-zero entry (i, j) gives the membership strength of the edge connecting the vertex representing the ith item in \code{X} to the jth item in the data used to build the \code{model}. Note that the graph is further sparsified by removing edges with sufficiently low membership strength that they would not be sampled by the probabilistic edge sampling employed for optimization and therefore the number of non-zero elements in the matrix is dependent on \code{n_epochs}. If you are only interested in the fuzzy input graph (e.g. for clustering), setting \code{n_epochs = 0} will avoid any further sparsifying. \item \code{"nn"} the nearest neighbor graph for \code{X} with respect to the observations in the \code{model}. The graph will be returned as a list of two items: \code{idx} a matrix of indices, with as many rows as there are items in \code{X} and as many columns as there are nearest neighbors to be computed (this value is determined by the \code{model}). The indices are those of the rows of the data used to build the \code{model}, so they're not necessarily of much use unless you have access to that data. The second item, \code{dist}, is a matrix of the equivalent distances, with the same dimensions as \code{idx}. }} \item{seed}{Integer seed to use to initialize the random number generator state.
Combined with \code{n_sgd_threads = 1} or \code{batch = TRUE}, this should give consistent output across multiple runs on a given installation. Setting this value is equivalent to calling \code{\link[base]{set.seed}}, but it may be more convenient in some situations than having to call a separate function. The default is to not set a seed, in which case this function uses the behavior specified by the supplied \code{model}: if the model specifies a seed, then the model seed will be used to seed the random number generator, and results will still be consistent (if \code{n_sgd_threads = 1}). If you want to force the seed to not be set, even if it is set in \code{model}, set \code{seed = FALSE}.} } \value{ A matrix of coordinates for \code{X} transformed into the space of the \code{model}, or if \code{ret_extra} is specified, a list containing: \itemize{ \item \code{embedding} the matrix of optimized coordinates. \item if \code{ret_extra} contains \code{"fgraph"}, an item of the same name containing the high-dimensional fuzzy graph as a sparse matrix, of type \link[Matrix]{dgCMatrix-class}. \item if \code{ret_extra} contains \code{"sigma"}, returns a vector of the smooth knn distance normalization terms for each observation as \code{"sigma"} and a vector \code{"rho"} containing the largest distance to the locally connected neighbors of each observation. \item if \code{ret_extra} contains \code{"localr"}, an item of the same name containing a vector of the estimated local radii, the sum of \code{"sigma"} and \code{"rho"}. \item if \code{ret_extra} contains \code{"nn"}, an item of the same name containing the nearest neighbors of each item in \code{X} (with respect to the items that created the \code{model}). } } \description{ Carry out an embedding of new data using an existing embedding. Requires using the result of calling \code{\link{umap}} or \code{\link{tumap}} with \code{ret_model = TRUE}. } \details{ Note that some settings are incompatible with the production of a UMAP model via \code{\link{umap}}: external neighbor data (passed via a list to the argument of the \code{nn_method} parameter), and factor columns that were included in the UMAP calculation via the \code{metric} parameter. In the latter case, the model produced is based only on the numeric data. A transformation is possible, but factor columns in the new data are ignored. } \examples{ iris_train <- iris[1:100, ] iris_test <- iris[101:150, ] # You must set ret_model = TRUE to return extra data needed iris_train_umap <- umap(iris_train, ret_model = TRUE) iris_test_umap <- umap_transform(iris_test, iris_train_umap) } uwot/man/unload_uwot.Rd0000644000176200001440000000401314730166740014660 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/uwot.R \name{unload_uwot} \alias{unload_uwot} \title{Unload a Model} \usage{ unload_uwot(model, cleanup = TRUE, verbose = FALSE) } \arguments{ \item{model}{a UMAP model created by \code{\link{umap}}.} \item{cleanup}{if \code{TRUE}, attempt to delete the temporary working directory that was used in either the save or load of the model.} \item{verbose}{if \code{TRUE}, log information to the console.} } \description{ Unloads the UMAP model. This prevents the model from being used with \code{\link{umap_transform}}, but allows the temporary working directory associated with saving or loading the model to be removed.
} \examples{ iris_train <- iris[c(1:10, 51:60), ] iris_test <- iris[100:110, ] # create model model <- umap(iris_train, ret_model = TRUE, n_epochs = 20) # save without unloading: this leaves behind a temporary working directory model_file <- tempfile("iris_umap") model <- save_uwot(model, file = model_file) # The model can continue to be used test_embedding <- umap_transform(iris_test, model) # To manually unload the model from memory when finished and to clean up # the working directory (this doesn't touch your model file) unload_uwot(model) # At this point, model cannot be used with umap_transform, this would fail: # test_embedding2 <- umap_transform(iris_test, model) # restore the model: this also creates a temporary working directory model2 <- load_uwot(file = model_file) test_embedding2 <- umap_transform(iris_test, model2) # Unload and clean up the loaded model temp directory unload_uwot(model2) # clean up the model file unlink(model_file) # save with unloading: this deletes the temporary working directory but # doesn't allow the model to be re-used model3 <- umap(iris_train, ret_model = TRUE, n_epochs = 20) model_file3 <- tempfile("iris_umap") model3 <- save_uwot(model3, file = model_file3, unload = TRUE) } \seealso{ \code{\link{save_uwot}}, \code{\link{load_uwot}} } uwot/man/save_uwot.Rd0000644000176200001440000000554614577210515014347 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/uwot.R \name{save_uwot} \alias{save_uwot} \title{Save or Load a Model} \usage{ save_uwot(model, file, unload = FALSE, verbose = FALSE) } \arguments{ \item{model}{a UMAP model created by \code{\link{umap}}.} \item{file}{name of the file where the model is to be saved or read from.} \item{unload}{if \code{TRUE}, unload all nearest neighbor indexes for the model. The \code{model} will no longer be valid for use in \code{\link{umap_transform}} and the temporary working directory used during model saving will be deleted. You will need to reload the model with \code{load_uwot} to use the model. If \code{FALSE}, then the model can be re-used without reloading, but you must manually unload the NN index when you are finished using it if you want to delete the temporary working directory. To unload manually, use \code{\link{unload_uwot}}. The absolute path of the working directory is found in the \code{mod_dir} item of the return value.}
} \examples{ iris_train <- iris[c(1:10, 51:60), ] iris_test <- iris[100:110, ] # create model model <- umap(iris_train, ret_model = TRUE, n_epochs = 20) # save without unloading: this leaves behind a temporary working directory model_file <- tempfile("iris_umap") model <- save_uwot(model, file = model_file) # The model can continue to be used test_embedding <- umap_transform(iris_test, model) # To manually unload the model from memory when finished and to clean up # the working directory (this doesn't touch your model file) unload_uwot(model) # At this point, model cannot be used with umap_transform, this would fail: # test_embedding2 <- umap_transform(iris_test, model) # restore the model: this also creates a temporary working directory model2 <- load_uwot(file = model_file) test_embedding2 <- umap_transform(iris_test, model2) # Unload and clean up the loaded model temp directory unload_uwot(model2) # clean up the model file unlink(model_file) # save with unloading: this deletes the temporary working directory but # doesn't allow the model to be re-used model3 <- umap(iris_train, ret_model = TRUE, n_epochs = 20) model_file3 <- tempfile("iris_umap") model3 <- save_uwot(model3, file = model_file3, unload = TRUE) } \seealso{ \code{\link{load_uwot}}, \code{\link{unload_uwot}} } uwot/man/lvish.Rd0000644000176200001440000007641614735021202013451 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/uwot.R \name{lvish} \alias{lvish} \title{Dimensionality Reduction with a LargeVis-like method} \usage{ lvish( X, perplexity = 50, n_neighbors = perplexity * 3, n_components = 2, metric = "euclidean", n_epochs = -1, learning_rate = 1, scale = "maxabs", init = "lvrandom", init_sdev = NULL, repulsion_strength = 7, negative_sample_rate = 5, nn_method = NULL, n_trees = 50, search_k = 2 * n_neighbors * n_trees, n_threads = NULL, n_sgd_threads = 0, grain_size = 1, kernel = "gauss", pca = NULL, pca_center = TRUE, pcg_rand = TRUE, fast_sgd = FALSE, ret_nn = FALSE, ret_extra = c(), tmpdir = tempdir(), verbose = getOption("verbose", TRUE), batch = FALSE, opt_args = NULL, epoch_callback = NULL, pca_method = NULL, binary_edge_weights = FALSE, nn_args = list(), rng_type = NULL ) } \arguments{ \item{X}{Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}}, \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}. Matrix and data frames should contain one observation per row. Data frames will have any non-numeric columns removed, although factor columns will be used if explicitly included via \code{metric} (see the help for \code{metric} for details). A sparse matrix is interpreted as a distance matrix, and is assumed to be symmetric, so you can also pass in an explicitly upper or lower triangular sparse matrix to save storage. There must be at least \code{n_neighbors} non-zero distances for each row. Both implicit and explicit zero entries are ignored. Set zero distances you want to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}). \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is passed to \code{nn_method}, and \code{init} is not \code{"spca"} or \code{"pca"}.} \item{perplexity}{Controls the size of the local neighborhood used for manifold approximation. This is the analogous to \code{n_neighbors} in \code{\link{umap}}. Change this, rather than \code{n_neighbors}.} \item{n_neighbors}{The number of neighbors to use when calculating the \code{perplexity}. 
Usually set to three times the value of the \code{perplexity}. Must be at least as large as \code{perplexity}.} \item{n_components}{The dimension of the space to embed into. This defaults to \code{2} to provide easy visualization, but can reasonably be set to any integer value in the range \code{2} to \code{100}.} \item{metric}{Type of distance metric to use to find nearest neighbors. For \code{nn_method = "annoy"} this can be one of: \itemize{ \item \code{"euclidean"} (the default) \item \code{"cosine"} \item \code{"manhattan"} \item \code{"hamming"} \item \code{"correlation"} (a distance based on the Pearson correlation) \item \code{"categorical"} (see below) } For \code{nn_method = "hnsw"} this can be one of: \itemize{ \item \code{"euclidean"} \item \code{"cosine"} \item \code{"correlation"} } If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is installed and \code{nn_method = "nndescent"} is specified then many more metrics are available, including: \itemize{ \item \code{"braycurtis"} \item \code{"canberra"} \item \code{"chebyshev"} \item \code{"dice"} \item \code{"hamming"} \item \code{"hellinger"} \item \code{"jaccard"} \item \code{"jensenshannon"} \item \code{"kulsinski"} \item \code{"rogerstanimoto"} \item \code{"russellrao"} \item \code{"sokalmichener"} \item \code{"sokalsneath"} \item \code{"spearmanr"} \item \code{"symmetrickl"} \item \code{"tsss"} \item \code{"yule"} } For more details see the package documentation of \code{rnndescent}. For \code{nn_method = "fnn"}, the distance metric is always \code{"euclidean"}. If \code{X} is a data frame or matrix, then multiple metrics can be specified, by passing a list to this argument, where the name of each item in the list is one of the metric names above. The value of each list item should be a vector giving the names or integer ids of the columns to be included in a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}. Each metric calculation results in a separate fuzzy simplicial set; these sets are intersected to produce the final set. Metric names can be repeated. Because non-numeric columns are removed from the data frame, it is safer to use column names than integer ids. Factor columns can also be used by specifying the metric name \code{"categorical"}. Factor columns are treated differently from numeric columns, and although multiple factor columns can be specified in a vector, each factor column specified is processed individually. If you specify a non-factor column, it will be coerced to a factor. For a given data block, you may override the \code{pca} and \code{pca_center} arguments for that block, by providing a list with one unnamed item containing the column names or ids, and then any of the \code{pca} or \code{pca_center} overrides as named items, e.g. \code{metric = list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This exists to allow mixed binary and real-valued data to be included and to have PCA applied to both, but with centering applied only to the real-valued data (it is typical not to apply centering to binary data before PCA is applied).} \item{n_epochs}{Number of epochs to use during the optimization of the embedded coordinates. The default is to calculate the number of epochs dynamically based on dataset size, to give the same number of edge samples as the LargeVis defaults. This is usually substantially larger than the UMAP defaults.
If \code{n_epochs = 0}, then coordinates determined by \code{"init"} will be returned.} \item{learning_rate}{Initial learning rate used in optimization of the coordinates.} \item{scale}{Scaling to apply to \code{X} if it is a data frame or matrix: \itemize{ \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling. \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to zero mean and variance 1. \item{\code{"maxabs"}} Center each column to mean 0, then divide each element by the maximum absolute value over the entire matrix. \item{\code{"range"}} Range scale the entire matrix, so the smallest element is 0 and the largest is 1. \item{\code{"colrange"}} Scale each column in the range (0,1). } For lvish, the default is \code{"maxabs"}, for consistency with LargeVis.} \item{init}{Type of initialization for the coordinates. Options are: \itemize{ \item \code{"spectral"} Spectral embedding using the normalized Laplacian of the fuzzy 1-skeleton, with Gaussian noise added. \item \code{"normlaplacian"}. Spectral embedding using the normalized Laplacian of the fuzzy 1-skeleton, without noise. \item \code{"random"}. Coordinates assigned using a uniform random distribution between -10 and 10. \item \code{"lvrandom"}. Coordinates assigned using a Gaussian distribution with standard deviation 1e-4, as used in LargeVis (Tang et al., 2016) and t-SNE. \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap (Belkin and Niyogi, 2002). \item \code{"pca"}. The first two principal components from PCA of \code{X} if \code{X} is a data frame, and from a 2-dimensional classical MDS if \code{X} is of class \code{"dist"}. \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled so the standard deviation is 1e-4, to give a distribution similar to that used in t-SNE and LargeVis. This is an alias for \code{init = "pca", init_sdev = 1e-4}. \item \code{"agspectral"} An "approximate global" modification of \code{"spectral"} which sets all edges in the graph to a value of 1, and then sets a random number of edges (\code{negative_sample_rate} edges per vertex) to 0.1, to approximate the effect of non-local affinities. \item A matrix of initial coordinates. } For spectral initializations (\code{"spectral"}, \code{"normlaplacian"}, \code{"laplacian"}, \code{"agspectral"}), if more than one connected component is identified, no spectral initialization is attempted. Instead, a PCA-based initialization is used. If \code{verbose = TRUE} the number of connected components is logged to the console. The existence of multiple connected components implies that a global view of the data cannot be attained with this initialization. Increasing the value of \code{n_neighbors} may help.} \item{init_sdev}{If non-\code{NULL}, scales each dimension of the initialized coordinates (including any user-supplied matrix) to this standard deviation. By default no scaling is carried out, except when \code{init = "spca"}, in which case the value is \code{0.0001}. Scaling the input may help if the unscaled versions result in initial coordinates with large inter-point distances or outliers. Such large distances usually result in small gradients during optimization and very little progress being made to the layout. Shrinking the initial embedding by rescaling can help under these circumstances.
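As a small sketch (again using \code{iris} purely for illustration): \preformatted{# shrink a PCA initialization to standard deviation 1e-4;
# this is the same as init = "spca"
emb <- lvish(iris, init = "pca", init_sdev = 1e-4)
}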
Scaling the result of \code{init = "pca"} is usually recommended, hence \code{init = "spca"} exists as an alias for \code{init = "pca", init_sdev = 1e-4}, but for the spectral initializations the scaled versions usually aren't necessary unless you are using a large value of \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). For compatibility with recent versions of the Python UMAP package, if you are using \code{init = "spectral"}, then you should also set \code{init_sdev = "range"}, which will range scale each of the columns containing the initial data between 0 and 10. This is not set by default to maintain backwards compatibility with previous versions of uwot.} \item{repulsion_strength}{Weighting applied to negative samples in low dimensional embedding optimization. Values higher than one will result in greater weight being given to negative samples.} \item{negative_sample_rate}{The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low dimensional embedding.} \item{nn_method}{Method for finding nearest neighbors. Options are: \itemize{ \item \code{"fnn"}. Use exact nearest neighbors via the \href{https://cran.r-project.org/package=FNN}{FNN} package. \item \code{"annoy"} Use approximate nearest neighbors via the \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package. \item \code{"hnsw"} Use approximate nearest neighbors with the Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin, 2018) via the \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package. \code{RcppHNSW} is not a dependency of this package: this option is only available if you have installed \code{RcppHNSW} yourself. Also, HNSW only supports the following arguments for \code{metric}: \code{"euclidean"}, \code{"cosine"} and \code{"correlation"}. \item \code{"nndescent"} Use approximate nearest neighbors with the Nearest Neighbor Descent method (Dong et al., 2011) via the \href{https://cran.r-project.org/package=rnndescent}{rnndescent} package. \code{rnndescent} is not a dependency of this package: this option is only available if you have installed \code{rnndescent} yourself. } By default, if \code{X} has fewer than 4,096 vertices, the exact nearest neighbors are found. Otherwise, approximate nearest neighbors are used. You may also pass precalculated nearest neighbor data to this argument. It must be a list consisting of two elements: \itemize{ \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix containing the integer indexes of the nearest neighbors in \code{X}. Each vertex is considered to be its own nearest neighbor, i.e. \code{idx[, 1] == 1:n_vertices}. \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix containing the distances of the nearest neighbors. } Multiple nearest neighbor data (e.g. from two different precomputed metrics) can be supplied by passing a list containing the nearest neighbor data lists as items. The \code{n_neighbors} parameter is ignored when using precomputed nearest neighbor data.} \item{n_trees}{Number of trees to build when constructing the nearest neighbor index. The more trees specified, the larger the index, but also the better the results. With \code{search_k}, determines the accuracy of the Annoy nearest neighbor search. Only used if the \code{nn_method} is \code{"annoy"}. Sensible values are between \code{10} and \code{100}.} \item{search_k}{Number of nodes to search during the neighbor retrieval. The larger \code{search_k}, the more accurate the results, but the longer the search takes.
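For instance (the values here are purely illustrative, not a recommendation): \preformatted{# with the default perplexity = 50, n_neighbors defaults to 150, so the
# default search_k is 2 * 150 * n_trees; doubling it searches more nodes
emb <- lvish(iris, n_trees = 50, search_k = 4 * 150 * 50)
}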
With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor search. Only used if the \code{nn_method} is \code{"annoy"}.} \item{n_threads}{Number of threads to use (except during stochastic gradient descent). Default is half the number of concurrent threads supported by the system. For nearest neighbor search, only applies if \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index will be temporarily written to disk in the location determined by \code{\link[base]{tempfile}}.} \item{n_sgd_threads}{Number of threads to use during stochastic gradient descent. If set to > 1, then be aware that if \code{batch = FALSE}, results will \emph{not} be reproducible, even if \code{set.seed} is called with a fixed seed before running. Set to \code{"auto"} to use the same value as \code{n_threads}.} \item{grain_size}{The minimum amount of work to do on each thread. If this value is set high enough, then fewer than \code{n_threads} or \code{n_sgd_threads} threads will be used for processing, which might give a performance improvement if the overhead of thread management and context switching was outweighing the improvement due to concurrent processing. This should be left at its default (\code{1}), so that work is spread evenly over all the threads specified.} \item{kernel}{Type of kernel function to create input probabilities. Can be one of \code{"gauss"} (the default) or \code{"knn"}. \code{"gauss"} uses the usual Gaussian weighted similarities. \code{"knn"} assigns equal probabilities to every edge in the nearest neighbor graph, and zero elsewhere, using \code{perplexity} nearest neighbors. The \code{n_neighbors} parameter is ignored in this case.} \item{pca}{If set to a positive integer value, reduce data to this number of columns using PCA. Not applied if the distance \code{metric} is \code{"hamming"}, or if the dimensions of the data are not larger than the number specified (i.e. the number of rows and columns must both be larger than the value of this parameter for PCA to be applied). If you have > 100 columns in a data frame or matrix, reducing the number of columns in this way may substantially increase the performance of the nearest neighbor search at the cost of a potential decrease in accuracy. In many t-SNE applications, a value of 50 is recommended, although there's no guarantee that this is appropriate for all settings.} \item{pca_center}{If \code{TRUE}, center the columns of \code{X} before carrying out PCA. For binary data, it's recommended to set this to \code{FALSE}.} \item{pcg_rand}{If \code{TRUE}, use the PCG random number generator (O'Neill, 2014) during optimization. Otherwise, use the faster (but probably less statistically good) Tausworthe "taus88" generator. The default is \code{TRUE}. This parameter has been superseded by \code{rng_type} -- if both are set, \code{rng_type} takes precedence.} \item{fast_sgd}{If \code{TRUE}, then the following combination of parameters is set: \code{pcg_rand = TRUE} and \code{n_sgd_threads = "auto"}. The default is \code{FALSE}. Setting this to \code{TRUE} will speed up the stochastic optimization phase, but give a potentially less accurate embedding, one which will not be exactly reproducible even with a fixed seed. For visualization, \code{fast_sgd = TRUE} will give perfectly good results. For more generic dimensionality reduction, it's safer to leave \code{fast_sgd = FALSE}.
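A quick sketch of the visualization-only use case: \preformatted{# fast but not exactly reproducible: fine for a quick look at the layout
emb <- lvish(iris, fast_sgd = TRUE)
plot(emb)
}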
If \code{fast_sgd = TRUE}, then user-supplied values of \code{pcg_rand} and \code{n_sgd_threads} are ignored.} \item{ret_nn}{If \code{TRUE}, then in addition to the embedding, also return nearest neighbor data that can be used as input to \code{nn_method} to avoid the overhead of repeatedly calculating the nearest neighbors when manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs}, \code{init}). See the "Value" section for the names of the list items. If \code{FALSE}, just return the coordinates. Note that the nearest neighbors could be sensitive to data scaling, so be wary of reusing nearest neighbor data if modifying the \code{scale} parameter.} \item{ret_extra}{A vector indicating what extra data to return. May contain any combination of the following strings: \itemize{ \item \code{"nn"} same as setting \code{ret_nn = TRUE}. \item \code{"P"} the high dimensional probability matrix. The graph is returned as a sparse symmetric N x N matrix of class \link[Matrix]{dgCMatrix-class}, where a non-zero entry (i, j) gives the input probability (or similarity or affinity) of the edge connecting vertex i and vertex j. Note that the graph is further sparsified by removing edges with sufficiently low membership strength that they would not be sampled by the probabilistic edge sampling employed for optimization, and therefore the number of non-zero elements in the matrix is dependent on \code{n_epochs}. If you are only interested in the fuzzy input graph (e.g. for clustering), setting \code{n_epochs = 0} will avoid any further sparsifying. Be aware that setting \code{binary_edge_weights = TRUE} will affect this graph (all non-zero edge weights will be 1). \item \code{"sigma"} a vector of the bandwidths used to calibrate the input Gaussians to reproduce the target \code{"perplexity"}. }} \item{tmpdir}{Temporary directory to store nearest neighbor indexes during nearest neighbor search. Default is \code{\link{tempdir}}. The index is only written to disk if \code{n_threads > 1} and \code{nn_method = "annoy"}; otherwise, this parameter is ignored.} \item{verbose}{If \code{TRUE}, log details to the console.} \item{batch}{If \code{TRUE}, then embedding coordinates are updated at the end of each epoch rather than during the epoch. In batch mode, results are reproducible with a fixed random seed even with \code{n_sgd_threads > 1}, at the cost of a slightly higher memory use. You may also have to modify \code{learning_rate} and increase \code{n_epochs}, so whether this provides a speed increase over the single-threaded optimization is likely to be dataset and hardware-dependent.} \item{opt_args}{A list of optimizer parameters, used when \code{batch = TRUE}. The default optimization method used is Adam (Kingma and Ba, 2014). \itemize{ \item \code{method} The optimization method to use. Either \code{"adam"} or \code{"sgd"} (stochastic gradient descent). Default: \code{"adam"}. \item \code{beta1} (Adam only). The weighting parameter for the exponential moving average of the first moment estimator. Effectively the momentum parameter. Should be a floating point value between 0 and 1. Higher values can smooth oscillatory updates in poorly-conditioned situations and may allow for a larger \code{learning_rate} to be specified, but values that are too high can cause divergence. Default: \code{0.5}. \item \code{beta2} (Adam only). The weighting parameter for the exponential moving average of the uncentered second moment estimator. Should be a floating point value between 0 and 1.
Controls the degree of adaptivity in the step-size. Higher values put more weight on previous time steps. Default: \code{0.9}. \item \code{eps} (Adam only). Intended to be a small value to prevent division by zero, but in practice can also affect convergence due to its interaction with \code{beta2}. Higher values reduce the effect of the step-size adaptivity and bring the behavior closer to stochastic gradient descent with momentum. Typical values are between 1e-8 and 1e-3. Default: \code{1e-7}. \item \code{alpha} The initial learning rate. Default: the value of the \code{learning_rate} parameter. }} \item{epoch_callback}{A function which will be invoked at the end of every epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where: \itemize{ \item \code{epoch} The current epoch number (between \code{1} and \code{n_epochs}). \item \code{n_epochs} Number of epochs to use during the optimization of the embedded coordinates. \item \code{coords} The embedded coordinates as of the end of the current epoch, as a matrix with dimensions (N, \code{n_components}). }} \item{pca_method}{Method to carry out any PCA dimensionality reduction when the \code{pca} parameter is specified. Allowed values are: \itemize{ \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the \href{https://cran.r-project.org/package=irlba}{irlba} package. \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from the \href{https://cran.r-project.org/package=irlba}{irlba} package. This is likely to give much faster but potentially less accurate results than using \code{"irlba"}. For the purposes of nearest neighbor calculation and coordinate initialization, any loss of accuracy doesn't seem to matter much. \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}} from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr} package. The SVD methods used in \code{bigstatsr} may be faster on systems without access to efficient linear algebra libraries (e.g. Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of uwot: if you choose to use this package for PCA, you \emph{must} install it yourself. \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is likely to be slow for all but the smallest datasets. \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than 50\% of the full set of singular vectors would be calculated, in which case \code{"svd"} is used. }} \item{binary_edge_weights}{If \code{TRUE} then edge weights in the input graph are treated as binary (0/1) rather than real valued. This affects the sampling frequency of neighbors and is the strategy used by the PaCMAP method (Wang and co-workers, 2021). Practical (Böhm and co-workers, 2020) and theoretical (Damrich and Hamprecht, 2021) work suggests this has little effect on UMAP's performance.} \item{nn_args}{A list containing additional arguments to pass to the nearest neighbor method. For \code{nn_method = "annoy"}, you can specify \code{"n_trees"} and \code{"search_k"}, and these will override the \code{n_trees} and \code{search_k} parameters. For \code{nn_method = "hnsw"}, you may specify the following arguments: \itemize{ \item \code{M} The maximum number of neighbors to keep for each vertex. Reasonable values are \code{2} to \code{100}. Higher values give better recall at the cost of more memory. Default value is \code{16}. \item \code{ef_construction} A positive integer specifying the size of the dynamic list used during index construction.
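For example (a sketch only: it requires the optional \code{RcppHNSW} package and the values are illustrative, not a recommendation): \preformatted{emb <- lvish(iris, nn_method = "hnsw",
             nn_args = list(M = 32, ef_construction = 400))
}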
A higher value will provide better results at the cost of a longer time to build the index. Default is \code{200}. \item \code{ef} A positive integer specifying the size of the dynamic list used during search. This cannot be smaller than \code{n_neighbors} and cannot be higher than the number of items in the index. Default is \code{10}. } For \code{nn_method = "nndescent"}, you may specify the following arguments: \itemize{ \item \code{n_trees} The number of trees to use in a random projection forest to initialize the search. A larger number will give more accurate results at the cost of a longer computation time. The default of \code{NULL} means that the number is chosen based on the number of observations in \code{X}. \item \code{max_candidates} The number of potential neighbors to explore per iteration. By default, this is set to \code{n_neighbors} or \code{60}, whichever is smaller. A larger number will give more accurate results at the cost of a longer computation time. \item \code{n_iters} The number of iterations to run the search. A larger number will give more accurate results at the cost of a longer computation time. By default, this will be chosen based on the number of observations in \code{X}. You may also need to modify the convergence criterion \code{delta}. \item \code{delta} The minimum relative change in the neighbor graph allowed before early stopping. Should be a value between 0 and 1. The smaller the value, the less progress per iteration is needed for the search to continue. The default value of \code{0.001} means that at least 0.1\% of the neighbor graph must be updated at each iteration. \item \code{init} How to initialize the nearest neighbor descent. By default this is set to \code{"tree"} and uses a random projection forest. If you set this to \code{"rand"}, then a random selection is used. Usually this is less accurate than using RP trees, but for high-dimensional cases, there may be little difference in the quality of the initialization and random initialization will be a lot faster. If you set this to \code{"rand"}, then the \code{n_trees} parameter is ignored. }} \item{rng_type}{The type of random number generator to use during optimization. One of: \itemize{ \item{\code{"pcg"}}. Use the PCG random number generator (O'Neill, 2014). \item{\code{"tausworthe"}}. Use the Tausworthe "taus88" generator. \item{\code{"deterministic"}}. Use a deterministic number generator. This isn't actually random, but may provide enough variation in the negative sampling to give a good embedding and can provide a noticeable speed-up. } For backwards compatibility, by default this is unset and the choice of \code{pcg_rand} is used (making \code{"pcg"} the effective default).} } \value{ A matrix of optimized coordinates, or: \itemize{ \item if \code{ret_nn = TRUE} (or \code{ret_extra} contains \code{"nn"}), returns the nearest neighbor data as a list called \code{nn}. This contains one list for each \code{metric} calculated, each itself containing a matrix \code{idx} with the integer ids of the neighbors, and a matrix \code{dist} with the distances. The \code{nn} list (or a sub-list) can be used as input to the \code{nn_method} parameter. \item if \code{ret_extra} contains \code{"P"}, returns the high dimensional probability matrix as a sparse matrix called \code{P}, of type \link[Matrix]{dgCMatrix-class}.
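A brief sketch of extracting it (as noted under \code{ret_extra}, \code{n_epochs = 0} keeps the full fuzzy graph): \preformatted{res <- lvish(iris, ret_extra = c("P"), n_epochs = 0)
P <- res$P # sparse input probability matrix
}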
\item if \code{ret_extra} contains \code{"sigma"}, returns a vector called \code{sigma} of the high dimensional Gaussian bandwidths for each point, and a vector called \code{dint} with estimates of the intrinsic dimensionality at each point, based on the method given by Lee and co-workers (2015). } The returned list contains the combined data from any combination of specifying \code{ret_nn} and \code{ret_extra}. } \description{ Carry out dimensionality reduction of a dataset using a method similar to LargeVis (Tang et al., 2016). } \details{ \code{lvish} differs from the official LargeVis implementation in the following ways: \itemize{ \item Only the nearest-neighbor index search phase is multi-threaded. \item Matrix input data is not normalized. \item The \code{n_trees} parameter cannot be dynamically chosen based on data set size. \item Nearest neighbor results are not refined via the neighbor-of-my-neighbor method. The \code{search_k} parameter is twice as large as the default to compensate. \item Gradient values are clipped to \code{4.0} rather than \code{5.0}. \item Negative edges are generated by uniform sampling of vertices rather than sampling proportionally to their degree ^ 0.75. \item The default number of samples is much reduced. The default number of epochs, \code{n_epochs}, is set to \code{5000}, much larger than for \code{\link{umap}}, but may need to be increased further depending on your dataset. Using \code{init = "spectral"} can help. } } \examples{
# Default number of epochs is much larger than for UMAP and assumes random
# initialization. Use perplexity rather than n_neighbors to control the size
# of the local neighborhood. 20 epochs may be too small for a random
# initialization.
iris_lvish <- lvish(iris,
  perplexity = 50, learning_rate = 0.5,
  init = "random", n_epochs = 20
)
} \references{ Belkin, M., & Niyogi, P. (2002). Laplacian eigenmaps and spectral techniques for embedding and clustering. In \emph{Advances in neural information processing systems} (pp. 585-591). \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf} Böhm, J. N., Berens, P., & Kobak, D. (2020). A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum. \emph{arXiv preprint} \emph{arXiv:2007.08902}. \url{https://arxiv.org/abs/2007.08902} Damrich, S., & Hamprecht, F. A. (2021). On UMAP's true loss function. \emph{Advances in Neural Information Processing Systems}, \emph{34}. \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html} Dong, W., Moses, C., & Li, K. (2011, March). Efficient k-nearest neighbor graph construction for generic similarity measures. In \emph{Proceedings of the 20th international conference on World Wide Web} (pp. 577-586). ACM. \doi{10.1145/1963405.1963487}. Kingma, D. P., & Ba, J. (2014). Adam: A method for stochastic optimization. \emph{arXiv preprint} \emph{arXiv}:1412.6980. \url{https://arxiv.org/abs/1412.6980} Lee, J. A., Peluffo-Ordóñez, D. H., & Verleysen, M. (2015). Multi-scale similarities in stochastic neighbour embedding: Reducing dimensionality while preserving both local and global structure. \emph{Neurocomputing}, \emph{169}, 246-261. Malkov, Y. A., & Yashunin, D. A. (2018). Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs. \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836. McInnes, L., Healy, J., & Melville, J. (2018).
UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction. \emph{arXiv preprint} \emph{arXiv}:1802.03426. \url{https://arxiv.org/abs/1802.03426} O'Neill, M. E. (2014). \emph{PCG: A family of simple fast space-efficient statistically good algorithms for random number generation} (Report No. HMC-CS-2014-0905). Harvey Mudd College. Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). Visualizing large-scale and high-dimensional data. In \emph{Proceedings of the 25th International Conference on World Wide Web} (pp. 287-297). International World Wide Web Conferences Steering Committee. \url{https://arxiv.org/abs/1602.00370} Van der Maaten, L., & Hinton, G. (2008). Visualizing data using t-SNE. \emph{Journal of Machine Learning Research}, \emph{9}, 2579-2605. \url{https://www.jmlr.org/papers/v9/vandermaaten08a.html} Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021). Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization. \emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73. \url{https://www.jmlr.org/papers/v22/20-1061.html} } uwot/DESCRIPTION0000644000176200001440000000357514757010752013000 0ustar liggesusersPackage: uwot
Title: The Uniform Manifold Approximation and Projection (UMAP) Method for Dimensionality Reduction
Version: 0.2.3
Authors@R: c( person("James", "Melville", , "jlmelville@gmail.com", role = c("aut", "cre", "cph")), person("Aaron", "Lun", role = "ctb"), person("Mohamed Nadhir", "Djekidel", role = "ctb"), person("Yuhan", "Hao", role = "ctb"), person("Dirk", "Eddelbuettel", role = "ctb"), person("Wouter", "van der Bijl", role = "ctb") )
Description: An implementation of the Uniform Manifold Approximation and Projection dimensionality reduction method by McInnes et al. (2018) <arXiv:1802.03426>. It also provides means to transform new data and to carry out supervised dimensionality reduction. An implementation of the related LargeVis method of Tang et al. (2016) <arXiv:1602.00370> is also provided. This is a complete re-implementation in R (and C++, via the 'Rcpp' package): no Python installation is required. See the uwot website (<https://jlmelville.github.io/uwot/>) for more documentation and examples.
License: GPL (>= 3)
URL: https://github.com/jlmelville/uwot, https://jlmelville.github.io/uwot/
BugReports: https://github.com/jlmelville/uwot/issues
Depends: Matrix
Imports: FNN, irlba, methods, Rcpp, RcppAnnoy (>= 0.0.17), RSpectra
Suggests: bigstatsr, covr, knitr, RcppHNSW, rmarkdown, rnndescent, testthat
LinkingTo: dqrng, Rcpp, RcppAnnoy, RcppProgress
VignetteBuilder: knitr
Config/Needs/website: rmarkdown
Encoding: UTF-8
RoxygenNote: 7.3.2
NeedsCompilation: yes
Packaged: 2025-02-24 05:51:09 UTC; jlmel
Author: James Melville [aut, cre, cph], Aaron Lun [ctb], Mohamed Nadhir Djekidel [ctb], Yuhan Hao [ctb], Dirk Eddelbuettel [ctb], Wouter van der Bijl [ctb]
Maintainer: James Melville <jlmelville@gmail.com>
Repository: CRAN
Date/Publication: 2025-02-24 06:30:02 UTC