uwot/0000755000176200001440000000000013571756262011266 5ustar liggesusersuwot/NAMESPACE0000644000176200001440000000045613571663324012506 0ustar liggesusers# Generated by roxygen2: do not edit by hand export(load_uwot) export(lvish) export(save_uwot) export(tumap) export(umap) export(umap_transform) import(Matrix) importFrom(Rcpp,sourceCpp) importFrom(RcppParallel,RcppParallelLibs) importFrom(methods,new) useDynLib(uwot, .registration=TRUE) uwot/man/0000755000176200001440000000000013452015217012023 5ustar liggesusersuwot/man/umap_transform.Rd0000644000176200001440000000641713571660503015365 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/transform.R \name{umap_transform} \alias{umap_transform} \title{Add New Points to an Existing Embedding} \usage{ umap_transform( X, model, init_weighted = TRUE, search_k = NULL, tmpdir = tempdir(), n_epochs = NULL, n_threads = max(1, RcppParallel::defaultNumThreads()/2), n_sgd_threads = 0, grain_size = 1, verbose = FALSE ) } \arguments{ \item{X}{The new data to be transformed, either a matrix of data frame. Must have the same columns in the same order as the input data used to generate the \code{model}.} \item{model}{Data associated with an existing embedding.} \item{init_weighted}{If \code{TRUE}, then initialize the embedded coordinates of \code{X} using a weighted average of the coordinates of the nearest neighbors from the original embedding in \code{model}, where the weights used are the edge weights from the UMAP smoothed knn distances. Otherwise, use an unweighted average.} \item{search_k}{Number of nodes to search during the neighbor retrieval. The larger k, the more the accurate results, but the longer the search takes. Default is the value used in building the \code{model} is used.} \item{tmpdir}{Temporary directory to store nearest neighbor indexes during nearest neighbor search. Default is \code{\link{tempdir}}. 
The index is only written to disk if \code{n_threads > 1}; otherwise, this parameter is ignored.} \item{n_epochs}{Number of epochs to use during the optimization of the embedded coordinates. A value between \code{30 - 100} is a reasonable trade off between speed and thoroughness. By default, this value is set to one third the number of epochs used to build the \code{model}.} \item{n_threads}{Number of threads to use, (except during stochastic gradient descent). Default is half that recommended by RcppParallel.} \item{n_sgd_threads}{Number of threads to use during stochastic gradient descent. If set to > 1, then results will not be reproducible, even if `set.seed` is called with a fixed seed before running.} \item{grain_size}{Minimum batch size for multithreading. If the number of items to process in a thread falls below this number, then no threads will be used. Used in conjunction with \code{n_threads} and \code{n_sgd_threads}.} \item{verbose}{If \code{TRUE}, log details to the console.} } \value{ A matrix of coordinates for \code{X} transformed into the space of the \code{model}. } \description{ Carry out an embedding of new data using an existing embedding. Requires using the result of calling \code{\link{umap}} or \code{\link{tumap}} with \code{ret_model = TRUE}. } \details{ Note that some settings are incompatible with the production of a UMAP model via \code{\link{umap}}: external neighbor data (passed via a list to the argument of the \code{nn_method} parameter), and factor columns that were included in the UMAP calculation via the \code{metric} parameter. In the latter case, the model produced is based only on the numeric data. A transformation is possible, but factor columns in the new data are ignored. 
} \examples{ iris_train <- iris[1:100, ] iris_test <- iris[101:150, ] # You must set ret_model = TRUE to return extra data needed iris_train_umap <- umap(iris_train, ret_model = TRUE) iris_test_umap <- umap_transform(iris_test, iris_train_umap) } uwot/man/save_uwot.Rd0000644000176200001440000000121713571372403014334 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/uwot.R \name{save_uwot} \alias{save_uwot} \title{Save or Load a Model} \usage{ save_uwot(model, file) } \arguments{ \item{model}{a UMAP model create by \code{\link{umap}}.} \item{file}{name of the file where the model is to be saved or read from.} } \description{ Functions to write a UMAP model to a file, and to restore. } \examples{ # create model model <- umap(iris[1:100, ], ret_model = TRUE) # save model_file <- tempfile("iris_umap") save_uwot(model, file = model_file) # restore model2 <- load_uwot(file = model_file) identical(model, model2) unlink(model_file) } uwot/man/tumap.Rd0000644000176200001440000004547113571660504013462 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/uwot.R \name{tumap} \alias{tumap} \title{Dimensionality Reduction Using t-Distributed UMAP (t-UMAP)} \usage{ tumap( X, n_neighbors = 15, n_components = 2, metric = "euclidean", n_epochs = NULL, learning_rate = 1, scale = FALSE, init = "spectral", init_sdev = NULL, set_op_mix_ratio = 1, local_connectivity = 1, bandwidth = 1, repulsion_strength = 1, negative_sample_rate = 5, nn_method = NULL, n_trees = 50, search_k = 2 * n_neighbors * n_trees, n_threads = max(1, RcppParallel::defaultNumThreads()/2), n_sgd_threads = 0, grain_size = 1, y = NULL, target_n_neighbors = n_neighbors, target_metric = "euclidean", target_weight = 0.5, pca = NULL, pca_center = TRUE, pcg_rand = TRUE, fast_sgd = FALSE, ret_model = FALSE, ret_nn = FALSE, tmpdir = tempdir(), verbose = getOption("verbose", TRUE) ) } \arguments{ \item{X}{Input data. 
Can be a \code{\link{data.frame}}, \code{\link{matrix}}, \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}. A sparse matrix is interpreted as a distance matrix and both implicit and explicit zero entries are ignored. Set zero distances you want to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}). Matrix and data frames should contain one observation per row. Data frames will have any non-numeric columns removed, although factor columns will be used if explicitly included via \code{metric} (see the help for \code{metric} for details). Can be \code{NULL} if precomputed nearest neighbor data is passed to \code{nn_method}, and \code{init} is not \code{"spca"} or \code{"pca"}.} \item{n_neighbors}{The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation. Larger values result in more global views of the manifold, while smaller values result in more local data being preserved. In general values should be in the range \code{2} to \code{100}.} \item{n_components}{The dimension of the space to embed into. This defaults to \code{2} to provide easy visualization, but can reasonably be set to any integer value in the range \code{2} to \code{100}.} \item{metric}{Type of distance metric to use to find nearest neighbors. One of: \itemize{ \item \code{"euclidean"} (the default) \item \code{"cosine"} \item \code{"manhattan"} \item \code{"hamming"} \item \code{"categorical"} (see below) } Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the distance metric is always "euclidean"). If \code{X} is a data frame or matrix, then multiple metrics can be specified, by passing a list to this argument, where the name of each item in the list is one of the metric names above. The value of each list item should be a vector giving the names or integer ids of the columns to be included in a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}. 
Each metric calculation results in a separate fuzzy simplicial set, which are intersected together to produce the final set. Metric names can be repeated. Because non-numeric columns are removed from the data frame, it is safer to use column names than integer ids. Factor columns can also be used by specifying the metric name \code{"categorical"}. Factor columns are treated different from numeric columns and although multiple factor columns can be specified in a vector, each factor column specified is processed individually. If you specify a non-factor column, it will be coerced to a factor. For a given data block, you may override the \code{pca} and \code{pca_center} arguments for that block, by providing a list with one unnamed item containing the column names or ids, and then any of the \code{pca} or \code{pca_center} overrides as named items, e.g. \code{metric = list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This exists to allow mixed binary and real-valued data to be included and to have PCA applied to both, but with centering applied only to the real-valued data (it is typical not to apply centering to binary data before PCA is applied).} \item{n_epochs}{Number of epochs to use during the optimization of the embedded coordinates. By default, this value is set to \code{500} for datasets containing 10,000 vertices or less, and \code{200} otherwise.} \item{learning_rate}{Initial learning rate used in optimization of the coordinates.} \item{scale}{Scaling to apply to \code{X} if it is a data frame or matrix: \itemize{ \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling. \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to zero mean and variance 1. \item{\code{"maxabs"}} Center each column to mean 0, then divide each element by the maximum absolute value over the entire matrix. \item{\code{"range"}} Range scale the entire matrix, so the smallest element is 0 and the largest is 1. 
\item{\code{"colrange"}} Scale each column in the range (0,1). } For t-UMAP, the default is \code{"none"}.} \item{init}{Type of initialization for the coordinates. Options are: \itemize{ \item \code{"spectral"} Spectral embedding using the normalized Laplacian of the fuzzy 1-skeleton, with Gaussian noise added. \item \code{"normlaplacian"}. Spectral embedding using the normalized Laplacian of the fuzzy 1-skeleton, without noise. \item \code{"random"}. Coordinates assigned using a uniform random distribution between -10 and 10. \item \code{"lvrandom"}. Coordinates assigned using a Gaussian distribution with standard deviation 1e-4, as used in LargeVis (Tang et al., 2016) and t-SNE. \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap (Belkin and Niyogi, 2002). \item \code{"pca"}. The first two principal components from PCA of \code{X} if \code{X} is a data frame, and from a 2-dimensional classical MDS if \code{X} is of class \code{"dist"}. \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled so the standard deviation is 1e-4, to give a distribution similar to that used in t-SNE. This is an alias for \code{init = "pca", init_sdev = 1e-4}. \item \code{"agspectral"} An "approximate global" modification of \code{"spectral"} which all edges in the graph to a value of 1, and then sets a random number of edges (\code{negative_sample_rate} edges per vertex) to 0.1, to approximate the effect of non-local affinities. \item A matrix of initial coordinates. } For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"}, \code{"laplacian"}), if more than one connected component is identified, each connected component is initialized separately and the results are merged. If \code{verbose = TRUE} the number of connected components are logged to the console. The existence of multiple connected components implies that a global view of the data cannot be attained with this initialization. 
Either a PCA-based initialization or increasing the value of \code{n_neighbors} may be more appropriate.} \item{init_sdev}{If non-\code{NULL}, scales each dimension of the initialized coordinates (including any user-supplied matrix) to this standard deviation. By default no scaling is carried out, except when \code{init = "spca"}, in which case the value is \code{0.0001}. Scaling the input may help if the unscaled versions result in initial coordinates with large inter-point distances or outliers. This usually results in small gradients during optimization and very little progress being made to the layout. Shrinking the initial embedding by rescaling can help under these circumstances. Scaling the result of \code{init = "pca"} is usually recommended and \code{init = "spca"} as an alias for \code{init = "pca", init_sdev = 1e-4} but for the spectral initializations the scaled versions usually aren't necessary unless you are using a large value of \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher).} \item{set_op_mix_ratio}{Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial sets. Both fuzzy set operations use the product t-norm. The value of this parameter should be between \code{0.0} and \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while \code{0.0} will use a pure fuzzy intersection.} \item{local_connectivity}{The local connectivity required -- i.e. the number of nearest neighbors that should be assumed to be connected at a local level. The higher this value the more connected the manifold becomes locally. In practice this should be not more than the local intrinsic dimension of the manifold.} \item{bandwidth}{The effective bandwidth of the kernel if we view the algorithm as similar to Laplacian Eigenmaps. 
Larger values induce more connectivity and a more global view of the data, smaller values concentrate more locally.} \item{repulsion_strength}{Weighting applied to negative samples in low dimensional embedding optimization. Values higher than one will result in greater weight being given to negative samples.} \item{negative_sample_rate}{The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low dimensional embedding.} \item{nn_method}{Method for finding nearest neighbors. Options are: \itemize{ \item \code{"fnn"}. Use exact nearest neighbors via the \href{https://cran.r-project.org/package=FNN}{FNN} package. \item \code{"annoy"} Use approximate nearest neighbors via the \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package. } By default, if \code{X} has less than 4,096 vertices, the exact nearest neighbors are found. Otherwise, approximate nearest neighbors are used. You may also pass precalculated nearest neighbor data to this argument. It must be a list consisting of two elements: \itemize{ \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix containing the integer indexes of the nearest neighbors in \code{X}. Each vertex is considered to be its own nearest neighbor, i.e. \code{idx[, 1] == 1:n_vertices}. \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix containing the distances of the nearest neighbors. } Multiple nearest neighbor data (e.g. from two different precomputed metrics) can be passed by passing a list containing the nearest neighbor data lists as items. The \code{n_neighbors} parameter is ignored when using precalculated nearest neighbor data.} \item{n_trees}{Number of trees to build when constructing the nearest neighbor index. The more trees specified, the larger the index, but the better the results. With \code{search_k}, determines the accuracy of the Annoy nearest neighbor search. Only used if the \code{nn_method} is \code{"annoy"}. 
Sensible values are between \code{10} to \code{100}.} \item{search_k}{Number of nodes to search during the neighbor retrieval. The larger k, the more the accurate results, but the longer the search takes. With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor search. Only used if the \code{nn_method} is \code{"annoy"}.} \item{n_threads}{Number of threads to use (except during stochastic gradient descent). Default is half that recommended by RcppParallel. For nearest neighbor search, only applies if \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index will be temporarily written to disk in the location determined by \code{\link[base]{tempfile}}.} \item{n_sgd_threads}{Number of threads to use during stochastic gradient descent. If set to > 1, then results will not be reproducible, even if `set.seed` is called with a fixed seed before running. Set to \code{"auto"} go use the same value as \code{n_threads}.} \item{grain_size}{Minimum batch size for multithreading. If the number of items to process in a thread falls below this number, then no threads will be used. Used in conjunction with \code{n_threads} and \code{n_sgd_threads}.} \item{y}{Optional target data for supervised dimension reduction. Can be a vector, matrix or data frame. Use the \code{target_metric} parameter to specify the metrics to use, using the same syntax as \code{metric}. Usually either a single numeric or factor column is used, but more complex formats are possible. The following types are allowed: \itemize{ \item Factor columns with the same length as \code{X}. \code{NA} is allowed for any observation with an unknown level, in which case UMAP operates as a form of semi-supervised learning. Each column is treated separately. \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the parameter \code{target_n_neighbors} to set the number of neighbors used with \code{y}. If unset, \code{n_neighbors} is used. 
Unlike factors, numeric columns are grouped into one block unless \code{target_metric} specifies otherwise. For example, if you wish columns \code{a} and \code{b} to be treated separately, specify \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise, the data will be effectively treated as a matrix with two columns. \item Nearest neighbor data, consisting of a list of two matrices, \code{idx} and \code{dist}. These represent the precalculated nearest neighbor indices and distances, respectively. This is the same format as that expected for precalculated data in \code{nn_method}. This format assumes that the underlying data was a numeric vector. Any user-supplied value of the \code{target_n_neighbors} parameter is ignored in this case, because the the number of columns in the matrices is used for the value. Multiple nearest neighbor data using different metrics can be supplied by passing a list of these lists. } Unlike \code{X}, all factor columns included in \code{y} are automatically used.} \item{target_n_neighbors}{Number of nearest neighbors to use to construct the target simplicial set. Default value is \code{n_neighbors}. Applies only if \code{y} is non-\code{NULL} and \code{numeric}.} \item{target_metric}{The metric used to measure distance for \code{y} if using supervised dimension reduction. Used only if \code{y} is numeric.} \item{target_weight}{Weighting factor between data topology and target topology. A value of 0.0 weights entirely on data, a value of 1.0 weights entirely on target. The default of 0.5 balances the weighting equally between data and target. Only applies if \code{y} is non-\code{NULL}.} \item{pca}{If set to a positive integer value, reduce data to this number of columns using PCA. Doesn't applied if the distance \code{metric} is \code{"hamming"}, or the dimensions of the data is larger than the number specified (i.e. number of rows and columns must be larger than the value of this parameter). 
If you have > 100 columns in a data frame or matrix, reducing the number of columns in this way may substantially increase the performance of the nearest neighbor search at the cost of a potential decrease in accuracy. In many t-SNE applications, a value of 50 is recommended, although there's no guarantee that this is appropriate for all settings.} \item{pca_center}{If \code{TRUE}, center the columns of \code{X} before carrying out PCA. For binary data, it's recommended to set this to \code{FALSE}.} \item{pcg_rand}{If \code{TRUE}, use the PCG random number generator (O'Neill, 2014) during optimization. Otherwise, use the faster (but probably less statistically good) Tausworthe "taus88" generator. The default is \code{TRUE}.} \item{fast_sgd}{If \code{TRUE}, then the following combination of parameters is set: \code{pcg_rand = TRUE} and \code{n_sgd_threads = "auto"}. The default is \code{FALSE}. Setting this to \code{TRUE} will speed up the stochastic optimization phase, but give a potentially less accurate embedding, and which will not be exactly reproducible even with a fixed seed. For visualization, \code{fast_sgd = TRUE} will give perfectly good results. For more generic dimensionality reduction, it's safer to leave \code{fast_sgd = FALSE}. If \code{fast_sgd = TRUE}, then user-supplied values of \code{pcg_rand} and \code{n_sgd_threads}, are ignored.} \item{ret_model}{If \code{TRUE}, then return extra data that can be used to add new data to an existing embedding via \code{\link{umap_transform}}. The embedded coordinates are returned as the list item \code{embedding}. If \code{FALSE}, just return the coordinates. This parameter can be used in conjunction with \code{ret_nn}. Note that some settings are incompatible with the production of a UMAP model: external neighbor data (passed via a list to \code{nn_method}), and factor columns that were included via the \code{metric} parameter. In the latter case, the model produced is based only on the numeric data. 
A transformation using new data is possible, but the factor columns in the new data are ignored.} \item{ret_nn}{If \code{TRUE}, then in addition to the embedding, also return nearest neighbor data that can be used as input to \code{nn_method} to avoid the overhead of repeatedly calculating the nearest neighbors when manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs}, \code{init}). See the "Value" section for the names of the list items. If \code{FALSE}, just return the coordinates. Note that the nearest neighbors could be sensitive to data scaling, so be wary of reusing nearest neighbor data if modifying the \code{scale} parameter. This parameter can be used in conjunction with \code{ret_model}.} \item{tmpdir}{Temporary directory to store nearest neighbor indexes during nearest neighbor search. Default is \code{\link{tempdir}}. The index is only written to disk if \code{n_threads > 1} and \code{nn_method = "annoy"}; otherwise, this parameter is ignored.} \item{verbose}{If \code{TRUE}, log details to the console.} } \value{ A matrix of optimized coordinates, or: \itemize{ \item if \code{ret_model = TRUE}, returns a list containing extra information that can be used to add new data to an existing embedding via \code{\link{umap_transform}}. In this case, the coordinates are available in the list item \code{embedding}. \item if \code{ret_nn = TRUE}, returns the nearest neighbor data as a list called \code{nn}. This contains one list for each \code{metric} calculated, itself containing a matrix \code{idx} with the integer ids of the neighbors; and a matrix \code{dist} with the distances. The \code{nn} list (or a sub-list) can be used as input to the \code{nn_method} parameter. } Both \code{ret_model} and \code{ret_nn} can be \code{TRUE}, in which case the returned list contains the combined data. } \description{ A faster (but less flexible) version of the UMAP gradient. For more detail on UMAP, see the \code{\link{umap}} function. 
} \details{ By setting the UMAP curve parameters \code{a} and \code{b} to \code{1}, you get back the Cauchy distribution as used in t-SNE and LargeVis. It also results in a substantially simplified gradient expression. This can give a speed improvement of around 50\%. } \examples{ iris_tumap <- tumap(iris, n_neighbors = 50, learning_rate = 0.5) } uwot/man/lvish.Rd0000644000176200001440000003767213571660504013465 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/uwot.R \name{lvish} \alias{lvish} \title{Dimensionality Reduction with a LargeVis-like method} \usage{ lvish( X, perplexity = 50, n_neighbors = perplexity * 3, n_components = 2, metric = "euclidean", n_epochs = -1, learning_rate = 1, scale = "maxabs", init = "lvrandom", init_sdev = NULL, repulsion_strength = 7, negative_sample_rate = 5, nn_method = NULL, n_trees = 50, search_k = 2 * n_neighbors * n_trees, n_threads = max(1, RcppParallel::defaultNumThreads()/2), n_sgd_threads = 0, grain_size = 1, kernel = "gauss", pca = NULL, pca_center = TRUE, pcg_rand = TRUE, fast_sgd = FALSE, ret_nn = FALSE, tmpdir = tempdir(), verbose = getOption("verbose", TRUE) ) } \arguments{ \item{X}{Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}}, \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}. A sparse matrix is interpreted as a distance matrix and both implicit and explicit zero entries are ignored. Set zero distances you want to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}). Matrix and data frames should contain one observation per row. Data frames will have any non-numeric columns removed, although factor columns will be used if explicitly included via \code{metric} (see the help for \code{metric} for details). 
Can be \code{NULL} if precomputed nearest neighbor data is passed to \code{nn_method}, and \code{init} is not \code{"spca"} or \code{"pca"}.} \item{perplexity}{Controls the size of the local neighborhood used for manifold approximation. This is the analogous to \code{n_neighbors} in \code{\link{umap}}. Change this, rather than \code{n_neighbors}.} \item{n_neighbors}{The number of neighbors to use when calculating the \code{perplexity}. Usually set to three times the value of the \code{perplexity}. Must be at least as large as \code{perplexity}.} \item{n_components}{The dimension of the space to embed into. This defaults to \code{2} to provide easy visualization, but can reasonably be set to any integer value in the range \code{2} to \code{100}.} \item{metric}{Type of distance metric to use to find nearest neighbors. One of: \itemize{ \item \code{"euclidean"} (the default) \item \code{"cosine"} \item \code{"manhattan"} \item \code{"hamming"} \item \code{"categorical"} (see below) } Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the distance metric is always "euclidean"). If \code{X} is a data frame or matrix, then multiple metrics can be specified, by passing a list to this argument, where the name of each item in the list is one of the metric names above. The value of each list item should be a vector giving the names or integer ids of the columns to be included in a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}. Each metric calculation results in a separate fuzzy simplicial set, which are intersected together to produce the final set. Metric names can be repeated. Because non-numeric columns are removed from the data frame, it is safer to use column names than integer ids. Factor columns can also be used by specifying the metric name \code{"categorical"}. 
Factor columns are treated different from numeric columns and although multiple factor columns can be specified in a vector, each factor column specified is processed individually. If you specify a non-factor column, it will be coerced to a factor. For a given data block, you may override the \code{pca} and \code{pca_center} arguments for that block, by providing a list with one unnamed item containing the column names or ids, and then any of the \code{pca} or \code{pca_center} overrides as named items, e.g. \code{metric = list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This exists to allow mixed binary and real-valued data to be included and to have PCA applied to both, but with centering applied only to the real-valued data (it is typical not to apply centering to binary data before PCA is applied).} \item{n_epochs}{Number of epochs to use during the optimization of the embedded coordinates. The default is calculate the number of epochs dynamically based on dataset size, to give the same number of edge samples as the LargeVis defaults. This is usually substantially larger than the UMAP defaults.} \item{learning_rate}{Initial learning rate used in optimization of the coordinates.} \item{scale}{Scaling to apply to \code{X} if it is a data frame or matrix: \itemize{ \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling. \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to zero mean and variance 1. \item{\code{"maxabs"}} Center each column to mean 0, then divide each element by the maximum absolute value over the entire matrix. \item{\code{"range"}} Range scale the entire matrix, so the smallest element is 0 and the largest is 1. \item{\code{"colrange"}} Scale each column in the range (0,1). } For lvish, the default is \code{"maxabs"}, for consistency with LargeVis.} \item{init}{Type of initialization for the coordinates. 
Options are: \itemize{ \item \code{"spectral"} Spectral embedding using the normalized Laplacian of the fuzzy 1-skeleton, with Gaussian noise added. \item \code{"normlaplacian"}. Spectral embedding using the normalized Laplacian of the fuzzy 1-skeleton, without noise. \item \code{"random"}. Coordinates assigned using a uniform random distribution between -10 and 10. \item \code{"lvrandom"}. Coordinates assigned using a Gaussian distribution with standard deviation 1e-4, as used in LargeVis (Tang et al., 2016) and t-SNE. \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap (Belkin and Niyogi, 2002). \item \code{"pca"}. The first two principal components from PCA of \code{X} if \code{X} is a data frame, and from a 2-dimensional classical MDS if \code{X} is of class \code{"dist"}. \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled so the standard deviation is 1e-4, to give a distribution similar to that used in t-SNE and LargeVis. This is an alias for \code{init = "pca", init_sdev = 1e-4}. \item \code{"agspectral"} An "approximate global" modification of \code{"spectral"} which all edges in the graph to a value of 1, and then sets a random number of edges (\code{negative_sample_rate} edges per vertex) to 0.1, to approximate the effect of non-local affinities. \item A matrix of initial coordinates. } For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"}, \code{"laplacian"}), if more than one connected component is identified, each connected component is initialized separately and the results are merged. If \code{verbose = TRUE} the number of connected components are logged to the console. The existence of multiple connected components implies that a global view of the data cannot be attained with this initialization. 
Either a PCA-based initialization or increasing the value of \code{n_neighbors} may be more appropriate.} \item{init_sdev}{If non-\code{NULL}, scales each dimension of the initialized coordinates (including any user-supplied matrix) to this standard deviation. By default no scaling is carried out, except when \code{init = "spca"}, in which case the value is \code{0.0001}. Scaling the input may help if the unscaled versions result in initial coordinates with large inter-point distances or outliers. This usually results in small gradients during optimization and very little progress being made to the layout. Shrinking the initial embedding by rescaling can help under these circumstances. Scaling the result of \code{init = "pca"} is usually recommended and \code{init = "spca"} as an alias for \code{init = "pca", init_sdev = 1e-4} but for the spectral initializations the scaled versions usually aren't necessary unless you are using a large value of \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher).} \item{repulsion_strength}{Weighting applied to negative samples in low dimensional embedding optimization. Values higher than one will result in greater weight being given to negative samples.} \item{negative_sample_rate}{The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low dimensional embedding.} \item{nn_method}{Method for finding nearest neighbors. Options are: \itemize{ \item \code{"fnn"}. Use exact nearest neighbors via the \href{https://cran.r-project.org/package=FNN}{FNN} package. \item \code{"annoy"} Use approximate nearest neighbors via the \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package. } By default, if \code{X} has less than 4,096 vertices, the exact nearest neighbors are found. Otherwise, approximate nearest neighbors are used. You may also pass precalculated nearest neighbor data to this argument. 
It must be a list consisting of two elements: \itemize{ \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix containing the integer indexes of the nearest neighbors in \code{X}. Each vertex is considered to be its own nearest neighbor, i.e. \code{idx[, 1] == 1:n_vertices}. \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix containing the distances of the nearest neighbors. } Multiple nearest neighbor data (e.g. from two different precomputed metrics) can be passed by passing a list containing the nearest neighbor data lists as items. The \code{n_neighbors} parameter is ignored when using precomputed nearest neighbor data.} \item{n_trees}{Number of trees to build when constructing the nearest neighbor index. The more trees specified, the larger the index, but the better the results. With \code{search_k}, determines the accuracy of the Annoy nearest neighbor search. Only used if the \code{nn_method} is \code{"annoy"}. Sensible values are between \code{10} to \code{100}.} \item{search_k}{Number of nodes to search during the neighbor retrieval. The larger k, the more the accurate results, but the longer the search takes. With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor search. Only used if the \code{nn_method} is \code{"annoy"}.} \item{n_threads}{Number of threads to use (except during stochastic gradient descent). Default is half that recommended by RcppParallel. For nearest neighbor search, only applies if \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index will be temporarily written to disk in the location determined by \code{\link[base]{tempfile}}.} \item{n_sgd_threads}{Number of threads to use during stochastic gradient descent. If set to > 1, then results will not be reproducible, even if `set.seed` is called with a fixed seed before running. Set to \code{"auto"} go use the same value as \code{n_threads}.} \item{grain_size}{Minimum batch size for multithreading. 
If the number of items to process in a thread falls below this number, then no threads will be used. Used in conjunction with \code{n_threads} and \code{n_sgd_threads}.} \item{kernel}{Type of kernel function to create input probabilities. Can be one of \code{"gauss"} (the default) or \code{"knn"}. \code{"gauss"} uses the usual Gaussian weighted similarities. \code{"knn"} assigns equal probabilities to every edge in the nearest neighbor graph, and zero otherwise, using \code{perplexity} nearest neighbors. The \code{n_neighbors} parameter is ignored in this case.} \item{pca}{If set to a positive integer value, reduce data to this number of columns using PCA. Doesn't applied if the distance \code{metric} is \code{"hamming"}, or the dimensions of the data is larger than the number specified (i.e. number of rows and columns must be larger than the value of this parameter). If you have > 100 columns in a data frame or matrix, reducing the number of columns in this way may substantially increase the performance of the nearest neighbor search at the cost of a potential decrease in accuracy. In many t-SNE applications, a value of 50 is recommended, although there's no guarantee that this is appropriate for all settings.} \item{pca_center}{If \code{TRUE}, center the columns of \code{X} before carrying out PCA. For binary data, it's recommended to set this to \code{FALSE}.} \item{pcg_rand}{If \code{TRUE}, use the PCG random number generator (O'Neill, 2014) during optimization. Otherwise, use the faster (but probably less statistically good) Tausworthe "taus88" generator. The default is \code{TRUE}.} \item{fast_sgd}{If \code{TRUE}, then the following combination of parameters is set: \code{pcg_rand = TRUE} and \code{n_sgd_threads = "auto"}. The default is \code{FALSE}. Setting this to \code{TRUE} will speed up the stochastic optimization phase, but give a potentially less accurate embedding, and which will not be exactly reproducible even with a fixed seed. 
For visualization, \code{fast_sgd = TRUE} will give perfectly good results. For more generic dimensionality reduction, it's safer to leave \code{fast_sgd = FALSE}. If \code{fast_sgd = TRUE}, then user-supplied values of \code{pcg_rand} and \code{n_sgd_threads}, are ignored.} \item{ret_nn}{If \code{TRUE}, then in addition to the embedding, also return nearest neighbor data that can be used as input to \code{nn_method} to avoid the overhead of repeatedly calculating the nearest neighbors when manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs}, \code{init}). See the "Value" section for the names of the list items. If \code{FALSE}, just return the coordinates. Note that the nearest neighbors could be sensitive to data scaling, so be wary of reusing nearest neighbor data if modifying the \code{scale} parameter.} \item{tmpdir}{Temporary directory to store nearest neighbor indexes during nearest neighbor search. Default is \code{\link{tempdir}}. The index is only written to disk if \code{n_threads > 1} and \code{nn_method = "annoy"}; otherwise, this parameter is ignored.} \item{verbose}{If \code{TRUE}, log details to the console.} } \value{ A matrix of optimized coordinates, or if \code{ret_nn = TRUE}, returns the nearest neighbor data as a list containing a matrix \code{idx} with the integer ids of the neighbors; and a matrix \code{dist} with the distances. This list can be used as input to the \code{nn_method} parameter. } \description{ Carry out dimensionality reduction of a dataset using a method similar to LargeVis (Tang et al., 2016). } \details{ \code{lvish} differs from the official LargeVis implementation in the following: \itemize{ \item Only the nearest-neighbor index search phase is multi-threaded. \item Matrix input data is not normalized. \item The \code{n_trees} parameter cannot be dynamically chosen based on data set size. \item Nearest neighbor results are not refined via the neighbor-of-my-neighbor method. 
The \code{search_k} parameter is twice as large than default to compensate. \item Gradient values are clipped to \code{4.0} rather than \code{5.0}. \item Negative edges are generated by uniform sampling of vertexes rather than their degree ^ 0.75. \item The default number of samples is much reduced. The default number of epochs, \code{n_epochs}, is set to \code{5000}, much larger than for \code{\link{umap}}, but may need to be increased further depending on your dataset. Using \code{init = "spectral"} can help. } } \examples{ # Default number of epochs is much larger than for UMAP, assumes random # initialization # If using a more global initialization, can use fewer epochs iris_lvish_short <- lvish(iris, perplexity = 50, n_epochs = 200, init = "pca" ) # Use perplexity rather than n_neighbors to control the size of the local # neighborhood # 200 epochs may be too small for a random initialization iris_lvish <- lvish(iris, perplexity = 50, learning_rate = 0.5, init = "random", n_epochs = 200 ) } \references{ Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). Visualizing large-scale and high-dimensional data. In \emph{Proceedings of the 25th International Conference on World Wide Web} (pp. 287-297). International World Wide Web Conferences Steering Committee. 
\url{https://arxiv.org/abs/1602.00370} } uwot/man/umap.Rd0000644000176200001440000005515713571660504013300 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/uwot.R \name{umap} \alias{umap} \title{Dimensionality Reduction with UMAP} \usage{ umap( X, n_neighbors = 15, n_components = 2, metric = "euclidean", n_epochs = NULL, learning_rate = 1, scale = FALSE, init = "spectral", init_sdev = NULL, spread = 1, min_dist = 0.01, set_op_mix_ratio = 1, local_connectivity = 1, bandwidth = 1, repulsion_strength = 1, negative_sample_rate = 5, a = NULL, b = NULL, nn_method = NULL, n_trees = 50, search_k = 2 * n_neighbors * n_trees, approx_pow = FALSE, y = NULL, target_n_neighbors = n_neighbors, target_metric = "euclidean", target_weight = 0.5, pca = NULL, pca_center = TRUE, pcg_rand = TRUE, fast_sgd = FALSE, ret_model = FALSE, ret_nn = FALSE, n_threads = max(1, RcppParallel::defaultNumThreads()/2), n_sgd_threads = 0, grain_size = 1, tmpdir = tempdir(), verbose = getOption("verbose", TRUE) ) } \arguments{ \item{X}{Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}}, \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}. A sparse matrix is interpreted as a distance matrix and both implicit and explicit zero entries are ignored. Set zero distances you want to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}). Matrix and data frames should contain one observation per row. Data frames will have any non-numeric columns removed, although factor columns will be used if explicitly included via \code{metric} (see the help for \code{metric} for details). Can be \code{NULL} if precomputed nearest neighbor data is passed to \code{nn_method}, and \code{init} is not \code{"spca"} or \code{"pca"}.} \item{n_neighbors}{The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation. 
Larger values result in more global views of the manifold, while smaller values result in more local data being preserved. In general values should be in the range \code{2} to \code{100}.} \item{n_components}{The dimension of the space to embed into. This defaults to \code{2} to provide easy visualization, but can reasonably be set to any integer value in the range \code{2} to \code{100}.} \item{metric}{Type of distance metric to use to find nearest neighbors. One of: \itemize{ \item \code{"euclidean"} (the default) \item \code{"cosine"} \item \code{"manhattan"} \item \code{"hamming"} \item \code{"categorical"} (see below) } Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the distance metric is always "euclidean"). If \code{X} is a data frame or matrix, then multiple metrics can be specified, by passing a list to this argument, where the name of each item in the list is one of the metric names above. The value of each list item should be a vector giving the names or integer ids of the columns to be included in a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}. Each metric calculation results in a separate fuzzy simplicial set, which are intersected together to produce the final set. Metric names can be repeated. Because non-numeric columns are removed from the data frame, it is safer to use column names than integer ids. Factor columns can also be used by specifying the metric name \code{"categorical"}. Factor columns are treated different from numeric columns and although multiple factor columns can be specified in a vector, each factor column specified is processed individually. If you specify a non-factor column, it will be coerced to a factor. For a given data block, you may override the \code{pca} and \code{pca_center} arguments for that block, by providing a list with one unnamed item containing the column names or ids, and then any of the \code{pca} or \code{pca_center} overrides as named items, e.g. 
\code{metric = list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This exists to allow mixed binary and real-valued data to be included and to have PCA applied to both, but with centering applied only to the real-valued data (it is typical not to apply centering to binary data before PCA is applied).} \item{n_epochs}{Number of epochs to use during the optimization of the embedded coordinates. By default, this value is set to \code{500} for datasets containing 10,000 vertices or less, and \code{200} otherwise.} \item{learning_rate}{Initial learning rate used in optimization of the coordinates.} \item{scale}{Scaling to apply to \code{X} if it is a data frame or matrix: \itemize{ \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling. \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to zero mean and variance 1. \item{\code{"maxabs"}} Center each column to mean 0, then divide each element by the maximum absolute value over the entire matrix. \item{\code{"range"}} Range scale the entire matrix, so the smallest element is 0 and the largest is 1. \item{\code{"colrange"}} Scale each column in the range (0,1). } For UMAP, the default is \code{"none"}.} \item{init}{Type of initialization for the coordinates. Options are: \itemize{ \item \code{"spectral"} Spectral embedding using the normalized Laplacian of the fuzzy 1-skeleton, with Gaussian noise added. \item \code{"normlaplacian"}. Spectral embedding using the normalized Laplacian of the fuzzy 1-skeleton, without noise. \item \code{"random"}. Coordinates assigned using a uniform random distribution between -10 and 10. \item \code{"lvrandom"}. Coordinates assigned using a Gaussian distribution with standard deviation 1e-4, as used in LargeVis (Tang et al., 2016) and t-SNE. \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap (Belkin and Niyogi, 2002). \item \code{"pca"}. 
The first two principal components from PCA of \code{X} if \code{X} is a data frame, and from a 2-dimensional classical MDS if \code{X} is of class \code{"dist"}. \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled so the standard deviation is 1e-4, to give a distribution similar to that used in t-SNE. This is an alias for \code{init = "pca", init_sdev = 1e-4}. \item \code{"agspectral"} An "approximate global" modification of \code{"spectral"} which all edges in the graph to a value of 1, and then sets a random number of edges (\code{negative_sample_rate} edges per vertex) to 0.1, to approximate the effect of non-local affinities. \item A matrix of initial coordinates. } For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"}, \code{"laplacian"}), if more than one connected component is identified, each connected component is initialized separately and the results are merged. If \code{verbose = TRUE} the number of connected components are logged to the console. The existence of multiple connected components implies that a global view of the data cannot be attained with this initialization. Either a PCA-based initialization or increasing the value of \code{n_neighbors} may be more appropriate.} \item{init_sdev}{If non-\code{NULL}, scales each dimension of the initialized coordinates (including any user-supplied matrix) to this standard deviation. By default no scaling is carried out, except when \code{init = "spca"}, in which case the value is \code{0.0001}. Scaling the input may help if the unscaled versions result in initial coordinates with large inter-point distances or outliers. This usually results in small gradients during optimization and very little progress being made to the layout. Shrinking the initial embedding by rescaling can help under these circumstances. 
Scaling the result of \code{init = "pca"} is usually recommended and \code{init = "spca"} as an alias for \code{init = "pca", init_sdev = 1e-4} but for the spectral initializations the scaled versions usually aren't necessary unless you are using a large value of \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher).} \item{spread}{The effective scale of embedded points. In combination with \code{min_dist}, this determines how clustered/clumped the embedded points are.} \item{min_dist}{The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together, while larger values will result on a more even dispersal of points. The value should be set relative to the \code{spread} value, which determines the scale at which embedded points will be spread out.} \item{set_op_mix_ratio}{Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial sets. Both fuzzy set operations use the product t-norm. The value of this parameter should be between \code{0.0} and \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while \code{0.0} will use a pure fuzzy intersection.} \item{local_connectivity}{The local connectivity required -- i.e. the number of nearest neighbors that should be assumed to be connected at a local level. The higher this value the more connected the manifold becomes locally. In practice this should be not more than the local intrinsic dimension of the manifold.} \item{bandwidth}{The effective bandwidth of the kernel if we view the algorithm as similar to Laplacian Eigenmaps. Larger values induce more connectivity and a more global view of the data, smaller values concentrate more locally.} \item{repulsion_strength}{Weighting applied to negative samples in low dimensional embedding optimization. 
Values higher than one will result in greater weight being given to negative samples.} \item{negative_sample_rate}{The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low dimensional embedding.} \item{a}{More specific parameters controlling the embedding. If \code{NULL} these values are set automatically as determined by \code{min_dist} and \code{spread}.} \item{b}{More specific parameters controlling the embedding. If \code{NULL} these values are set automatically as determined by \code{min_dist} and \code{spread}.} \item{nn_method}{Method for finding nearest neighbors. Options are: \itemize{ \item \code{"fnn"}. Use exact nearest neighbors via the \href{https://cran.r-project.org/package=FNN}{FNN} package. \item \code{"annoy"} Use approximate nearest neighbors via the \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package. } By default, if \code{X} has less than 4,096 vertices, the exact nearest neighbors are found. Otherwise, approximate nearest neighbors are used. You may also pass precalculated nearest neighbor data to this argument. It must be a list consisting of two elements: \itemize{ \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix containing the integer indexes of the nearest neighbors in \code{X}. Each vertex is considered to be its own nearest neighbor, i.e. \code{idx[, 1] == 1:n_vertices}. \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix containing the distances of the nearest neighbors. } Multiple nearest neighbor data (e.g. from two different precomputed metrics) can be passed by passing a list containing the nearest neighbor data lists as items. The \code{n_neighbors} parameter is ignored when using precomputed nearest neighbor data.} \item{n_trees}{Number of trees to build when constructing the nearest neighbor index. The more trees specified, the larger the index, but the better the results. 
With \code{search_k}, determines the accuracy of the Annoy nearest neighbor search. Only used if the \code{nn_method} is \code{"annoy"}. Sensible values are between \code{10} to \code{100}.} \item{search_k}{Number of nodes to search during the neighbor retrieval. The larger k, the more the accurate results, but the longer the search takes. With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor search. Only used if the \code{nn_method} is \code{"annoy"}.} \item{approx_pow}{If \code{TRUE}, use an approximation to the power function in the UMAP gradient, from \url{https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/}.} \item{y}{Optional target data for supervised dimension reduction. Can be a vector, matrix or data frame. Use the \code{target_metric} parameter to specify the metrics to use, using the same syntax as \code{metric}. Usually either a single numeric or factor column is used, but more complex formats are possible. The following types are allowed: \itemize{ \item Factor columns with the same length as \code{X}. \code{NA} is allowed for any observation with an unknown level, in which case UMAP operates as a form of semi-supervised learning. Each column is treated separately. \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the parameter \code{target_n_neighbors} to set the number of neighbors used with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors, numeric columns are grouped into one block unless \code{target_metric} specifies otherwise. For example, if you wish columns \code{a} and \code{b} to be treated separately, specify \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise, the data will be effectively treated as a matrix with two columns. \item Nearest neighbor data, consisting of a list of two matrices, \code{idx} and \code{dist}. These represent the precalculated nearest neighbor indices and distances, respectively. 
This is the same format as that expected for precalculated data in \code{nn_method}. This format assumes that the underlying data was a numeric vector. Any user-supplied value of the \code{target_n_neighbors} parameter is ignored in this case, because the the number of columns in the matrices is used for the value. Multiple nearest neighbor data using different metrics can be supplied by passing a list of these lists. } Unlike \code{X}, all factor columns included in \code{y} are automatically used.} \item{target_n_neighbors}{Number of nearest neighbors to use to construct the target simplicial set. Default value is \code{n_neighbors}. Applies only if \code{y} is non-\code{NULL} and \code{numeric}.} \item{target_metric}{The metric used to measure distance for \code{y} if using supervised dimension reduction. Used only if \code{y} is numeric.} \item{target_weight}{Weighting factor between data topology and target topology. A value of 0.0 weights entirely on data, a value of 1.0 weights entirely on target. The default of 0.5 balances the weighting equally between data and target. Only applies if \code{y} is non-\code{NULL}.} \item{pca}{If set to a positive integer value, reduce data to this number of columns using PCA. Doesn't applied if the distance \code{metric} is \code{"hamming"}, or the dimensions of the data is larger than the number specified (i.e. number of rows and columns must be larger than the value of this parameter). If you have > 100 columns in a data frame or matrix, reducing the number of columns in this way may substantially increase the performance of the nearest neighbor search at the cost of a potential decrease in accuracy. In many t-SNE applications, a value of 50 is recommended, although there's no guarantee that this is appropriate for all settings.} \item{pca_center}{If \code{TRUE}, center the columns of \code{X} before carrying out PCA. 
For binary data, it's recommended to set this to \code{FALSE}.} \item{pcg_rand}{If \code{TRUE}, use the PCG random number generator (O'Neill, 2014) during optimization. Otherwise, use the faster (but probably less statistically good) Tausworthe "taus88" generator. The default is \code{TRUE}.} \item{fast_sgd}{If \code{TRUE}, then the following combination of parameters is set: \code{pcg_rand = TRUE}, \code{n_sgd_threads = "auto"} and \code{approx_pow = TRUE}. The default is \code{FALSE}. Setting this to \code{TRUE} will speed up the stochastic optimization phase, but give a potentially less accurate embedding, and which will not be exactly reproducible even with a fixed seed. For visualization, \code{fast_sgd = TRUE} will give perfectly good results. For more generic dimensionality reduction, it's safer to leave \code{fast_sgd = FALSE}. If \code{fast_sgd = TRUE}, then user-supplied values of \code{pcg_rand}, \code{n_sgd_threads}, and \code{approx_pow} are ignored.} \item{ret_model}{If \code{TRUE}, then return extra data that can be used to add new data to an existing embedding via \code{\link{umap_transform}}. The embedded coordinates are returned as the list item \code{embedding}. If \code{FALSE}, just return the coordinates. This parameter can be used in conjunction with \code{ret_nn}. Note that some settings are incompatible with the production of a UMAP model: external neighbor data (passed via a list to \code{nn_method}), and factor columns that were included via the \code{metric} parameter. In the latter case, the model produced is based only on the numeric data. A transformation using new data is possible, but the factor columns in the new data are ignored.} \item{ret_nn}{If \code{TRUE}, then in addition to the embedding, also return nearest neighbor data that can be used as input to \code{nn_method} to avoid the overhead of repeatedly calculating the nearest neighbors when manipulating unrelated parameters (e.g. 
\code{min_dist}, \code{n_epochs}, \code{init}). See the "Value" section for the names of the list items. If \code{FALSE}, just return the coordinates. Note that the nearest neighbors could be sensitive to data scaling, so be wary of reusing nearest neighbor data if modifying the \code{scale} parameter. This parameter can be used in conjunction with \code{ret_model}.} \item{n_threads}{Number of threads to use (except during stochastic gradient descent). Default is half that recommended by RcppParallel. For nearest neighbor search, only applies if \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index will be temporarily written to disk in the location determined by \code{\link[base]{tempfile}}.} \item{n_sgd_threads}{Number of threads to use during stochastic gradient descent. If set to > 1, then results will not be reproducible, even if `set.seed` is called with a fixed seed before running. Set to \code{"auto"} go use the same value as \code{n_threads}.} \item{grain_size}{Minimum batch size for multithreading. If the number of items to process in a thread falls below this number, then no threads will be used. Used in conjunction with \code{n_threads} and \code{n_sgd_threads}.} \item{tmpdir}{Temporary directory to store nearest neighbor indexes during nearest neighbor search. Default is \code{\link{tempdir}}. The index is only written to disk if \code{n_threads > 1} and \code{nn_method = "annoy"}; otherwise, this parameter is ignored.} \item{verbose}{If \code{TRUE}, log details to the console.} } \value{ A matrix of optimized coordinates, or: \itemize{ \item if \code{ret_model = TRUE}, returns a list containing extra information that can be used to add new data to an existing embedding via \code{\link{umap_transform}}. In this case, the coordinates are available in the list item \code{embedding}. \item if \code{ret_nn = TRUE}, returns the nearest neighbor data as a list called \code{nn}. 
This contains one list for each \code{metric} calculated, itself containing a matrix \code{idx} with the integer ids of the neighbors; and a matrix \code{dist} with the distances. The \code{nn} list (or a sub-list) can be used as input to the \code{nn_method} parameter. } Both \code{ret_model} and \code{ret_nn} can be \code{TRUE}, in which case the returned list contains the combined data. } \description{ Carry out dimensionality reduction of a dataset using the Uniform Manifold Approximation and Projection (UMAP) method (McInnes & Healy, 2018). Some of the following help text is lifted verbatim from the Python reference implementation at \url{https://github.com/lmcinnes/umap}. } \examples{ # Non-numeric columns are automatically removed so you can pass data frames # directly in a lot of cases without pre-processing iris_umap <- umap(iris, n_neighbors = 50, learning_rate = 0.5, init = "random" ) # Although not an issue for the iris dataset, for high dimensional data # (> 100 columns), using PCA to reduce dimensionality is highly # recommended to avoid nearest neighbor searches taking a long time # 50 dimensions is a good value to start with. If there are fewer columns # in the input than the requested number of components, the parameter is # ignored. 
iris_umap <- umap(iris, pca = 50) # Faster approximation to the gradient iris_umap <- umap(iris, n_neighbors = 15, approx_pow = TRUE) # Can specify min_dist and spread parameters to control separation and size # of clusters iris_umap <- umap(iris, n_neighbors = 15, min_dist = 1, spread = 5) # Supervised dimension reduction using the 'Species' factor column iris_sumap <- umap(iris, n_neighbors = 15, min_dist = 0.001, y = iris$Species, target_weight = 0.5 ) \donttest{ # Calculate Petal and Sepal neighbors separately (uses intersection of the resulting sets): iris_umap <- umap(iris, metric = list( "euclidean" = c("Sepal.Length", "Sepal.Width"), "euclidean" = c("Petal.Length", "Petal.Width") )) # Can also use individual factor columns iris_umap <- umap(iris, metric = list( "euclidean" = c("Sepal.Length", "Sepal.Width"), "euclidean" = c("Petal.Length", "Petal.Width"), "categorical" = "Species" )) # Return NN info iris_umap <- umap(iris, ret_nn = TRUE) # Re-use NN info for greater efficiency # Here we use random initialization iris_umap_spca <- umap(iris, init = "rand", nn_method = iris_umap$nn) } } \references{ Belkin, M., & Niyogi, P. (2002). Laplacian eigenmaps and spectral techniques for embedding and clustering. In \emph{Advances in neural information processing systems} (pp. 585-591). \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf} McInnes, L., & Healy, J. (2018). UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction \emph{arXiv preprint} \emph{arXiv}:1802.03426. \url{https://arxiv.org/abs/1802.03426} O’Neill, M. E. (2014). \emph{PCG: A family of simple fast space-efficient statistically good algorithms for random number generation} (Report No. HMC-CS-2014-0905). Harvey Mudd College. Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). Visualizing large-scale and high-dimensional data. In \emph{Proceedings of the 25th International Conference on World Wide Web} (pp. 
287-297). International World Wide Web Conferences Steering Committee. \url{https://arxiv.org/abs/1602.00370} Van der Maaten, L., & Hinton, G. (2008). Visualizing data using t-SNE. \emph{Journal of Machine Learning Research}, \emph{9} (2579-2605). \url{http://www.jmlr.org/papers/v9/vandermaaten08a.html} } uwot/man/load_uwot.Rd0000644000176200001440000000111613571372403014313 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/uwot.R \name{load_uwot} \alias{load_uwot} \title{Save or Load a Model} \usage{ load_uwot(file) } \arguments{ \item{file}{name of the file where the model is to be saved or read from.} } \description{ Functions to write a UMAP model to a file, and to restore. } \examples{ # create model model <- umap(iris[1:100, ], ret_model = TRUE) # save model_file <- tempfile("iris_umap") save_uwot(model, file = model_file) # restore model2 <- load_uwot(file = model_file) identical(model, model2) unlink(model_file) } uwot/DESCRIPTION0000644000176200001440000000304513571756262012776 0ustar liggesusersPackage: uwot Title: The Uniform Manifold Approximation and Projection (UMAP) Method for Dimensionality Reduction Version: 0.1.5 Authors@R: c(person("James", "Melville", email = "jlmelville@gmail.com", role = c("aut", "cre")), person("Aaron", "Lun", role="ctb"),person("Mohamed Nadhir", "Djekidel", role="ctb")) Description: An implementation of the Uniform Manifold Approximation and Projection dimensionality reduction by McInnes et al. (2018) . It also provides means to transform new data and to carry out supervised dimensionality reduction. An implementation of the related LargeVis method of Tang et al. (2016) is also provided. This is a complete re-implementation in R (and C++, via the 'Rcpp' package): no Python installation is required. See the uwot website () for more documentation and examples. 
License: GPL-3 URL: https://github.com/jlmelville/uwot BugReports: https://github.com/jlmelville/uwot/issues Encoding: UTF-8 LazyData: true Suggests: testthat, covr RoxygenNote: 7.0.2 Depends: Matrix LinkingTo: Rcpp, RcppProgress, RcppParallel, RcppAnnoy, dqrng Imports: Rcpp, methods, FNN, RSpectra, RcppAnnoy (>= 0.0.11), RcppParallel, irlba SystemRequirements: GNU make NeedsCompilation: yes Packaged: 2019-12-04 08:05:20 UTC; jlmel Author: James Melville [aut, cre], Aaron Lun [ctb], Mohamed Nadhir Djekidel [ctb] Maintainer: James Melville Repository: CRAN Date/Publication: 2019-12-04 16:20:02 UTC uwot/tests/0000755000176200001440000000000013400360122012401 5ustar liggesusersuwot/tests/testthat/0000755000176200001440000000000013571756262014270 5ustar liggesusersuwot/tests/testthat/test_perplexity.R0000644000176200001440000002534313571660452017661 0ustar liggesuserslibrary(uwot) context("perplexity") # Full neighbor values based on comparison with smallvis results iris10_nn10 <- dist_nn(dist(iris10), k = 10) P_symm <- matrix(c( 0.000000e+00, 0.0022956859, 0.0022079944, 0.0004763074, 4.338953e-02, 1.822079e-02, 0.002913239, 0.0413498285, 5.184416e-05, 0.004134502, 2.295686e-03, 0.0000000000, 0.0188919615, 0.0129934442, 1.089032e-03, 5.689921e-04, 0.002131646, 0.0048261793, 6.996252e-03, 0.050676976, 2.207994e-03, 0.0188919615, 0.0000000000, 0.0444964580, 2.464225e-03, 5.935835e-04, 0.040353636, 0.0027720360, 1.111298e-02, 0.013673490, 4.763074e-04, 0.0129934442, 0.0444964580, 0.0000000000, 5.466455e-04, 2.771325e-04, 0.018028275, 0.0005761904, 3.389471e-02, 0.014363302, 4.338953e-02, 0.0010890318, 0.0024642250, 0.0005466455, 0.000000e+00, 1.831834e-02, 0.006393040, 0.0329052015, 5.372241e-05, 0.002356628, 1.822079e-02, 0.0005689921, 0.0005935835, 0.0002771325, 1.831834e-02, 0.000000e+00, 0.001326343, 0.0110122168, 1.065771e-05, 0.001168212, 2.913239e-03, 0.0021316462, 0.0403536359, 0.0180282748, 6.393040e-03, 1.326343e-03, 0.000000000, 0.0059083283, 4.862680e-03, 
0.002656313, 4.134983e-02, 0.0048261793, 0.0027720360, 0.0005761904, 3.290520e-02, 1.101222e-02, 0.005908328, 0.0000000000, 2.982247e-04, 0.012212476, 5.184416e-05, 0.0069962518, 0.0111129834, 0.0338947056, 5.372241e-05, 1.065771e-05, 0.004862680, 0.0002982247, 0.000000e+00, 0.004150755, 4.134502e-03, 0.0506769759, 0.0136734904, 0.0143633019, 2.356628e-03, 1.168212e-03, 0.002656313, 0.0122124758, 4.150755e-03, 0.000000000 ) * 10, nrow = 10, byrow = TRUE) res <- perplexity_similarities( perplexity = 4, verbose = FALSE, nn = find_nn(iris10, k = 10, method = "fnn", metric = "euclidean", n_threads = 0, verbose = FALSE ) ) expect_true(Matrix::isSymmetric(res)) expect_equal(as.matrix(res), P_symm, tol = 1e-5, check.attributes = FALSE) Psymm9 <- matrix( c( 0, 0.1111, 0.1112, 0.1110, 0.1116, 0.1113, 0.1112, 0.1115, 0.1106, 0.1112, 0.1111, 0, 0.1113, 0.1113, 0.1110, 0.1107, 0.1112, 0.1112, 0.1112, 0.1114, 0.1112, 0.1113, 0, 0.1113, 0.1112, 0.1106, 0.1114, 0.1112, 0.1112, 0.1113, 0.1110, 0.1113, 0.1113, 0, 0.1110, 0.1105, 0.1114, 0.1111, 0.1113, 0.1114, 0.1116, 0.1110, 0.1112, 0.1110, 0, 0.1113, 0.1113, 0.1115, 0.1106, 0.1111, 0.1113, 0.1107, 0.1106, 0.1105, 0.1113, 0, 0.1105, 0.1111, 0.1103, 0.1105, 0.1112, 0.1112, 0.1114, 0.1114, 0.1113, 0.1105, 0, 0.1113, 0.1112, 0.1112, 0.1115, 0.1112, 0.1112, 0.1111, 0.1115, 0.1111, 0.1113, 0, 0.1108, 0.1114, 0.1106, 0.1112, 0.1112, 0.1113, 0.1106, 0.1103, 0.1112, 0.1108, 0, 0.1111, 0.1112, 0.1114, 0.1113, 0.1114, 0.1111, 0.1105, 0.1112, 0.1114, 0.1111, 0 ), byrow = TRUE, nrow = 10 ) res <- perplexity_similarities( perplexity = 9, verbose = FALSE, nn = find_nn(iris10, k = 10, method = "fnn", metric = "euclidean", n_threads = 0, verbose = FALSE ) ) expect_true(Matrix::isSymmetric(res)) expect_equal(as.matrix(res), Psymm9, tol = 1e-4, check.attributes = FALSE) P_symm_6nn <- matrix(c( 0, 0, 0.004227396, 0, 0.038581602, 0.016370215, 0.003972948, 0.037491042, 0, 0.007253571, 0, 0, 0.020541010, 0.01457322, 0, 0, 0, 0.008117719, 0.008608916, 
0.043891243, 0.004227396, 0.020541010, 0, 0.04314614, 0.004242199, 0, 0.036275982, 0.004791681, 0.010952319, 0.015666352, 0, 0.014573224, 0.043146139, 0, 0, 0, 0.018725165, 0, 0.032811238, 0.015644628, 0.038581602, 0, 0.004242199, 0, 0, 0.016370215, 0.010365583, 0.031963895, 0, 0.003730662, 0.016370215, 0, 0, 0, 0.016370215, 0, 0.002795087, 0.011902114, 0, 0.002562369, 0.003972948, 0, 0.036275982, 0.01872517, 0.010365583, 0.002795087, 0, 0.006321792, 0.004717900, 0.003609179, 0.037491042, 0.008117719, 0.004791681, 0, 0.031963895, 0.011902114, 0.006321792, 0, 0, 0.015406444, 0, 0.008608916, 0.010952319, 0.03281124, 0, 0, 0.004717900, 0, 0, 0.004370167, 0.007253571, 0.043891243, 0.015666352, 0.01564463, 0.003730662, 0.002562369, 0.003609179, 0.015406444, 0.004370167, 0 ) * 10, nrow = 10, byrow = TRUE) res <- perplexity_similarities( perplexity = 4, verbose = FALSE, nn = find_nn(iris10, k = 6, method = "fnn", metric = "euclidean", n_threads = 0, verbose = FALSE ) ) expect_true(Matrix::isSymmetric(res)) expect_equal(as.matrix(res), P_symm_6nn, tol = 1e-5, check.attributes = FALSE) P_row <- matrix(c( 0.000000e+00, 0.03254778, 0.04322171, 0.009522236, 4.179712e-01, 1.389888e-02, 0.03932256, 0.3802648571, 1.633620e-04, 0.06308741, 1.336594e-02, 0.00000000, 0.21654628, 0.163906282, 4.387114e-03, 4.819686e-08, 0.02029701, 0.0618376045, 2.029701e-02, 0.49936271, 9.381792e-04, 0.16129295, 0.00000000, 0.400023323, 9.381792e-04, 7.502536e-16, 0.29552576, 0.0143118438, 7.811162e-03, 0.11915861, 3.912338e-06, 0.09596260, 0.48990584, 0.000000000, 3.912338e-06, 1.913538e-19, 0.09596260, 0.0009992458, 1.842071e-01, 0.13295484, 4.498193e-01, 0.01739352, 0.04834632, 0.010928997, 0.000000e+00, 1.584988e-02, 0.07694327, 0.3403719404, 2.009270e-04, 0.04014584, 3.505169e-01, 0.01137979, 0.01187167, 0.005542650, 3.505169e-01, 0.000000e+00, 0.02652673, 0.2200679860, 2.131340e-04, 0.02336422, 1.894222e-02, 0.02233591, 0.51154696, 0.264602894, 5.091753e-02, 1.331050e-07, 0.00000000, 
0.0834805843, 1.155348e-02, 0.03662029, 4.467317e-01, 0.03468598, 0.04112888, 0.010524562, 3.177321e-01, 1.763500e-04, 0.03468598, 0.0000000000, 1.925167e-05, 0.11431520, 8.735213e-04, 0.11962802, 0.21444851, 0.493687059, 8.735213e-04, 2.023140e-08, 0.08570012, 0.0059452421, 0.000000e+00, 0.07884399, 1.960264e-02, 0.51417681, 0.15431120, 0.154311203, 6.986716e-03, 2.081749e-08, 0.01650597, 0.1299343209, 4.171117e-03, 0.00000000 ), nrow = 10, byrow = TRUE) # expected_sigmas <- c(0.3252233, 0.2679755, 0.1817380, 0.1751287, 0.3280264, 0.4861266, 0.2463306, 0.2422687, 0.3463065, 0.2411619) res <- calc_row_probabilities_parallel(iris10_nn10$dist, iris10_nn10$idx, perplexity = 4, parallelize = FALSE, verbose = FALSE )$matrix res <- nn_to_sparse(iris10_nn10$idx, as.vector(res), self_nbr = TRUE, max_nbr_id = nrow(iris10_nn10$idx) ) expect_equal(as.matrix(res), P_row, tol = 1e-5, check.attributes = FALSE) RcppParallel::setThreadOptions(numThreads = 1) res <- calc_row_probabilities_parallel(iris10_nn10$dist, iris10_nn10$idx, perplexity = 4, verbose = FALSE )$matrix res <- nn_to_sparse(iris10_nn10$idx, as.vector(res), self_nbr = TRUE, max_nbr_id = nrow(iris10_nn10$idx) ) expect_equal(as.matrix(res), P_row, tol = 1e-5, check.attributes = FALSE) iris_dup <- duplicated(x2m(iris)) uiris <- iris[!iris_dup, ] # LargeVis-style iris normalization normiris <- scale(x2m(uiris), center = TRUE, scale = FALSE) normiris <- normiris / max(abs(normiris)) # niris10_nn149 <- dist_nn(dist(normiris), k = 149) # expect_equal(1 / res$sigma ^ 2, Prow_niris_p150_k50_betas, tol = 1e-5) # Taken from LargeVis C++ implementation # Prow_niris_p150_k50_betas <- # c( # 5.885742, 5.736816, 5.197266, 5.471191, 5.71875, 5.699707, 5.451172, 6.242188, 4.727051, 5.95459, # 5.53418, 6.278809, 5.412598, 3.991699, 4.12207, 4.150879, 4.842285, 6.005859, 5.578369, 5.635254, # 6.838379, 5.978516, 4.450684, 7.612305, 7.321289, 6.594238, 6.863281, 6.144043, 6.030762, 6.04248, # 6.217285, 6.470703, 4.745117, 4.282471, 
6.104004, 5.433594, 5.414551, 5.560547, 4.568359, 6.307129, # 5.703125, 4.453369, 4.664551, 7.005859, 6.844238, 5.671875, 5.775879, 5.260254, 5.626465, 5.994629, # 13.903809, 19.361816, 17.199951, 10.595215, 21.495117, 19.960938, 22.048828, 6.962402, 17.828125, 9.40918, # 6.223511, 18.996094, 11.31665, 26.000977, 9.661377, 15.227539, 18.856934, 12.867676, 20.546875, 9.992432, # 22.505859, 16.000977, 23.277344, 24.697266, 18.488281, 16.779785, 17.730957, 23.207031, 25.374023, 8.538818, # 8.724121, 8.112671, 12.005859, 23.245605, 16.113281, 19.423828, 17.885254, 19.319336, 13.402344, 11.15625, # 15.037109, 24.227539, 12.86377, 6.838623, 14.530762, 14.649414, 15.500977, 20.317383, 7.972168, 14.302246, # 8.755371, 19.519531, 9.589355, 18.270508, 11.995361, 4.407959, 11.406738, 6.748779, 14.204102, 6.195068, # 22.625977, 21.842773, 14.548828, 17.580078, 15.902832, 15.451172, 19.606445, 3.676147, 3.610718, 18.935059, # 10.456055, 18.219238, 4.096863, 26.099609, 12.159668, 8.970703, 27.116211, 26.008301, 15.51416, 11.36377, # 7.287109, 4.036987, 14.57373, 25.604492, 17.268066, 5.501221, 11.135986, 19.822266, 25.111816, 14.611328, # 11.272705, 15.021484, 9.565918, 9.592529, 15.895508, 22.691895, 22.258789, 13.867188, 22.583008 # ) # Taken from the LargeVis C++ implementation Prow_iris_p150_k50_rowSums <- c( 1.064902, 1.01981, 1.022902, 1.00269, 1.058712, 0.959587, 1.020604, 1.072308, 0.918501, 1.035426, 1.010711, 1.055485, 1.00664, 0.874596, 0.840662, 0.782034, 0.960034, 1.065464, 0.91116, 1.029154, 1.016113, 1.041956, 0.94594, 1.038197, 1.010267, 1.021842, 1.064241, 1.060124, 1.058187, 1.030837, 1.03162, 1.022887, 0.938471, 0.876479, 1.042369, 1.031878, 0.992018, 1.047312, 0.93035, 1.069906, 1.057901, 0.766783, 0.954321, 1.030951, 0.977892, 1.013204, 1.025217, 1.012253, 1.028178, 1.065557, 0.84282, 1.103245, 0.981218, 0.927567, 1.190567, 1.135817, 1.11851, 0.745579, 1.074413, 0.892972, 0.717667, 1.111475, 0.84308, 1.285817, 0.884184, 0.949937, 1.078251, 1.009963, 
0.930185, 0.96844, 1.107436, 1.031162, 1.161345, 1.165258, 1.076716, 1.029794, 1.008188, 1.173165, 1.26895, 0.857554, 0.919873, 0.887521, 1.015009, 1.200669, 0.958936, 0.979149, 1.070616, 0.946976, 1.019518, 0.987791, 1.007625, 1.257148, 1.038777, 0.754076, 1.078727, 1.049221, 1.098134, 1.148382, 0.74274, 1.084166, 0.819342, 1.072723, 0.958094, 1.101275, 1.036348, 0.744063, 0.689913, 0.811003, 0.896867, 0.781658, 1.162044, 1.199109, 1.110908, 0.923403, 0.831603, 1.055974, 1.167772, 0.662894, 0.653821, 0.899437, 1.003726, 0.952751, 0.705241, 1.281862, 1.039594, 0.882058, 1.306628, 1.290206, 1.09357, 0.864005, 0.824926, 0.663579, 1.063836, 1.242015, 0.843297, 0.769286, 0.907494, 1.14366, 1.252945, 1.073047, 1.022525, 0.951965, 0.977462, 0.941184, 1.050544, 1.128182, 1.230836, 0.925821, 1.158545 ) res <- perplexity_similarities( perplexity = 50, n_threads = 0, verbose = FALSE, nn = find_nn(normiris, k = 149, method = "fnn", metric = "euclidean", n_threads = 0, verbose = FALSE ) ) expect_equal(Matrix::rowSums(res), Prow_iris_p150_k50_rowSums, tol = 1e-6) RcppParallel::setThreadOptions(numThreads = 1) res <- perplexity_similarities( perplexity = 50, n_threads = 1, verbose = FALSE, nn = find_nn(normiris, k = 149, method = "fnn", metric = "euclidean", n_threads = 0, verbose = FALSE ) ) expect_equal(Matrix::rowSums(res), Prow_iris_p150_k50_rowSums, tol = 1e-6) uwot/tests/testthat/test_epochs.R0000644000176200001440000000117313400360122016706 0ustar liggesuserslibrary(uwot) context("Epochs") V <- fuzzy_simplicial_set(nn = nn) n_epochs <- 500 V@x[V@x < max(V@x) / n_epochs] <- 0 V <- Matrix::drop0(V) expect_equal(make_epochs_per_sample(V@x, n_epochs), c( 1.0, 1.0, 1.0, 6.1763447, 1.44948659, 2.16709542, 2.35197392, 1.0, 1.44948659, 1.0, 1.0, 1.73965964, 2.00000768, 2.16709542, 1.0, 1.54738133, 1.0, 1.37424666, 1.0, 1.0, 3.56216605, 1.09552314, 1.0, 1.0, 1.0, 1.54738133, 3.56216605, 2.82683975, 1.0, 1.09552314, 2.82683975, 3.21967221, 2.35197392, 1.73965964, 1.0, 6.1763447, 
1.0, 2.00000768, 1.37424666, 3.21967221 ), tol = 1e-5 ) uwot/tests/testthat/test_spectral.R0000644000176200001440000000551213571660461017265 0ustar liggesuserslibrary(uwot) context("Spectral") test_that("normalized laplacian", { # These numbers come from running UMAP Python code: # spectral_layout(pairwise_distances(iris.data[0:10, :])) # NB: # 1. iris data in scikit-learn is currently from UCI repo, which has errors # (although this doesn't affect the first ten entries) # 2. eigenvector calculation is not that converged and specifies a starting # vector that we can't supply with either RSpectra or eigen. # 3. The eigenvectors are only identical up to a sign, so we take the absolute # values. expected_norm_lap <- c2y( 0.7477, -0.1292, -0.03001, 0.02127, -0.563, -0.01149, 0.1402, -0.2725, -0.01241, 0.1084, -0.106, -0.5723, 0.2024, -0.3082, 0.1642, -5.549e-05, -0.04843, -0.1747, 0.1684, 0.6611 ) res <- normalized_laplacian_init(Matrix::drop0(x2d(iris[1:10, ]))) expect_equal(abs(res), abs(expected_norm_lap), tolerance = 1e-2) }) test_that("laplacian eigenmap", { expected_lap_eig <- c2y( 0.3964, -0.2585, -0.297, -0.3923, 0.3905, 0.3581, -0.1268, 0.2687, -0.356, -0.1954, 0.2775, 0.3298, 0.1282, -0.09545, 0.1503, -0.4656, -0.1417, 0.4416, -0.3753, 0.4397 ) # Test with distance matrix (simple and symmetric) res <- laplacian_eigenmap(Matrix::drop0(x2d(iris10))) expect_equal(abs(res), abs(expected_lap_eig), tolerance = 1e-4) }) test_that("1 dimensional output gives a matrix", { expect_ok_matrix(spectral_init(V_union, ndim = 1, verbose = FALSE), nc = 1) expect_ok_matrix(normalized_laplacian_init(V_union, ndim = 1, verbose = FALSE ), nc = 1) expect_ok_matrix(laplacian_eigenmap(V_union, ndim = 1, verbose = FALSE), nc = 1 ) # 23: ndim was always 2 expect_ok_matrix(agspectral_init(V_union, n_neg_nbrs = 2, ndim = 1, verbose = FALSE), nc = 1 ) }) test_that("connected components", { # Example from doc of scipy.sparse.csgraph.connected_components graph <- Matrix::drop0(matrix( c( 0, 
1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0 ), nrow = 5, byrow = TRUE )) cc_res <- connected_components(graph) expect_equal(cc_res$n_components, 2) expect_equal(cc_res$labels, c(0, 0, 0, 1, 1)) # Slightly more complicated example validated by running the Python version graph100 <- matrix(0, nrow = 10, ncol = 10) graph100[cbind(c(2, 6, 7, 8), c(5, 3, 7, 6))] <- 1 graph100 <- Matrix::drop0(graph100) g100_nc <- 7 g100_labels <- c(0, 1, 2, 3, 1, 2, 4, 2, 5, 6) cc_res <- connected_components(graph100) expect_equal(cc_res$n_components, g100_nc) expect_equal(cc_res$labels, g100_labels) # test recursive initialization of components sgraph <- graph + Matrix::t(graph) expect_ok_matrix(spectral_init(sgraph), nr = 5, nc = 2) }) uwot/tests/testthat/test_supervised.R0000644000176200001440000000714313571660463017645 0ustar liggesuserscontext("Supervised") # categorical y res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 1, y = ycat ) expect_ok_matrix(res) # numeric y res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 1, y = ynum ) expect_ok_matrix(res) # mixed categorical and numeric res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 1, y = data.frame(ycat, ynum) ) expect_ok_matrix(res) # multiple categorical y res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 1, y = data.frame(ycat, ycat2) ) expect_ok_matrix(res) # multiple numeric y res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 1, y = data.frame(ynum, ynum2) ) expect_ok_matrix(res) # multiple numeric and categorical res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, 
min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 1, y = data.frame(ynum, ynum2, ycat, ycat2) ) expect_ok_matrix(res) # multiple numeric with different metrics and categorical set.seed(1337) res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 1, target_metric = list("euclidean" = 1, "cosine" = 2), target_weight = 0.5, y = data.frame(ynum, ynum2, ycat, ycat2) ) expect_ok_matrix(res) sm <- Matrix::drop0(matrix(c( -0.9183907, -1.4071020, 0.70400164, 0.4990913, -0.1631884, -0.03232201, 0.2156861, 0.4341653, 0.92592670 ), byrow = TRUE, nrow = 3)) expected <- matrix(c( -4.310855, 1, 1, 1, -0.7608521, 0.4345031, 1, 0.4345031, 1 ), byrow = TRUE, nrow = 3) expect_equal(as.matrix(reset_local_connectivity(sm)), expected, tol = 1e-7, check.attributes = FALSE ) sparr <- new("dgCMatrix", i = c(0L, 2L, 0L, 1L, 2L, 0L, 1L), p = c(0L, 2L, 5L, 7L), Dim = c(3L, 3L), Dimnames = list(NULL, NULL), x = c( 0.918390745913514, 0.215686070576616, 1.40710203887692, 0.163188411813119, 0.434165332563817, 0.704001636268765, 0.0323220081795518 ), factors = list() ) sparr2 <- new("dgCMatrix", i = c(0L, 1L, 2L, 1L, 2L, 0L, 1L), p = c(0L, 3L, 5L, 7L), Dim = c(3L, 3L), Dimnames = list(NULL, NULL), x = c( 1.68463092, 2.91620546, 0.26469792, 1.08820257, 0.96444675, 1.46399222, 2.72643589 ), factors = list() ) # Numbers taken from Python implementation int09 <- general_simplicial_set_intersection(sparr, sparr2, 0.9) res09 <- matrix(c( 1.66877087146, 0.137467853888, 1.40799953091, 1.84399206494, 0.889673751622, 1.86201852389, 0.223218799442, 0.879058365893, 0.000000 ), nrow = 3, byrow = TRUE ) expect_equal(as.matrix(int09), res09, check.attributes = FALSE, tol = 1e-6) int01 <- general_simplicial_set_intersection(sparr, sparr2, 0.1) res01 <- matrix(c( 0.97318335824, 1.12392924757, 0.734457833761, 0.0182018202924, 0.164728272878, 0.0361324854953, 0.186072986202, 0.432422466467, 0.000000 ), nrow = 3, byrow = TRUE ) 
expect_equal(as.matrix(int01), res01, check.attributes = FALSE, tol = 1e-6) sp34 <- Matrix::drop0(matrix(nrow = 3, byrow = TRUE, c( 0, 0.7403984, 0, 0.6574427, 0, 0, 0.9472488, 0, 0, 0.3039677, 0.2868714, 0 ))) expect_equal(colMaxs(sp34), c(0, 0.7403984, 0.9472488, 0.6574427)) uwot/tests/testthat/test_fuzzy_simplicial_set.R0000644000176200001440000001000313400360122021665 0ustar liggesuserslibrary(uwot) context("fuzzy simplicial set") ### Various fuzzy set matrices are defined in helper_fuzzy_sets.R # matrix res <- fuzzy_simplicial_set( set_op_mix_ratio = 1, local_connectivity = 1, bandwidth = 1, verbose = FALSE, n_threads = 0, nn = nn ) expect_equal(res, V_union, tol = 1e-4) # mix union + intersection res <- fuzzy_simplicial_set( set_op_mix_ratio = 0.5, local_connectivity = 1, bandwidth = 1, verbose = FALSE, n_threads = 0, nn = nn ) expect_equal(res, V_mix, tol = 1e-4) # intersection res <- fuzzy_simplicial_set( set_op_mix_ratio = 0, local_connectivity = 1, bandwidth = 1, verbose = FALSE, n_threads = 0, nn = nn ) expect_equal(res, V_intersect, tol = 1e-4) # Union + local_connectivity res <- fuzzy_simplicial_set( set_op_mix_ratio = 1, local_connectivity = 1.5, bandwidth = 1, verbose = FALSE, n_threads = 0, nn = nn ) expect_equal(res, V_union_local, tol = 1e-4) # Union + bandwidth res <- fuzzy_simplicial_set( set_op_mix_ratio = 1, local_connectivity = 1, bandwidth = 0.5, verbose = FALSE, n_threads = 0, nn = nn ) expect_equal(res, V_union_bandwidth, tol = 1e-4) # intersect + local + bandwidth res <- fuzzy_simplicial_set( set_op_mix_ratio = 0, local_connectivity = 1.5, bandwidth = 0.5, verbose = FALSE, n_threads = 0, nn = nn ) expect_equal(res, V_intersect_local_bandwidth, tol = 1e-4) # parallel code path RcppParallel::setThreadOptions(numThreads = 1) # matrix res <- fuzzy_simplicial_set( set_op_mix_ratio = 1, local_connectivity = 1, bandwidth = 1, verbose = FALSE, n_threads = 1, nn = nn ) expect_equal(res, V_union, tol = 1e-4) # mix union + intersection res <- 
fuzzy_simplicial_set( set_op_mix_ratio = 0.5, local_connectivity = 1, bandwidth = 1, verbose = FALSE, n_threads = 1, nn = nn ) expect_equal(res, V_mix, tol = 1e-4) # intersection res <- fuzzy_simplicial_set( set_op_mix_ratio = 0, local_connectivity = 1, bandwidth = 1, verbose = FALSE, n_threads = 1, nn = nn ) expect_equal(res, V_intersect, tol = 1e-4) # Union + local_connectivity res <- fuzzy_simplicial_set( set_op_mix_ratio = 1, local_connectivity = 1.5, bandwidth = 1, verbose = FALSE, n_threads = 1, nn = nn ) expect_equal(res, V_union_local, tol = 1e-4) # Union + bandwidth res <- fuzzy_simplicial_set( set_op_mix_ratio = 1, local_connectivity = 1, bandwidth = 0.5, verbose = FALSE, n_threads = 1, nn = nn ) expect_equal(res, V_union_bandwidth, tol = 1e-4) # intersect + local + bandwidth res <- fuzzy_simplicial_set( set_op_mix_ratio = 0, local_connectivity = 1.5, bandwidth = 0.5, verbose = FALSE, n_threads = 1, nn = nn ) expect_equal(res, V_intersect_local_bandwidth, tol = 1e-4) # parallel code path RcppParallel::setThreadOptions(numThreads = 1) # matrix res <- fuzzy_simplicial_set( set_op_mix_ratio = 1, local_connectivity = 1, bandwidth = 1, verbose = FALSE, n_threads = 1, nn = nn ) expect_equal(res, V_union, tol = 1e-4) # mix union + intersection res <- fuzzy_simplicial_set( set_op_mix_ratio = 0.5, local_connectivity = 1, bandwidth = 1, verbose = FALSE, n_threads = 1, nn = nn ) expect_equal(res, V_mix, tol = 1e-4) # intersection res <- fuzzy_simplicial_set( set_op_mix_ratio = 0, local_connectivity = 1, bandwidth = 1, verbose = FALSE, n_threads = 1, nn = nn ) expect_equal(res, V_intersect, tol = 1e-4) # Union + local_connectivity res <- fuzzy_simplicial_set( set_op_mix_ratio = 1, local_connectivity = 1.5, bandwidth = 1, verbose = FALSE, n_threads = 1, nn = nn ) expect_equal(res, V_union_local, tol = 1e-4) # Union + bandwidth res <- fuzzy_simplicial_set( set_op_mix_ratio = 1, local_connectivity = 1, bandwidth = 0.5, verbose = FALSE, n_threads = 1, nn = nn ) 
expect_equal(res, V_union_bandwidth, tol = 1e-4) # intersect + local + bandwidth res <- fuzzy_simplicial_set( set_op_mix_ratio = 0, local_connectivity = 1.5, bandwidth = 0.5, verbose = FALSE, n_threads = 1, nn = nn ) expect_equal(res, V_intersect_local_bandwidth, tol = 1e-4) uwot/tests/testthat/test_errors.R0000644000176200001440000000456713571660426016776 0ustar liggesuserslibrary(uwot) context("Input validation") expect_error(umap(iris10, n_neighbors = 1, n_threads = 0), "n_neighbors") expect_error(umap(iris10, n_neighbors = 15, n_threads = 0), "n_neighbors") expect_error(umap(iris10, set_op_mix_ratio = 10, n_threads = 0), "set_op_mix_ratio") expect_error(umap(iris10, set_op_mix_ratio = -10, n_threads = 0), "set_op_mix_ratio") expect_error(umap(iris10, local_connectivity = 0.5, n_threads = 0), "local_connectivity") expect_error(umap(diris10, ret_model = TRUE, n_threads = 0), "models") expect_error(umap(dmiris10z, ret_model = TRUE, n_threads = 0), "models") expect_error(umap(dmiris10z[, 1:9], n_threads = 0), "distance") expect_error(umap(dmiris10z[, 1:9], n_threads = 0), "distance") expect_error(umap(iris[, "Species", drop = FALSE], n_threads = 0), "numeric") expect_error(umap(iris10, n_threads = 0, nn_method = list()), "precalculated") expect_error(umap(iris10, n_threads = 0, nn_method = list(idx = matrix(1:4, nrow = 2))), "idx") expect_error(umap(iris10, n_threads = 0, nn_method = list(idx = matrix(1:40, nrow = 10))), "dist") expect_error(umap(iris10, n_threads = 0, nn_method = list( idx = matrix(1:40, nrow = 10), dist = matrix(1:4, nrow = 2) )), "dist") expect_error(umap(iris10, n_threads = 0, n_neighbors = 4, nn_method = "fnn", metric = "cosine"), "FNN") expect_error(umap(iris10, n_threads = 0, n_neighbors = 4, nn_method = "fnn", ret_model = TRUE), "FNN") expect_error(lvish(iris10, n_threads = 0, perplexity = 50), "perplexity") expect_error(tumap(iris10, n_components = 0), "n_components") expect_error(umap(iris10, pca = 1), "'pca' must be >=") 
expect_error(umap(iris10, n_threads = 0, n_neighbors = 4, y = c(1:9, NA)), "numeric y") expect_error(umap( X = NULL, n_threads = 0, n_neighbors = 4, nn_method = nn, init = "spca" ), "spca") # add an extra column to nn nn5 <- nn nn5$idx <- cbind(nn5$idx, rep(100, nrow(nn5$idx))) nn5$dist <- cbind(nn5$dist, rep(100.0, nrow(nn5$dist))) expect_error(umap(X = NULL, n_threads = 0, nn_method = list(nn, nn5)), "Precalculated") expect_error(umap(iris10, n_threads = 0, pca = 0), "positive integer") expect_error(umap(iris10, n_threads = -1), "n_threads") expect_error(umap(iris10, n_sgd_threads = -1), "n_sgd_threads") model <- umap(iris10, n_neighbors = 2, ret_model = TRUE, n_epochs = 2) expect_error(umap_transform(iris10[, 1:2], model), "Incorrect dimensions") uwot/tests/testthat/test_scale.R0000644000176200001440000000231513571660454016537 0ustar liggesuserslibrary(uwot) context("Scaling") iris10_sd <- apply(iris10, 2, sd) iris10_mean <- apply(iris10, 2, mean) iris10_none <- scale_input(iris10, scale_type = FALSE) expect_equal(apply(iris10_none, 2, sd), iris10_sd) expect_equal(apply(iris10_none, 2, mean), iris10_mean) iris10_scale <- scale_input(iris10, scale_type = TRUE) expect_equal(apply(iris10_scale, 2, sd), rep(1, 4), check.attributes = FALSE) expect_equal(apply(iris10_scale, 2, mean), rep(0, 4), check.attributes = FALSE) # "scale" and "z" and TRUE are synonyms expect_equal(scale_input(iris10, scale_type = "scale"), iris10_scale) expect_equal(scale_input(iris10, scale_type = "Z"), iris10_scale) iris10_maxabs <- scale_input(iris10, scale_type = "maxabs") expect_equal(apply(iris10_maxabs, 2, mean), rep(0, 4), check.attributes = FALSE) expect_equal(max(abs(iris10_maxabs)), 1) iris10_range <- scale_input(iris10, scale_type = "range") expect_equal(max(iris10_range), 1) expect_equal(min(iris10_range), 0) iris10_colrange <- scale_input(iris10, scale_type = "colrange") expect_equal(apply(iris10_colrange, 2, max), rep(1, 4), check.attributes = FALSE) 
expect_equal(apply(iris10_colrange, 2, min), rep(0, 4), check.attributes = FALSE) uwot/tests/testthat/test_neighbors.R0000644000176200001440000001262313571660437017434 0ustar liggesuserslibrary(uwot) context("neighbors") i10nn4dist <- matrix(c( 0, 0.1414214, 0.1732051, 0.4690416, 0, 0.1732051, 0.3000000, 0.3316625, 0, 0.2449490, 0.2645751, 0.3000000, 0, 0.2449490, 0.3000000, 0.3162278, 0, 0.1414214, 0.2236068, 0.4582576, 0, 0.6164414, 0.6164414, 0.7000000, 0, 0.2645751, 0.3316625, 0.4242641, 0, 0.1732051, 0.2236068, 0.3316625, 0, 0.3000000, 0.4358899, 0.5099020, 0, 0.1732051, 0.3162278, 0.3162278 ), nrow = 10, byrow = TRUE) i10nn4idx <- matrix(c( 1, 5, 8, 10, 2, 10, 3, 4, 3, 4, 7, 2, 4, 3, 9, 10, 5, 1, 8, 7, 6, 1, 5, 8, 7, 3, 4, 8, 8, 1, 5, 10, 9, 4, 3, 2, 10, 2, 3, 4 ), nrow = 10, byrow = TRUE) ## Test specialized functions res <- FNN_nn(iris10, k = 4, include_self = TRUE) expect_equal(res$dist, i10nn4dist, tol = 1e-6) expect_equal(res$idx[-6, ], i10nn4idx[-6, ]) res <- dist_nn(diris10, k = 4, include_self = TRUE) expect_equal(res$dist, i10nn4dist, tol = 1e-6) expect_equal(res$idx[-6, ], i10nn4idx[-6, ]) res <- sparse_nn(dmiris10z, k = 4, include_self = TRUE) expect_equal(res$dist, i10nn4dist, tol = 1e-6) expect_equal(res$idx[-6, ], i10nn4idx[-6, ]) # Test overall function res <- find_nn(iris10, k = 4, include_self = TRUE) expect_equal(res$dist, i10nn4dist, tol = 1e-6) expect_equal(res$idx[-6, ], i10nn4idx[-6, ]) res <- find_nn(diris10, k = 4, include_self = TRUE) expect_equal(res$dist, i10nn4dist, tol = 1e-6) expect_equal(res$idx[-6, ], i10nn4idx[-6, ]) res <- find_nn(dmiris10z, k = 4, include_self = TRUE) expect_equal(res$dist, i10nn4dist, tol = 1e-6) expect_equal(res$idx[-6, ], i10nn4idx[-6, ]) # Test Annoy # ten iris entries where the 4 nearest neighbors are distinct uiris <- unique(iris) uirism <- as.matrix(uiris[, -5]) ui10 <- uirism[6:15, ] nn_index4 <- matrix(c( 6, 10, 3, 7, 7, 3, 5, 8, 7, 5, 2, 8, 9, 8, 2, 5, 8, 3, 7, 2, 1, 3, 10, 7, 3, 2, 5, 8, 5, 4, 7, 
3, 4, 8, 2, 5, 6, 1, 3, 7 ), nrow = 10, byrow = TRUE) nn_dist4 <- matrix(c( 0.3464102, 0.6782330, 0.7000000, 0.8124038, 0.3000000, 0.4242641, 0.4795832, 0.4898979, 0.2236068, 0.3316625, 0.4242641, 0.4690416, 0.3464102, 0.4242641, 0.5477226, 0.5567764, 0.1732051, 0.3316625, 0.3464102, 0.4795832, 0.3464102, 0.5000000, 0.5830952, 0.6782330, 0.2236068, 0.3000000, 0.3464102, 0.4582576, 0.1732051, 0.4242641, 0.4582576, 0.4690416, 0.3464102, 0.5830952, 0.6164414, 0.7280110, 0.5830952, 0.6782330, 1.0440307, 1.2328828 ), nrow = 10, byrow = TRUE) self_nn_index4 <- matrix(c( 1, 6, 10, 3, 2, 7, 3, 5, 3, 7, 5, 2, 4, 9, 8, 2, 5, 8, 3, 7, 6, 1, 3, 10, 7, 3, 2, 5, 8, 5, 4, 7, 9, 4, 8, 2, 10, 6, 1, 3 ), nrow = 10, byrow = TRUE) self_nn_dist4 <- matrix(c( 0, 0.3464102, 0.6782330, 0.7000000, 0, 0.3000000, 0.4242641, 0.4795832, 0, 0.2236068, 0.3316625, 0.4242641, 0, 0.3464102, 0.4242641, 0.5477226, 0, 0.1732051, 0.3316625, 0.3464102, 0, 0.3464102, 0.5000000, 0.5830952, 0, 0.2236068, 0.3000000, 0.3464102, 0, 0.1732051, 0.4242641, 0.4582576, 0, 0.3464102, 0.5830952, 0.6164414, 0, 0.5830952, 0.6782330, 1.0440307 ), nrow = 10, byrow = TRUE) res <- annoy_nn(ui10, k = 4, n_threads = 0) expect_equal(res$idx, self_nn_index4, check.attributes = FALSE) expect_equal(res$dist, self_nn_dist4, check.attributes = FALSE, tol = 1e-6) res <- annoy_nn(ui10, k = 4, n_threads = 0, ret_index = TRUE) expect_equal(res$idx, self_nn_index4, check.attributes = FALSE) expect_equal(res$dist, self_nn_dist4, check.attributes = FALSE, tol = 1e-6) expect_true(!is.null(res$index)) expect_is(res$index, "Rcpp_AnnoyEuclidean") res <- annoy_nn(ui10, k = 4, n_threads = 1) expect_equal(res$idx, self_nn_index4, check.attributes = FALSE) expect_equal(res$dist, self_nn_dist4, check.attributes = FALSE, tol = 1e-6) res <- annoy_nn(ui10, k = 4, n_threads = 1, ret_index = TRUE) expect_equal(res$idx, self_nn_index4, check.attributes = FALSE) expect_equal(res$dist, self_nn_dist4, check.attributes = FALSE, tol = 1e-6) 
expect_true(!is.null(res$index)) expect_is(res$index, "Rcpp_AnnoyEuclidean") cos_index <- matrix( c( 1, 2, 7, 3, 2, 1, 7, 3, 3, 6, 4, 7, 4, 3, 5, 7, 5, 8, 4, 3, 6, 3, 9, 4, 7, 3, 1, 4, 8, 5, 4, 3, 9, 6, 10, 3, 10, 9, 6, 3 ), byrow = TRUE, ncol = 4, nrow = 10 ) # Cosine distances from HNSW cos_dist <- matrix( c( 0, 0.000131368637084961, 0.00048297643661499, 0.000737011432647705, 0, 0.000131368637084961, 0.000680804252624512, 0.000909507274627686, 0, 0.000168740749359131, 0.000244021415710449, 0.000422179698944092, 0, 0.000244021415710449, 0.000383198261260986, 0.000549376010894775, 0, 7.09891319274902e-05, 0.000383198261260986, 0.000682294368743896, 0, 0.000168740749359131, 0.000265955924987793, 0.000767052173614502, 0, 0.000422179698944092, 0.00048297643661499, 0.000549376010894775, 0, 7.09891319274902e-05, 0.000611364841461182, 0.000812351703643799, 0, 0.000265955924987793, 0.00078284740447998, 0.000819146633148193, 0, 0.00078284740447998, 0.00160372257232666, 0.00279802083969116 ), byrow = TRUE, ncol = 4, nrow = 10 ) res <- annoy_nn(ui10, k = 4, n_threads = 0, ret_index = TRUE, metric = "cosine") expect_equal(res$idx, cos_index, check.attributes = FALSE) expect_equal(res$dist, cos_dist, check.attributes = FALSE, tol = 1e-6) expect_true(!is.null(res$index)) expect_is(res$index, "Rcpp_AnnoyAngular") uwot/tests/testthat/test_transform.R0000644000176200001440000000641513403320452017451 0ustar liggesuserslibrary(uwot) context("Transform") graph <- V_asymm + diag(1, nrow(V_asymm), ncol(V_asymm)) dV <- as.matrix(graph) vdV <- as.vector(t(dV)) dgraph <- matrix(vdV[vdV > 0], byrow = TRUE, nrow = 10) dgraph <- t(apply(dgraph, 1, function(x) { sort(x, decreasing = TRUE) })) graph <- Matrix::t(graph) train_embedding <- matrix(1:20, nrow = 10) av <- matrix(c( 6.00, 16.00, 4.75, 14.75, 4.00, 14.00, 6.50, 16.50, 5.25, 15.25, 5.00, 15.00, 5.50, 15.50, 6.00, 16.00, 4.50, 14.50, 4.75, 14.75 ), nrow = 10, byrow = TRUE) embedding <- init_new_embedding(train_embedding, nn, graph = 
NULL, weighted = FALSE, n_threads = 0, verbose = FALSE ) expect_equal(embedding, av, check.attributes = FALSE) wav <- matrix(c( 4.774600, 14.77460, 5.153800, 15.15380, 4.120000, 14.12000, 5.485100, 15.48510, 4.573100, 14.57310, 4.000000, 14.00000, 5.138362, 15.13836, 5.184333, 15.18433, 5.191600, 15.19160, 5.166667, 15.16667 ), nrow = 10, byrow = TRUE) embedding <- init_new_embedding(train_embedding, nn, graph = dgraph, weighted = TRUE, n_threads = 0, verbose = FALSE ) expect_equal(embedding, wav, check.attributes = FALSE, tol = 1e-5) # Check threaded code RcppParallel::setThreadOptions(numThreads = 1) embedding <- init_new_embedding(train_embedding, nn, graph = NULL, weighted = FALSE, n_threads = 1, verbose = FALSE ) expect_equal(embedding, av, check.attributes = FALSE) embedding <- init_new_embedding(train_embedding, nn, graph = dgraph, weighted = TRUE, n_threads = 1, verbose = FALSE ) expect_equal(embedding, wav, check.attributes = FALSE, tol = 1e-5) iris10_range <- scale_input(iris10, scale_type = "range", ret_model = TRUE) iris10_rtrans <- apply_scaling(iris10, attr_to_scale_info(iris10_range)) expect_equal(iris10_range, iris10_rtrans, check.attributes = FALSE) iris10_maxabs <- scale_input(iris10, scale_type = "maxabs", ret_model = TRUE) iris10_matrans <- apply_scaling(iris10, attr_to_scale_info(iris10_maxabs)) expect_equal(iris10_maxabs, iris10_matrans, check.attributes = FALSE) iris10_scale <- scale_input(iris10, scale_type = "scale", ret_model = TRUE) iris10_strans <- apply_scaling(iris10, attr_to_scale_info(iris10_scale)) expect_equal(iris10_scale, iris10_strans, check.attributes = FALSE) iris10_zv_col <- iris10 iris10_zv_col[, 3] <- 10 iris10zvc_scale <- scale_input(iris10_zv_col, scale_type = "scale", ret_model = TRUE ) # scale the original iris10 here on purpose to check that full-variance column # is correctly removed iris10_zvstrans <- apply_scaling(iris10, attr_to_scale_info(iris10zvc_scale)) expect_equal(iris10zvc_scale, iris10_zvstrans, 
check.attributes = FALSE) iris10_none <- scale_input(iris10, scale_type = FALSE, ret_model = TRUE) expect_null(attr_to_scale_info(iris10_none)) iris10_colrange <- scale_input(iris10, scale_type = "colrange", ret_model = TRUE) iris10_crtrans <- apply_scaling(iris10, attr_to_scale_info(iris10_colrange)) expect_equal(iris10_colrange, iris10_crtrans, check.attributes = FALSE) # test pca transform works iris10pca <- pca_scores(iris10, ncol = 2, ret_extra = TRUE) iris10pcat <- apply_pca(iris10, iris10pca) expect_equal(iris10pca$scores, iris10pcat, check.attributes = FALSE) uwot/tests/testthat/test_saveload.R0000644000176200001440000000323313571660454017246 0ustar liggesuserslibrary(uwot) context("load/save model") test_that("can save and load simple model", { set.seed(1337) model <- umap(iris10, n_neighbors = 4, n_epochs = 2, init = "spca", metric = "euclidean", verbose = FALSE, n_threads = 0, ret_model = TRUE ) mod_fname <- tempfile(tmpdir = tempdir()) save_uwot(model, file = mod_fname) expect_true(file.exists(mod_fname)) # Can use model after saving set.seed(1337) res_trans <- umap_transform(iris10, model) expect_ok_matrix(res_trans) modelload <- load_uwot(file = mod_fname) set.seed(1337) resload_trans <- umap_transform(iris10, modelload) expect_ok_matrix(resload_trans) expect_equal(resload_trans, res_trans) if (file.exists(mod_fname)) { unlink(mod_fname) } }) test_that("can save and load mixed distance model", { set.seed(1337) jiris10 <- jitter(iris10) metric2 <- list( "euclidean" = c(1, 2), "cosine" = c("Petal.Length", "Petal.Width") ) model <- umap(jiris10, n_neighbors = 4, n_epochs = 2, init = "spca", metric = metric2, verbose = FALSE, n_threads = 0, ret_nn = TRUE, ret_model = TRUE ) mod_fname <- tempfile(tmpdir = tempdir()) save_uwot(model, file = mod_fname) expect_true(file.exists(mod_fname)) # Can use model after saving set.seed(1337) res_trans <- umap_transform(jiris10, model) expect_ok_matrix(res_trans) modelload <- load_uwot(file = mod_fname) set.seed(1337) 
resload_trans <- umap_transform(jiris10, modelload) expect_ok_matrix(resload_trans) expect_equal(resload_trans, res_trans) if (file.exists(mod_fname)) { unlink(mod_fname) } }) uwot/tests/testthat/test_pca.R0000644000176200001440000000277613571660447016230 0ustar liggesuserslibrary(uwot) context("PCA") iris10prcomp <- prcomp(iris10, retx = TRUE, center = TRUE, scale. = FALSE) test_that("PCA initialization", { iris10_pca_scores <- pca_scores(iris10, ncol = 2) suppressWarnings(iris10_irlba_scores <- irlba_scores(iris10, ncol = 2)) expect_equal(abs(iris10prcomp$x[, 1:2]), abs(iris10_pca_scores), check.attributes = FALSE ) expect_equal(abs(iris10prcomp$x[, 1:2]), abs(iris10_irlba_scores), check.attributes = FALSE ) }) test_that("1 component initialization works", { expect_ok_matrix(pca_init(iris10, ndim = 1), nc = 1) }) test_that("PCA returns model data", { iris10_pca_scores <- pca_scores(iris10, ncol = 2, ret_extra = TRUE) expect_equal(abs(iris10prcomp$x[, 1:2]), abs(iris10_pca_scores$scores), check.attributes = FALSE ) expect_equal(abs(iris10prcomp$rotation[, 1:2]), abs(iris10_pca_scores$rotation), check.attributes = FALSE ) expect_equal(abs(iris10prcomp$center), abs(iris10_pca_scores$center), check.attributes = FALSE ) suppressWarnings(iris10_irlba_scores <- irlba_scores(iris10, ncol = 2, ret_extra = TRUE )) expect_equal(abs(iris10prcomp$x[, 1:2]), abs(iris10_irlba_scores$scores), check.attributes = FALSE ) expect_equal(abs(iris10prcomp$rotation[, 1:2]), abs(iris10_irlba_scores$rotation), check.attributes = FALSE ) expect_equal(abs(iris10prcomp$center), abs(iris10_irlba_scores$center), check.attributes = FALSE ) }) uwot/tests/testthat/test_curve.R0000644000176200001440000000040613400360122016547 0ustar liggesuserslibrary(uwot) context("Curve Parameters") expect_equal(as.vector(find_ab_params(spread = 1, min_dist = 0.001)), c(1.929, 0.792), tol = 1e-3 ) expect_equal(as.vector(find_ab_params(spread = 1, min_dist = 0.1)), c(1.577, 0.895), tol = 1e-3 ) 
uwot/tests/testthat/helper_data.R0000644000176200001440000000320113405214034016635 0ustar liggesusers# Small -ve distances are possible dist2 <- function(X) { D2 <- rowSums(X * X) D2 + sweep(X %*% t(X) * -2, 2, t(D2), `+`) } # Squared Euclidean distances, ensuring no small -ve distances can occur safe_dist2 <- function(X) { D2 <- dist2(X) D2[D2 < 0] <- 0 D2 } # convert dataframe to distance matrix x2d <- function(X) { sqrt(safe_dist2(x2m(X))) } # Covert a vector into a 2D matrix for generating Y output c2y <- function(...) { matrix(unlist(list(...)), ncol = 2) } iris10 <- x2m(iris[1:10, ]) iris10_Y <- pca_scores(iris10, ncol = 2) diris10 <- dist(iris10) # Sparse iris10 dist dmiris10 <- as.matrix(diris10) dmiris10z <- dmiris10 dmiris10z[dmiris10z > 0.71] <- 0 dmiris10z <- Matrix::drop0(dmiris10z) # some Y data ycat <- as.factor(c(levels(iris$Species)[rep(1:3, each = 3)], NA)) ycat2 <- as.factor(c(NA, levels(iris$Species)[rep(1:3, times = 3)])) ynum <- (1:10) / 10 ynum2 <- seq(from = 10, to = -10, length.out = 10) / 100 nn <- find_nn(iris10, k = 4, method = "fnn", metric = "euclidean", n_threads = 0, verbose = FALSE ) # Just test that res is a matrix with valid numbers expect_ok_matrix <- function(res, nr = nrow(iris10), nc = 2) { expect_is(res, "matrix") expect_equal(nrow(res), nr) expect_equal(ncol(res), nc) expect_false(any(is.infinite(res))) } expect_is_nn <- function(res, nr = 10, k = 4) { expect_is(res, "list") expect_is_nn_matrix(res$dist, nr, k) expect_is_nn_matrix(res$idx, nr, k) } expect_is_nn_matrix <- function(res, nr = 10, k = 4) { expect_is(res, "matrix") expect_equal(nrow(res), nr) expect_equal(ncol(res), k) } uwot/tests/testthat/test_knn_aff.R0000644000176200001440000000322313400360122017025 0ustar liggesuserslibrary(uwot) context("knn affinity") expected_sparse <- matrix(0, nrow = 10, ncol = 10) for (i in seq_len(nrow(nn$idx))) { for (j in seq_len(ncol(nn$idx))) { expected_sparse[i, nn$idx[i, j]] <- 2 } } expected_sparse <- 
Matrix::drop0(expected_sparse) res <- nn_to_sparse(nn$idx, val = 2) expect_equal(res, expected_sparse) v <- 1 expected_sparse_mv <- matrix(0, nrow = 10, ncol = 10) for (i in seq_len(nrow(nn$idx))) { nnr <- sort(nn$idx[i, ]) for (j in seq_len(ncol(nn$idx))) { expected_sparse_mv[i, nnr[j]] <- v v <- v + 1 } } expect_equal(nn_to_sparse(nn$idx, matrix(1:40, nrow = 10, byrow = TRUE)), Matrix::drop0(expected_sparse_mv), check.attributes = FALSE ) res <- perplexity_similarities(iris10, 4, kernel = "knn", nn = nn) expected_sym_nn_graph <- matrix(0, nrow = 10, ncol = 10) o3 <- 1 / 3 o6 <- 1 / 6 expected_sym_nn_graph[1, c(5, 6, 8, 10)] <- c(o3, o6, o3, o6) expected_sym_nn_graph[2, c(3, 4, 9, 10)] <- c(o3, o6, o6, o3) expected_sym_nn_graph[3, c(2, 4, 7, 9, 10)] <- c(o3, o3, o3, o6, o6) expected_sym_nn_graph[4, c(2, 3, 7, 9, 10)] <- c(o6, o3, o6, o3, o3) expected_sym_nn_graph[5, c(1, 6, 7, 8)] <- c(o3, o6, o6, o3) expected_sym_nn_graph[6, c(1, 5, 8)] <- c(o6, o6, o6) expected_sym_nn_graph[7, c(3, 4, 5, 8)] <- c(o3, o6, o6, o6) expected_sym_nn_graph[8, c(1, 5, 6, 7, 10)] <- c(o3, o3, o6, o6, o6) expected_sym_nn_graph[9, c(2, 3, 4)] <- c(o6, o6, o3) expected_sym_nn_graph[10, c(1, 2, 3, 4, 8)] <- c(o6, o3, o6, o3, o6) expect_equal(sum(res), 10) expect_true(Matrix::isSymmetric(res)) expect_equal(as.matrix(res), expected_sym_nn_graph, check.attributes = FALSE, tol = 1e-7 ) uwot/tests/testthat/test_smooth_knn_dists.R0000644000176200001440000001254613571660457021047 0ustar liggesuserslibrary(uwot) context("Smooth kNN distances") ### C++ tests nn_8 <- find_nn(iris10, k = 8) res <- smooth_knn_distances_parallel(nn_8$dist, nn_8$idx)$matrix expect_equal(as.vector(res), c( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.883551016667945, 0.563402698221087, 0.789087277089996, 0.57607769298836, 0.750303047844192, 1, 0.656969510423194, 0.745156972400171, 0.586089930619138, 0.481555035145846, 0.279104282300107, 0.488196761636568, 0.514561802750023, 0.489639480054329, 
0.330384991495572, 0.626571100714632, 0.367875074735739, 0.39660773625396, 0.438113497020007, 0.481555035145844, 0.238036308009313, 0.321078737378799, 0.423034729464725, 0.419490620521321, 0.275816663602388, 0.120285582819758, 0.297337437716562, 0.247710312149843, 0.377578194428495, 0.445038652274379, 0.229198240647999, 0.217928223724008, 0.13265884755527, 0.419490620521318, 0.257869637066222, 0.110625826611838, 0.260166429776407, 0.231017974667955, 0.364373813389398, 0.220580064268054, 0.212929653970733, 0.217928223724007, 0.0998021268957899, 0.0776802085446247, 0.195560609120723, 0.072176661510608, 0.215176296482231, 0.231017974667954, 0.147146427255277, 0.209014049190051, 0.157184945393181, 0.191460580118967, 0.0408496922133704, 0.0176222685661076, 0.190057981521641, 0.0703455098666948, 0.202477876903057, 0.148483915739209, 0.086695317543654, 0.162252224543109 )) # expect_equal(res$sigma, c( # 0.2567215, 0.22098923, 0.08285332, 0.09981823, # 0.28608322, 0.17873764, 0.15968704, 0.17134094, # 0.25434113, 0.19572449 # )) # expect_equal(res$rho, c( # 0.14142136, 0.17320508, 0.24494897, 0.24494897, # 0.14142136, 0.6164414, 0.26457513, 0.17320508, # 0.3, 0.17320508 # )) nn_4 <- find_nn(iris10, k = 4) res <- smooth_knn_distances_parallel(nn_4$dist, nn_4$idx)$matrix expect_equal(as.vector(res), c( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.838084924053271, 0.538562488894191, 0.672032147261722, 0.54465912156976, 0.719264468544344, 1, 0.646253111343435, 0.689408428587288, 0.574825084073865, 0.499998079054375, 0.161908061792063, 0.461447148366392, 0.327968484367062, 0.455344908480281, 0.280728069552432, 5.12868558931539e-10, 0.35375192447642, 0.310590623941469, 0.425174782792857, 0.499998079054373 )) # nn4 # expect_equal(res$sigma, c( # 0.17993927, 0.20488739, 0.0493803, 0.09060478, # 0.24940491, 0.00390625, 0.15367126, 0.13551712, # 0.24542618, 0.20633698 # )) # expect_equal(res$rho, c( # 0.14142136, 0.17320508, 0.24494897, 0.24494897, # 0.14142136, 
0.6164414, 0.26457513, 0.17320508, 0.3, # 0.17320508 # )) ### Various fuzzy set matrices are defined in helper_fuzzy_sets.R # unsymmetrized fuzzy set res <- nn_to_sparse(nn_4$idx, as.vector(res), self_nbr = TRUE) expect_equal(res, V_asymm, tol = 1e-4) # Fuzzy Set Union expect_equal(fuzzy_set_union(res), V_union, tol = 1e-4) # mix intersection with union expect_equal(fuzzy_set_union(res, set_op_mix_ratio = 0.5), V_mix, tol = 1e-4 ) # intersection expect_equal(fuzzy_set_union(res, set_op_mix_ratio = 0), V_intersect, tol = 1e-4 ) res_cpp_conn1 <- smooth_knn_distances_parallel(nn_4$dist, nn_4$idx, n_iter = 64, local_connectivity = 1.0, bandwidth = 1.0, tol = 1e-5, min_k_dist_scale = 1e-3, parallelize = FALSE, verbose = FALSE )$matrix expect_equal(nn_to_sparse(nn_4$idx, as.vector(res_cpp_conn1), self_nbr = TRUE ), V_asymm, tol = 1e-4) res_cpp_conn1.5 <- smooth_knn_distances_parallel(nn_4$dist, nn_4$idx, n_iter = 64, local_connectivity = 1.5, bandwidth = 1.0, tol = 1e-5, min_k_dist_scale = 1e-3, parallelize = FALSE, verbose = FALSE )$matrix expect_equal(nn_to_sparse(nn_4$idx, as.vector(res_cpp_conn1.5), self_nbr = TRUE ), V_asymm_local, tol = 1e-4) RcppParallel::setThreadOptions(numThreads = 1) res_cpp_conn1 <- smooth_knn_distances_parallel(nn_4$dist, nn_4$idx, n_iter = 64, local_connectivity = 1.0, bandwidth = 1.0, tol = 1e-5, min_k_dist_scale = 1e-3, grain_size = 1, verbose = FALSE )$matrix expect_equal(nn_to_sparse(nn_4$idx, as.vector(res_cpp_conn1), self_nbr = TRUE ), V_asymm, tol = 1e-4) res_cpp_conn1.5 <- smooth_knn_distances_parallel(nn_4$dist, nn_4$idx, n_iter = 64, local_connectivity = 1.5, bandwidth = 1.0, tol = 1e-5, min_k_dist_scale = 1e-3, grain_size = 1, verbose = FALSE )$matrix expect_equal(nn_to_sparse(nn_4$idx, as.vector(res_cpp_conn1.5), self_nbr = TRUE ), V_asymm_local, tol = 1e-4) # Test cross-distances V_asymm_local_cross <- V_asymm_local diag(V_asymm_local_cross) <- 1 V_asymm_local_cross <- cbind( V_asymm_local_cross, matrix(0, nrow = 10, ncol = 2) 
) res_cpp_conn1.5_cross <- smooth_knn_distances_parallel(nn_4$dist, nn_4$idx, n_iter = 64, local_connectivity = 1.5, bandwidth = 1.0, tol = 1e-5, min_k_dist_scale = 1e-3, parallelize = FALSE, verbose = FALSE )$matrix expect_equal(nn_to_sparse( nn_4$idx, as.vector(res_cpp_conn1.5_cross), self_nbr = FALSE, max_nbr_id = 12 ), V_asymm_local_cross, tol = 1e-4 ) res_cpp_conn1.5_cross <- smooth_knn_distances_parallel(nn_4$dist, nn_4$idx, n_iter = 64, local_connectivity = 1.5, bandwidth = 1.0, tol = 1e-5, min_k_dist_scale = 1e-3, verbose = FALSE )$matrix expect_equal(nn_to_sparse( nn_4$idx, as.vector(res_cpp_conn1.5_cross), self_nbr = FALSE, max_nbr_id = 12 ), V_asymm_local_cross, tol = 1e-4 ) uwot/tests/testthat/helper_fuzzy_sets.R0000644000176200001440000001051013400360122020145 0ustar liggesusers# fuzzy set data for iris10 with 4 neighbors # numbers have been compared with python fuzzy_simplicial_set # Asymmetric fuzzy set data V_asymm <- sparseMatrix( i = c( 5, 6, 8, 3, 9, 10, 2, 4, 7, 9, 10, 2, 3, 7, 9, 10, 1, 6, 8, 3, 5, 1, 5, 6, 7, 4, 1, 2, 4, 8 ), j = c( 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 7, 7, 8, 8, 8, 8, 9, 10, 10, 10, 10 ), x = c( 1, 1, 1, 0.328, 0.4252, 1, 0.5386, 1, 1, 0.5748, 0.5, 0.4614, 1, 0.6463, 1, 0.5, 1, 1, 0.6894, 0.672, 0.2807, 0.8381, 0.7193, 5.129e-10, 0.3538, 0.5447, 0.1619, 1, 0.4553, 0.3106 ) ) # Fuzzy Set Union V_union <- sparseMatrix( i = c( 5, 6, 8, 10, 3, 4, 9, 10, 2, 4, 7, 9, 10, 2, 3, 7, 9, 10, 1, 6, 7, 8, 1, 5, 8, 3, 4, 5, 8, 1, 5, 6, 7, 10, 2, 3, 4, 1, 2, 3, 4, 8 ), j = c( 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 10 ), x = c( 1, 1, 1, 0.1619, 0.6899, 0.4614, 0.4252, 1, 0.6899, 1, 1, 0.5748, 0.5, 0.4614, 1, 0.6463, 1, 0.7277, 1, 1, 0.2807, 0.9128, 1, 1, 5.129e-10, 1, 0.6463, 0.2807, 0.3538, 1, 0.9128, 5.129e-10, 0.3538, 0.3106, 0.4252, 0.5748, 1, 0.1619, 1, 0.5, 0.7277, 0.3106 ) ) # mix intersection with union V_mix <- sparseMatrix( 
i = c( 5, 6, 8, 10, 3, 4, 9, 10, 2, 4, 7, 9, 10, 2, 3, 7, 9, 10, 1, 6, 7, 8, 1, 5, 8, 3, 4, 5, 8, 1, 5, 6, 7, 10, 2, 3, 4, 1, 2, 3, 4, 8 ), j = c( 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 10 ), x = c( 1, 0.5, 0.919, 0.08095, 0.4333, 0.2307, 0.2126, 1, 0.4333, 1, 0.836, 0.2874, 0.25, 0.2307, 1, 0.3231, 0.7723, 0.4777, 1, 0.5, 0.1404, 0.7043, 0.5, 0.5, 2.564e-10, 0.836, 0.3231, 0.1404, 0.1769, 0.919, 0.7043, 2.564e-10, 0.1769, 0.1553, 0.2126, 0.2874, 0.7723, 0.08095, 1, 0.25, 0.4777, 0.1553 ) ) # intersection V_intersect <- sparseMatrix( i = c(5, 8, 3, 10, 2, 4, 7, 3, 9, 10, 1, 8, 3, 1, 5, 4, 2, 4), j = c(1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 7, 8, 8, 9, 10, 10), x = c( 1, 0.8381, 0.1766, 1, 0.1766, 1, 0.672, 1, 0.5447, 0.2277, 1, 0.4959, 0.672, 0.8381, 0.4959, 0.5447, 1, 0.2277 ) ) # asymm with local connectivity = 1.5 V_asymm_local <- sparseMatrix( i = c( 5, 6, 8, 3, 9, 10, 2, 4, 7, 9, 10, 2, 3, 7, 9, 10, 1, 6, 8, 3, 5, 1, 5, 6, 7, 4, 1, 2, 4, 8 ), j = c( 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 7, 7, 8, 8, 8, 8, 9, 10, 10, 10, 10 ), x = c( 1, 1, 1, 0.2559, 0.3748, 1, 0.5698, 1, 1, 0.6252, 0.5, 0.4302, 1, 0.7157, 1, 0.5, 1, 1, 0.7622, 0.7441, 0.2084, 0.8925, 0.7916, 5.129e-10, 0.2843, 0.5797, 0.1075, 1, 0.4203, 0.2378 ) ) V_union_local <- sparseMatrix( i = c( 5, 6, 8, 10, 3, 4, 9, 10, 2, 4, 7, 9, 10, 2, 3, 7, 9, 10, 1, 6, 7, 8, 1, 5, 8, 3, 4, 5, 8, 1, 5, 6, 7, 10, 2, 3, 4, 1, 2, 3, 4, 8 ), j = c( 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 10 ), x = c( 1, 1, 1, 0.1075, 0.6799, 0.4302, 0.3748, 1, 0.6799, 1, 1, 0.6252, 0.5, 0.4302, 1, 0.7157, 1, 0.7102, 1, 1, 0.2084, 0.9504, 1, 1, 5.129e-10, 1, 0.7157, 0.2084, 0.2843, 1, 0.9504, 5.129e-10, 0.2843, 0.2378, 0.3748, 0.6252, 1, 0.1075, 1, 0.5, 0.7102, 0.2378 ) ) V_union_bandwidth <- sparseMatrix( i = c( 5, 6, 8, 10, 3, 4, 9, 10, 2, 4, 7, 9, 
10, 2, 3, 7, 9, 10, 1, 6, 7, 8, 1, 5, 8, 3, 4, 5, 8, 1, 5, 6, 7, 10, 2, 3, 4, 1, 2, 3, 4, 8 ), j = c( 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 10 ), x = c( 1, 1, 1, 0.02621, 0.3664, 0.2129, 0.1808, 1, 0.3664, 1, 1, 0.3304, 0.25, 0.2129, 1, 0.4176, 1, 0.4055, 1, 1, 0.07881, 0.7467, 1, 1, 2.63e-19, 1, 0.4176, 0.07881, 0.1251, 1, 0.7467, 2.63e-19, 0.1251, 0.09647, 0.1808, 0.3304, 1, 0.02621, 1, 0.25, 0.4055, 0.09647 ) ) V_intersect_local_bandwidth <- sparseMatrix( i = c(5, 8, 3, 10, 2, 4, 7, 3, 9, 10, 1, 8, 3, 1, 5, 4, 2, 4), j = c(1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 7, 8, 8, 9, 10, 10), x = c( 1, 0.7966, 0.02126, 1, 0.02126, 1, 0.5536, 1, 0.336, 0.04417, 1, 0.364, 0.5536, 0.7966, 0.364, 0.336, 1, 0.04417 ) ) uwot/tests/testthat/test_rand_init.R0000644000176200001440000000065513400360122017400 0ustar liggesuserslibrary(uwot) context("Random initialization") n_vertices <- 10 res_rand <- rand_init(n_vertices, ndim = 2, verbose = FALSE) expect_ok_matrix(rand_init(n_vertices, ndim = 2, verbose = FALSE)) expect_ok_matrix(rand_init_lv(n_vertices, ndim = 2, verbose = FALSE)) expect_ok_matrix(rand_init(n_vertices, ndim = 1, verbose = FALSE), nc = 1) expect_ok_matrix(rand_init_lv(n_vertices, ndim = 1, verbose = FALSE), nc = 1) uwot/tests/testthat/test_mixed_distances.R0000644000176200001440000000610113571660433020605 0ustar liggesuserslibrary(uwot) context("mixed distance calculations") set.seed(1337) res <- umap(iris10, n_neighbors = 4, n_epochs = 2, init = "spca", verbose = FALSE, n_threads = 0 ) expect_ok_matrix(res) set.seed(1337) resmli <- umap(iris10, n_neighbors = 4, n_epochs = 2, init = "spca", metric = list("euclidean" = 1:4), verbose = FALSE, n_threads = 0 ) expect_equal(resmli, res) set.seed(1337) resmls <- umap(iris10, n_neighbors = 4, n_epochs = 2, init = "spca", metric = list("euclidean" = c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width" )), verbose = FALSE, 
n_threads = 0 ) expect_equal(resmls, res) set.seed(1337) jiris10 <- jitter(iris10) metric2 <- list( "euclidean" = c(1, 2), "euclidean" = c("Petal.Length", "Petal.Width") ) reseuc2 <- umap(jiris10, n_neighbors = 4, n_epochs = 2, init = "spca", metric = metric2, verbose = FALSE, n_threads = 0, ret_nn = TRUE, ret_model = TRUE ) expect_ok_matrix(reseuc2$embedding) expect_equal(reseuc2$metric, metric2) expect_is(reseuc2$nn, "list") expect_equal(names(reseuc2$nn), c("euclidean", "euclidean")) expect_is_nn(reseuc2$nn[[1]], 10, 4) expect_is_nn(reseuc2$nn[[2]], 10, 4) expect_ok_matrix(umap_transform(jiris10, reseuc2)) i10factor <- factor(c(rep("foo", 3), rep("bar", 3), rep("baz", 4))) res_y2 <- umap(iris10[, -1], y = cbind(i10factor, iris$Sepal.Length[1:10]), n_neighbors = 4, n_epochs = 2, init = "spca", verbose = FALSE, n_threads = 0 ) expect_ok_matrix(res_y2) nafactor <- as.factor(c(levels(iris$Species)[ c(rep(1, 3), rep(2, 3), rep(3, 3)) ], NA)) iris10c <- cbind(data.frame(iris10), nafactor) rescat <- umap(iris10c, metric = list("euclidean" = 1:4, "categorical" = "nafactor"), n_neighbors = 4, n_epochs = 2, init = "spca", verbose = FALSE, n_threads = 0 ) expect_ok_matrix(rescat) irismixed <- data.frame(iris10, ynum, ynum2, ycat, ycat2) resmixed <- umap(irismixed, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 1, metric = list( "euclidean" = 1:4, "euclidean" = 5, "cosine" = 6, "categorical" = c("ycat", "ycat2") ) ) expect_ok_matrix(resmixed) irismixed <- data.frame(iris10, ynum, ynum2, ycat, ycat2) resmixed <- umap(irismixed, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 1, metric = list( "euclidean" = 1:4, "euclidean" = 5, "cosine" = 6, "categorical" = c("ycat", "ycat2") ), ret_model = TRUE, ret_nn = TRUE ) expect_ok_matrix(resmixed$embedding) expect_ok_matrix(umap_transform(irismixed, resmixed, n_threads = 1)) 
expect_equal(names(resmixed$nn), c("euclidean", "euclidean", "cosine")) # #20: allow matrix column for categorical int_column <- c(1, 2, 3, 4, 4, 4, 2, 1, 2, 1) irisic <- cbind(iris10, int_column) resic <- umap(irisic, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spca", verbose = FALSE, n_threads = 0, metric = list("euclidean" = 1:4, "categorical" = 5) ) expect_ok_matrix(resic) uwot/tests/testthat/test_output.R0000644000176200001440000002743613571660445017023 0ustar liggesuserslibrary(uwot) context("API output") set.seed(1337) # No way to compare with the Python implementation due to differences in # random number implementations as well as floating point comparison # and various architecture differences. So we'll just check that the output # is ok res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "normlaplacian", verbose = FALSE, n_threads = 0 ) expect_ok_matrix(res) # Results are repeatable with n_threads = 0 (or 1) and same seed set.seed(1337) res2 <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "normlaplacian", verbose = FALSE, n_threads = 0 ) expect_equal(res2, res) # Distance matrix input res <- umap(dist(iris10), n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "laplacian", verbose = FALSE, n_threads = 0 ) expect_ok_matrix(res) # t-UMAP and cosine metric res <- tumap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, metric = "cosine", init = "spectral", verbose = FALSE, n_threads = 0 ) expect_ok_matrix(res) # UMAP and cosine metric n_threads = 1 issue #5 res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, metric = "cosine", init = "spectral", verbose = FALSE, n_threads = 1 ) expect_ok_matrix(res) # metric = Manhattan res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, metric = "manhattan", init = "rand", verbose = FALSE, n_threads = 0 ) expect_ok_matrix(res) res <- 
umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, metric = "manhattan", init = "spca", verbose = FALSE, n_threads = 1 ) expect_ok_matrix(res) # init with matrix iris10_pca <- prcomp(iris10, retx = TRUE, center = TRUE, scale. = FALSE )$x[, 1:2] res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = iris10_pca, verbose = FALSE, n_threads = 0 ) expect_ok_matrix(res) # Ensure that internal C++ code doesn't modify user-supplied initialization expect_equal(iris10_pca, prcomp(iris10, retx = TRUE, center = TRUE, scale. = FALSE )$x[, 1:2]) # return nn # reset seed here so we can compare output with next test result set.seed(1337) res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "spca", verbose = FALSE, n_threads = 0, ret_nn = TRUE ) expect_is(res, "list") expect_ok_matrix(res$embedding) expect_is(res$nn, "list") expect_is(res$nn$euclidean, "list") expect_ok_matrix(res$nn$euclidean$idx, nc = 4) expect_ok_matrix(res$nn$euclidean$dist, nc = 4) # Use pre-calculated nn: should be the same as previous result set.seed(1337) res_nn <- umap(iris10, nn_method = res$nn[[1]], n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "spca", verbose = FALSE, n_threads = 0 ) expect_ok_matrix(res_nn) expect_equal(res_nn, res$embedding) # X = NULL is ok if passing nn data and rand init set.seed(1337) res_nnxn <- umap( X = NULL, nn_method = nn, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 0 ) # Passing nn list directly is also ok set.seed(1337) res_nnl <- umap(iris10, nn_method = res$nn, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 0, ret_nn = TRUE ) expect_ok_matrix(res_nnl$embedding) expect_equal(res_nnl$nn[[1]], res$nn[[1]]) expect_equal(names(res_nnl$nn), "precomputed") expect_equal(res_nnxn, res_nnl$embedding) # Use multiple nn data res_nn2 <- umap(iris10, nn_method = list(nn, 
nn), n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "spca", verbose = FALSE, n_threads = 0, ret_nn = TRUE ) expect_ok_matrix(res_nn2$embedding) expect_equal(names(res_nn2$nn), c("precomputed", "precomputed")) # lvish and force use of annoy res <- lvish(iris10, perplexity = 4, n_epochs = 2, learning_rate = 0.5, nn_method = "annoy", init = "lvrand", verbose = FALSE, n_threads = 1 ) expect_ok_matrix(res) # lvish with knn res <- lvish(iris10, kernel = "knn", perplexity = 4, n_epochs = 2, learning_rate = 0.5, init = "lvrand", verbose = FALSE, n_threads = 1 ) expect_ok_matrix(res) # return a model res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, n_threads = 1, ret_model = TRUE ) expect_is(res, "list") expect_ok_matrix(res$embedding) res_test <- umap_transform(iris10, res, n_threads = 1, verbose = FALSE) expect_ok_matrix(res_test) # return nn and a model res <- tumap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "rand", verbose = FALSE, n_threads = 1, ret_model = TRUE, ret_nn = TRUE ) expect_is(res, "list") expect_ok_matrix(res$embedding) expect_is(res$nn, "list") expect_is_nn(res$nn[[1]], k = 4) expect_equal(names(res$nn), "euclidean") res_test <- umap_transform(iris10, res, n_threads = 0, verbose = FALSE) expect_ok_matrix(res_test) # https://github.com/jlmelville/uwot/issues/6 res <- umap(iris10, n_components = 1, n_neighbors = 4, n_epochs = 2, n_threads = 1, verbose = FALSE ) expect_ok_matrix(res, nc = 1) # Supervised set.seed(1337) res_y <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "spca", verbose = FALSE, n_threads = 0, y = 1 / (1:10)^2, target_n_neighbors = 2 ) expect_ok_matrix(res_y) # Repeat using equivalent NN info for y y_nn <- list( idx = matrix(c( 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 9 ), ncol = 2, byrow = TRUE), dist = matrix(c( 0, 0.750000000, 0, 0.138888896, 0, 0.048611112, 0, 
0.022500001, 0, 0.012222221, 0, 0.007369615, 0, 0.004783163, 0, 0.003279321, 0, 0.002345679, 0, 0.002345679 ), ncol = 2, byrow = TRUE) ) set.seed(1337) res_ynn <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "spca", verbose = FALSE, n_threads = 0, y = y_nn ) expect_ok_matrix(res_ynn) # Should be the same result expect_equal(res_ynn, res_y) bin10 <- structure(c( 0L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L ), .Dim = c(10L, 4L)) res <- umap(bin10, n_neighbors = 4, metric = "hamming", verbose = FALSE, n_threads = 1 ) expect_ok_matrix(res) # Multiple metrics set.seed(1337) res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spca", verbose = FALSE, n_threads = 0, metric = list(euclidean = c(1, 2), euclidean = c(3, 4)), ret_model = TRUE ) res_trans <- umap_transform(iris10, model = res, verbose = FALSE, n_threads = 0, n_epochs = 2 ) expect_ok_matrix(res_trans) # PCA dimensionality reduction res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spca", verbose = FALSE, n_threads = 0, pca = 2, ret_model = TRUE ) expect_ok_matrix(res$embedding) expect_is(res$pca_models, "list") expect_equal(length(res$pca_models), 1) expect_ok_matrix(res$pca_models[["1"]]$rotation, nr = 4, nc = 2) expect_equal(res$pca_models[["1"]]$center, c(4.86, 3.31, 1.45, 0.22), check.attributes = FALSE ) # no centering res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spca", verbose = FALSE, n_threads = 0, pca = 2, pca_center = FALSE, ret_model = TRUE ) expect_ok_matrix(res$embedding) expect_is(res$pca_models, "list") expect_equal(length(res$pca_models), 1) expect_ok_matrix(res$pca_models[["1"]]$rotation, nr = 4, nc = 2) expect_null(res$pca_models[["1"]]$center) res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, metric = list("euclidean" = 1:2, 
"euclidean" = 3:4), init = "spca", verbose = FALSE, n_threads = 0, pca = 2 ) expect_ok_matrix(res) # Mixed metrics, PCA and transform set.seed(1337) ib10 <- cbind(iris10, bin10, bin10) res <- umap(ib10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spca", verbose = FALSE, n_threads = 0, metric = list( euclidean = c(1, 2), hamming = 5:12, euclidean = c(3, 4) ), pca = 2, ret_model = TRUE ) expect_ok_matrix(res$embedding) expect_is(res$pca_models, "list") expect_equal(length(res$pca_models), 2) expect_equal(names(res$pca_models), c("1", "3")) expect_ok_matrix(res$pca_models[["1"]]$rotation, nr = 2, nc = 2) expect_equal(res$pca_models[["1"]]$center, c(4.86, 3.31), check.attributes = FALSE ) expect_ok_matrix(res$pca_models[["3"]]$rotation, nr = 2, nc = 2) expect_equal(res$pca_models[["3"]]$center, c(1.45, 0.22), check.attributes = FALSE ) res_trans <- umap_transform(ib10, model = res, verbose = FALSE, n_threads = 0, n_epochs = 2 ) expect_ok_matrix(res_trans) # Override pca command in third block set.seed(1337) res <- umap(ib10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spca", verbose = FALSE, n_threads = 0, metric = list( euclidean = c(1, 2), hamming = 5:8, euclidean = list(c(3, 4), pca = NULL) ), pca = 2, ret_model = TRUE ) expect_ok_matrix(res$embedding) expect_is(res$pca_models, "list") expect_equal(length(res$pca_models), 1) expect_equal(names(res$pca_models), "1") expect_ok_matrix(res$pca_models[["1"]]$rotation, nr = 2, nc = 2) expect_equal(res$pca_models[["1"]]$center, c(4.86, 3.31), check.attributes = FALSE ) res_trans <- umap_transform(ib10, model = res, verbose = FALSE, n_threads = 0, n_epochs = 2 ) expect_ok_matrix(res_trans) # Turn off PCA centering for binary data set.seed(1337) res <- umap(bin10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spca", verbose = FALSE, n_threads = 0, metric = "manhattan", pca = 2, pca_center = FALSE, ret_model = TRUE ) expect_ok_matrix(res$embedding) expect_is(res$pca_models, 
"list") expect_equal(length(res$pca_models), 1) expect_equal(names(res$pca_models), "1") expect_ok_matrix(res$pca_models[["1"]]$rotation, nr = 4, nc = 2) expect_null(res$pca_models[["1"]]$center) res_trans <- umap_transform(bin10, model = res, verbose = FALSE, n_threads = 0, n_epochs = 2 ) expect_ok_matrix(res_trans) # shrunk spectral initialization res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "pca", verbose = FALSE, n_threads = 0, init_sdev = 2 ) expect_ok_matrix(res) res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "laplacian", verbose = FALSE, n_threads = 0, init_sdev = 0.1 ) expect_ok_matrix(res) res <- umap(iris10, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spectral", verbose = FALSE, n_threads = 0, init_sdev = 5 ) expect_ok_matrix(res) # umap transform when test datset size > train dataset size set.seed(1337) res <- umap(iris10[1:4, ], n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, min_dist = 0.001, init = "rand", verbose = FALSE, ret_model = TRUE ) expect_is(res, "list") expect_ok_matrix(res$embedding, nr = 4) res_test <- umap_transform(iris10[5:10, ], res, verbose = FALSE, n_epochs = 10) expect_ok_matrix(res_test, nr = 6) # taus88 prng res <- umap(iris10, pcg_rand = FALSE, n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, init = "spectral", verbose = FALSE, n_threads = 0, init_sdev = 5 ) expect_ok_matrix(res) # https://github.com/jlmelville/uwot/issues/39 res <- umap(iris10, n_neighbors = 4, n_threads = 0.5) expect_ok_matrix(res) res <- umap(iris10, n_neighbors = 4, n_threads = 1.5) expect_ok_matrix(res) res <- umap(iris10, n_neighbors = 4, n_sgd_threads = 0.5) expect_ok_matrix(res) res <- umap(iris10, n_neighbors = 4, n_sgd_threads = 1.5) expect_ok_matrix(res) uwot/tests/testthat.R0000644000176200001440000000007013400360122014361 0ustar liggesuserslibrary(testthat) library(uwot) test_check("uwot") uwot/src/0000755000176200001440000000000013571664300012044 5ustar 
liggesusersuwot/src/supervised.cpp0000644000176200001440000000601713571657123014752 0ustar liggesusers// UWOT -- An R package for dimensionality reduction using UMAP // // Copyright (C) 2018 James Melville // // This file is part of UWOT // // UWOT is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // UWOT is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with UWOT. If not, see . #include using namespace Rcpp; // [[Rcpp::export]] NumericVector fast_intersection_cpp(const IntegerVector rows, const IntegerVector cols, NumericVector values, const IntegerVector target, double unknown_dist = 1.0, double far_dist = 5.0) { double ex_unknown = std::exp(-unknown_dist); double ex_far = std::exp(-far_dist); auto len = values.length(); for (auto nz = 0; nz < len; ++nz) { auto i = rows[nz]; auto j = cols[nz]; if (IntegerVector::is_na(target[i]) || IntegerVector::is_na(target[j])) { values[nz] = values[nz] * ex_unknown; } else if (target[i] != target[j]) { values[nz] = values[nz] * ex_far; } } return values; } // [[Rcpp::export]] NumericVector general_sset_intersection_cpp( const IntegerVector indptr1, const IntegerVector indices1, NumericVector data1, const IntegerVector indptr2, const IntegerVector indices2, NumericVector data2, const IntegerVector result_row, const IntegerVector result_col, NumericVector result_val, double mix_weight = 0.5) { double left_min = std::max(Rcpp::min(data1) / 2.0, 1.0e-8); double right_min = std::max(Rcpp::min(data2) / 2.0, 1.0e-8); for (auto idx = 0; idx < result_row.length(); idx++) { auto i = 
result_col[idx]; auto j = result_row[idx]; auto left_end = indices1.begin() + indptr1[i + 1]; auto left_it = std::lower_bound(indices1.begin() + indptr1[i], left_end, j); double left_val = (left_it != left_end && *left_it == j ? data1[left_it - indices1.begin()] : left_min); auto right_end = indices2.begin() + indptr2[i + 1]; auto right_it = std::lower_bound(indices2.begin() + indptr2[i], right_end, j); double right_val = (right_it != right_end && *right_it == j ? data2[right_it - indices2.begin()] : right_min); if (left_val > left_min || right_val > right_min) { if (mix_weight < 0.5) { result_val[idx] = left_val * std::pow(right_val, (mix_weight / (1.0 - mix_weight))); } else { result_val[idx] = right_val * std::pow(left_val, (((1.0 - mix_weight) / mix_weight))); } } } return result_val; } uwot/src/sampler.cpp0000644000176200001440000000404313571657123014221 0ustar liggesusers// UWOT -- An R package for dimensionality reduction using UMAP // // Copyright (C) 2018 James Melville // // This file is part of UWOT // // UWOT is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // UWOT is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with UWOT. If not, see . 
#include "sampler.h" Sampler::Sampler(const std::vector &epochs_per_sample, const double negative_sample_rate) : epochs_per_sample(epochs_per_sample), epoch_of_next_sample(epochs_per_sample), epochs_per_negative_sample(epochs_per_sample.size()), epoch_of_next_negative_sample(epochs_per_sample.size()) { const std::size_t esz = epochs_per_sample.size(); const double nsr = 1.0 / negative_sample_rate; for (std::size_t i = 0; i < esz; i++) { epochs_per_negative_sample[i] = epochs_per_sample[i] * nsr; epoch_of_next_negative_sample[i] = epochs_per_negative_sample[i]; } } bool Sampler::is_sample_edge(const std::size_t i, const std::size_t n) const { return epoch_of_next_sample[i] <= n; } const std::size_t Sampler::get_num_neg_samples(const std::size_t i, const std::size_t n) const { return static_cast((n - epoch_of_next_negative_sample[i]) / epochs_per_negative_sample[i]); } void Sampler::next_sample(const std::size_t i, const std::size_t num_neg_samples) { epoch_of_next_sample[i] += epochs_per_sample[i]; epoch_of_next_negative_sample[i] += num_neg_samples * epochs_per_negative_sample[i]; } uwot/src/connected_components.cpp0000644000176200001440000000446513571657123016775 0ustar liggesusers// UWOT -- An R package for dimensionality reduction using UMAP // // Copyright (C) 2018 James Melville // // This file is part of UWOT // // UWOT is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // UWOT is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with UWOT. If not, see . 
// Translated from the Python source code of: // scipy.sparse.csgraph.connected_components // Author: Jake Vanderplas -- // License: BSD, (C) 2012 // You may also use the (non-Rcpp) parts of the C++ algorithm under the same // license. #include using namespace Rcpp; // [[Rcpp::export]] List connected_components_undirected(const unsigned long N, const IntegerVector indices1, const IntegerVector indptr1, const IntegerVector indices2, const IntegerVector indptr2) { const int VOID = -1; const int END = -2; std::vector labels(N, VOID); std::vector SS(labels); int label = 0; int SS_head = END; for (unsigned int v = 0; v < N; ++v) { unsigned int vv = v; if (labels[vv] == VOID) { SS_head = vv; SS[vv] = END; while (SS_head != END) { vv = SS_head; SS_head = SS[vv]; labels[vv] = label; for (int jj = indptr1[vv]; jj < indptr1[vv + 1]; ++jj) { int ww = indices1[jj]; if (SS[ww] == VOID) { SS[ww] = SS_head; SS_head = ww; } } for (int jj = indptr2[vv]; jj < indptr2[vv + 1]; ++jj) { int ww = indices2[jj]; if (SS[ww] == VOID) { SS[ww] = SS_head; SS_head = ww; } } } ++label; } } return List::create(_["n_components"] = label, _["labels"] = labels); } uwot/src/gradient.h0000644000176200001440000000511013571657301014012 0ustar liggesusers// UWOT -- An R package for dimensionality reduction using UMAP // // Copyright (C) 2018 James Melville // // This file is part of UWOT // // UWOT is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // UWOT is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with UWOT. If not, see . 
#ifndef UWOT_GRADIENT_H #define UWOT_GRADIENT_H #include // Class templated on the powfun function as suggested by Aaron Lun template class base_umap_gradient { public: base_umap_gradient(const double a, const double b, const double gamma); const double grad_attr(const double dist_squared) const; const double grad_rep(const double dist_squared) const; static const constexpr double clamp_hi = 4.0; static const constexpr double clamp_lo = -4.0; private: const double a; const double b; const double a_b_m2; const double gamma_b_2; }; // UMAP typedef base_umap_gradient umap_gradient; // apUMAP: UMAP with an approximate power calculation double fastPrecisePow(double, double); typedef base_umap_gradient apumap_gradient; // t-UMAP: the UMAP function with a = 1, and b = 1, which results in the Cauchy // distribution as used in t-SNE. This massively simplifies the gradient, // removing the pow calls, resulting in a noticeable speed increase (50% with // MNIST), although the resulting embedding has a larger spread than the // default. Also gamma is absent from this, because I believe it to be // un-necessary in the UMAP cost function. 
class tumap_gradient { public: tumap_gradient(); const double grad_attr(const double dist_squared) const; const double grad_rep(const double dist_squared) const; static const constexpr double clamp_hi = 4.0; static const constexpr double clamp_lo = -4.0; }; class largevis_gradient { public: largevis_gradient(const double gamma); const double grad_attr(const double dist_squared) const; const double grad_rep(const double dist_squared) const; static const constexpr double clamp_hi = 5.0; static const constexpr double clamp_lo = -5.0; private: const double gamma_2; }; #endif // UWOT_GRADIENT_H uwot/src/gradient.cpp0000644000176200001440000000571713571657123014364 0ustar liggesusers// UWOT -- An R package for dimensionality reduction using UMAP // // Copyright (C) 2018 James Melville // // This file is part of UWOT // // UWOT is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // UWOT is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with UWOT. If not, see . 
#include "gradient.h" #include // https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/ // an approximation to pow double fastPrecisePow(double a, double b) { // calculate approximation with fraction of the exponent int e = (int)b; union { double d; int x[2]; } u = {a}; u.x[1] = (int)((b - e) * (u.x[1] - 1072632447) + 1072632447); u.x[0] = 0; // exponentiation by squaring with the exponent's integer part // double r = u.d makes everything much slower, not sure why double r = 1.0; while (e) { if (e & 1) { r *= a; } a *= a; e >>= 1; } return r * u.d; } // UMAP implementation code template base_umap_gradient::base_umap_gradient(const double a, const double b, const double gamma) : a(a), b(b), a_b_m2(-2.0 * a * b), gamma_b_2(2.0 * gamma * b) {} template const double base_umap_gradient::grad_attr(const double dist_squared) const { const double pd2b = powfun(dist_squared, b); return (a_b_m2 * pd2b) / (dist_squared * (a * pd2b + 1.0)); } template const double base_umap_gradient::grad_rep(const double dist_squared) const { return gamma_b_2 / ((0.001 + dist_squared) * (a * powfun(dist_squared, b) + 1.0)); } // UMAP using standard power function template class base_umap_gradient; // apUMAP using approximate power function template class base_umap_gradient; // t-UMAP tumap_gradient::tumap_gradient() {} const double tumap_gradient::grad_attr(const double dist_squared) const { return -2.0 / (dist_squared + 1.0); } const double tumap_gradient::grad_rep(const double dist_squared) const { return 2.0 / ((0.001 + dist_squared) * (dist_squared + 1.0)); } // LargeVis largevis_gradient::largevis_gradient(const double gamma) : gamma_2(gamma * 2.0) {} const double largevis_gradient::grad_attr(const double dist_squared) const { return -2.0 / (dist_squared + 1.0); } const double largevis_gradient::grad_rep(const double dist_squared) const { return gamma_2 / ((0.1 + dist_squared) * (dist_squared + 1.0)); } 
uwot/src/optimize.cpp0000644000176200001440000003507413571657123014426 0ustar liggesusers// UWOT -- An R package for dimensionality reduction using UMAP // // Copyright (C) 2018 James Melville // // This file is part of UWOT // // UWOT is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // UWOT is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with UWOT. If not, see . #include #include // [[Rcpp::depends(RcppProgress)]] #include #include "gradient.h" #include "sampler.h" #include "tauprng.h" // Must come after any include that transitively include dqrng // [[Rcpp::depends(RcppParallel)]] #include // Function to decide whether to move both vertices in an edge // Default empty version does nothing: used in umap_transform when // some of the vertices should be held fixed template void move_other_vertex(std::vector &embedding, const double grad_d, const std::size_t i, const std::size_t nrj) {} // Specialization to move the vertex: used in umap when both // vertices in an edge should be moved template <> void move_other_vertex(std::vector &embedding, const double grad_d, const std::size_t i, const std::size_t nrj) { embedding[nrj + i] -= grad_d; } const double clamp(const double v, const double lo, const double hi) { const double t = v < lo ? lo : v; return t > hi ? 
hi : t; } // Gradient: the type of gradient used in the optimization // DoMoveVertex: true if both ends of a positive edge should be updated template struct SgdWorker : public RcppParallel::Worker { int n; // epoch counter double alpha; const Gradient gradient; const std::vector positive_head; const std::vector positive_tail; Sampler sampler; std::vector &head_embedding; std::vector &tail_embedding; const std::size_t ndim; const std::size_t head_nvert; const std::size_t tail_nvert; tthread::mutex mutex; const double dist_eps; RngFactory rng_factory; SgdWorker(const Gradient &gradient, const std::vector &positive_head, const std::vector &positive_tail, Sampler &sampler, std::vector &head_embedding, std::vector &tail_embedding, const std::size_t ndim) : n(0), alpha(0.0), gradient(gradient), positive_head(positive_head), positive_tail(positive_tail), sampler(sampler), head_embedding(head_embedding), tail_embedding(tail_embedding), ndim(ndim), head_nvert(head_embedding.size() / ndim), tail_nvert(tail_embedding.size() / ndim), dist_eps(std::numeric_limits::epsilon()), rng_factory() {} void operator()(std::size_t begin, std::size_t end) { // std::unique_ptr prng(nullptr); // Each window gets its own PRNG state, to prevent locking inside the loop. 
auto prng = rng_factory.create(end); // // { // tthread::lock_guard guard(mutex); // prng.reset(new Rand()); // } std::vector dys(ndim); for (std::size_t i = begin; i < end; i++) { if (!sampler.is_sample_edge(i, n)) { continue; } const std::size_t dj = ndim * positive_head[i]; const std::size_t dk = ndim * positive_tail[i]; double dist_squared = 0.0; for (std::size_t d = 0; d < ndim; d++) { const double diff = head_embedding[dj + d] - tail_embedding[dk + d]; dys[d] = diff; dist_squared += diff * diff; } dist_squared = (std::max)(dist_eps, dist_squared); const double grad_coeff = gradient.grad_attr(dist_squared); for (std::size_t d = 0; d < ndim; d++) { const double grad_d = alpha * clamp(grad_coeff * dys[d], Gradient::clamp_lo, Gradient::clamp_hi); head_embedding[dj + d] += grad_d; move_other_vertex(tail_embedding, grad_d, d, dk); } const std::size_t n_neg_samples = sampler.get_num_neg_samples(i, n); for (std::size_t p = 0; p < n_neg_samples; p++) { const std::size_t dkn = prng(tail_nvert) * ndim; if (dj == dkn) { continue; } double dist_squared = 0.0; for (std::size_t d = 0; d < ndim; d++) { const double diff = head_embedding[dj + d] - tail_embedding[dkn + d]; dys[d] = diff; dist_squared += diff * diff; } dist_squared = (std::max)(dist_eps, dist_squared); const double grad_coeff = gradient.grad_rep(dist_squared); for (std::size_t d = 0; d < ndim; d++) { const double grad_d = alpha * clamp(grad_coeff * dys[d], Gradient::clamp_lo, Gradient::clamp_hi); head_embedding[dj + d] += grad_d; } } sampler.next_sample(i, n_neg_samples); } } void set_n(int n) { this->n = n; } void set_alpha(double alpha) { this->alpha = alpha; } void reseed() { this->rng_factory.reseed(); } }; template std::vector optimize_layout( const T &gradient, std::vector &head_embedding, std::vector &tail_embedding, const std::vector &positive_head, const std::vector &positive_tail, unsigned int n_epochs, unsigned int n_vertices, const std::vector &epochs_per_sample, double initial_alpha, double 
negative_sample_rate, bool parallelize = true, std::size_t grain_size = 1, bool verbose = false) { Sampler sampler(epochs_per_sample, negative_sample_rate); SgdWorker worker( gradient, positive_head, positive_tail, sampler, head_embedding, tail_embedding, head_embedding.size() / n_vertices); Progress progress(n_epochs, verbose); const auto n_epochs_per_sample = epochs_per_sample.size(); double alpha = initial_alpha; for (auto n = 0U; n < n_epochs; n++) { worker.set_alpha(alpha); worker.set_n(n); worker.reseed(); if (parallelize) { RcppParallel::parallelFor(0, n_epochs_per_sample, worker, grain_size); } else { worker(0, n_epochs_per_sample); } alpha = initial_alpha * (1.0 - (double(n) / double(n_epochs))); if (Progress::check_abort()) { progress.cleanup(); return head_embedding; } if (verbose) { progress.increment(); } } return head_embedding; } // [[Rcpp::export]] Rcpp::NumericMatrix optimize_layout_umap( Rcpp::NumericMatrix head_embedding, Rcpp::Nullable tail_embedding, const std::vector positive_head, const std::vector positive_tail, unsigned int n_epochs, unsigned int n_vertices, const std::vector epochs_per_sample, double a, double b, double gamma, double initial_alpha, double negative_sample_rate, bool approx_pow, bool pcg_rand = true, bool parallelize = true, std::size_t grain_size = 1, bool move_other = true, bool verbose = false) { // For normal UMAP, tail_embedding is NULL and we want to pass // a shallow copy of head_embedding as tail_embedding. // When updating new values, tail_embedding is the new coordinate to optimize // and gets passed as normal. 
auto head_vec = Rcpp::as>(head_embedding); std::vector *tail_vec_ptr = nullptr; bool delete_tail_ptr = false; if (tail_embedding.isNull()) { tail_vec_ptr = &head_vec; } else { tail_vec_ptr = new std::vector(Rcpp::as>(tail_embedding)); delete_tail_ptr = true; } std::vector result; if (approx_pow) { const apumap_gradient gradient(a, b, gamma); if (move_other) { if (pcg_rand) { result = optimize_layout( gradient, head_vec, *tail_vec_ptr, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, initial_alpha, negative_sample_rate, parallelize, grain_size, verbose); } else { result = optimize_layout( gradient, head_vec, *tail_vec_ptr, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, initial_alpha, negative_sample_rate, parallelize, grain_size, verbose); } } else { if (pcg_rand) { result = optimize_layout( gradient, head_vec, *tail_vec_ptr, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, initial_alpha, negative_sample_rate, parallelize, grain_size, verbose); } else { result = optimize_layout( gradient, head_vec, *tail_vec_ptr, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, initial_alpha, negative_sample_rate, parallelize, grain_size, verbose); } } } else { const umap_gradient gradient(a, b, gamma); if (move_other) { if (pcg_rand) { result = optimize_layout( gradient, head_vec, *tail_vec_ptr, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, initial_alpha, negative_sample_rate, parallelize, grain_size, verbose); } else { result = optimize_layout( gradient, head_vec, *tail_vec_ptr, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, initial_alpha, negative_sample_rate, parallelize, grain_size, verbose); } } else { if (pcg_rand) { result = optimize_layout( gradient, head_vec, *tail_vec_ptr, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, initial_alpha, negative_sample_rate, parallelize, grain_size, verbose); } else { result = 
optimize_layout( gradient, head_vec, *tail_vec_ptr, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, initial_alpha, negative_sample_rate, parallelize, grain_size, verbose); } } } if (delete_tail_ptr) { delete (tail_vec_ptr); } return Rcpp::NumericMatrix(head_embedding.nrow(), head_embedding.ncol(), result.begin()); } // [[Rcpp::export]] Rcpp::NumericMatrix optimize_layout_tumap( Rcpp::NumericMatrix head_embedding, Rcpp::Nullable tail_embedding, const std::vector positive_head, const std::vector positive_tail, unsigned int n_epochs, unsigned int n_vertices, const std::vector epochs_per_sample, double initial_alpha, double negative_sample_rate, bool pcg_rand = true, bool parallelize = true, std::size_t grain_size = 1, bool move_other = true, bool verbose = false) { const tumap_gradient gradient; auto head_vec = Rcpp::as>(head_embedding); std::vector *tail_vec_ptr = nullptr; bool delete_tail_ptr = false; if (tail_embedding.isNull()) { tail_vec_ptr = &head_vec; } else { tail_vec_ptr = new std::vector(Rcpp::as>(tail_embedding)); delete_tail_ptr = true; } std::vector result; if (move_other) { if (pcg_rand) { result = optimize_layout( gradient, head_vec, *tail_vec_ptr, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, initial_alpha, negative_sample_rate, parallelize, grain_size, verbose); } else { result = optimize_layout( gradient, head_vec, *tail_vec_ptr, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, initial_alpha, negative_sample_rate, parallelize, grain_size, verbose); } } else { if (pcg_rand) { result = optimize_layout( gradient, head_vec, *tail_vec_ptr, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, initial_alpha, negative_sample_rate, parallelize, grain_size, verbose); } else { result = optimize_layout( gradient, head_vec, *tail_vec_ptr, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, initial_alpha, negative_sample_rate, parallelize, grain_size, 
verbose); } } if (delete_tail_ptr) { delete (tail_vec_ptr); } return Rcpp::NumericMatrix(head_embedding.nrow(), head_embedding.ncol(), result.begin()); } // [[Rcpp::export]] Rcpp::NumericMatrix optimize_layout_largevis( Rcpp::NumericMatrix head_embedding, const std::vector positive_head, const std::vector positive_tail, unsigned int n_epochs, unsigned int n_vertices, const std::vector epochs_per_sample, double gamma, double initial_alpha, double negative_sample_rate, bool pcg_rand = true, bool parallelize = true, std::size_t grain_size = 1, bool verbose = false) { // We don't support adding extra points for LargeVis, so this is much simpler // than the UMAP case const largevis_gradient gradient(gamma); auto head_vec = Rcpp::as>(head_embedding); std::vector result; if (pcg_rand) { result = optimize_layout( gradient, head_vec, head_vec, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, initial_alpha, negative_sample_rate, parallelize, grain_size, verbose); } else { result = optimize_layout( gradient, head_vec, head_vec, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, initial_alpha, negative_sample_rate, parallelize, grain_size, verbose); } return Rcpp::NumericMatrix(head_embedding.nrow(), head_embedding.ncol(), result.begin()); } uwot/src/perplexity.cpp0000644000176200001440000001252413571657123014766 0ustar liggesusers// UWOT -- An R package for dimensionality reduction using UMAP // // Copyright (C) 2018 James Melville // // This file is part of UWOT // // UWOT is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // UWOT is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with UWOT. If not, see . #include #include #include // [[Rcpp::depends(RcppParallel)]] #include struct PerplexityWorker : public RcppParallel::Worker { RcppParallel::RMatrix res; const RcppParallel::RMatrix nn_dist; const RcppParallel::RMatrix nn_idx; const unsigned int n_vertices; const unsigned int n_neighbors; const double target; const unsigned int n_iter; const double tol; const double double_max = (std::numeric_limits::max)(); tthread::mutex mutex; std::size_t n_search_fails; PerplexityWorker(Rcpp::NumericMatrix res, const Rcpp::NumericMatrix nn_dist, const Rcpp::IntegerMatrix nn_idx, const double perplexity, const unsigned int n_iter, const double tol) : res(res), nn_dist(nn_dist), nn_idx(nn_idx), n_vertices(nn_dist.nrow()), n_neighbors(nn_dist.ncol()), target(std::log(perplexity)), n_iter(n_iter), tol(tol), n_search_fails(0) {} void operator()(std::size_t begin, std::size_t end) { // number of binary search failures in this window std::size_t n_window_search_fails = 0; std::vector d2(n_neighbors - 1, 0.0); for (std::size_t i = begin; i < end; i++) { double beta = 1.0; double lo = 0.0; double hi = double_max; // best value seen is used only if binary search fails // (usually only happens if there are multiple degenerate distances) double beta_best = beta; double adiff_min = double_max; bool converged = false; // calculate squared distances and remember the minimum double dmin = double_max; double dtmp; for (unsigned int k = 1; k < n_neighbors; k++) { dtmp = nn_dist(i, k) * nn_dist(i, k); d2[k - 1] = dtmp; if (dtmp < dmin) { dmin = dtmp; } } // shift distances by minimum: this implements the log-sum-exp trick // D2, W and Z are their shifted versions // but P (and hence Shannon entropy) is unchanged for (unsigned int k = 1; k < n_neighbors; k++) { d2[k - 1] -= dmin; } for (unsigned int iter = 0; iter < n_iter; iter++) { 
double Z = 0.0; double H = 0.0; double sum_D2_W = 0.0; for (unsigned int k = 0; k < n_neighbors - 1; k++) { double W = std::exp(-d2[k] * beta); Z += W; sum_D2_W += d2[k] * W; } if (Z > 0) { H = std::log(Z) + beta * sum_D2_W / Z; } const double adiff = std::abs(H - target); if (adiff < tol) { converged = true; break; } // store best beta in case binary search fails if (adiff < adiff_min) { adiff_min = adiff; beta_best = beta; } if (H < target) { hi = beta; beta = 0.5 * (lo + hi); } else { lo = beta; if (hi == double_max) { beta *= 2.0; } else { beta = 0.5 * (lo + hi); } } } if (!converged) { ++n_window_search_fails; beta = beta_best; } double Z = 0.0; for (unsigned int k = 0; k < n_neighbors - 1; k++) { double W = std::exp(-d2[k] * beta); Z += W; // no longer need d2 at this point, store final W there d2[k] = W; } // This will index over d2, skipping when i == j std::size_t widx = 0; for (unsigned int k = 0; k < n_neighbors; k++) { unsigned int j = nn_idx(i, k) - 1; if (i != j) { res(i, k) = d2[widx] / Z; ++widx; } else { res(i, k) = 0.0; } } } // Update global count of failures { tthread::lock_guard guard(mutex); n_search_fails += n_window_search_fails; } } }; // [[Rcpp::export]] Rcpp::List calc_row_probabilities_parallel( const Rcpp::NumericMatrix nn_dist, const Rcpp::IntegerMatrix nn_idx, const double perplexity, const unsigned int n_iter = 200, const double tol = 1e-5, const bool parallelize = true, const std::size_t grain_size = 1, const bool verbose = false) { Rcpp::NumericMatrix res = Rcpp::NumericMatrix(nn_dist.nrow(), nn_dist.ncol()); const unsigned int n_vertices = nn_dist.nrow(); PerplexityWorker worker(res, nn_dist, nn_idx, perplexity, n_iter, tol); if (parallelize) { RcppParallel::parallelFor(0, n_vertices, worker, grain_size); } else { worker(0, n_vertices); } return Rcpp::List::create(Rcpp::Named("matrix") = res, Rcpp::Named("n_failures") = worker.n_search_fails); } uwot/src/Makevars0000644000176200001440000000037313420522370013534 0ustar liggesusers# 
Turn on C++11 support to get access to long long (guaranteed 64-bit ints) CXX_STD = CXX11 PKG_CXXFLAGS += -DRCPP_PARALLEL_USE_TBB=1 -DSTRICT_R_HEADERS # RcppParallel PKG_LIBS += $(shell ${R_HOME}/bin/Rscript -e "RcppParallel::RcppParallelLibs()") uwot/src/tauprng.h0000644000176200001440000000611413571657123013704 0ustar liggesusers// UWOT -- An R package for dimensionality reduction using UMAP // // Copyright (C) 2018 James Melville // // This file is part of UWOT // // UWOT is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // UWOT is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with UWOT. If not, see . // Three-component combined Tausworthe "taus88" PRNG from L'Ecuyer 1996. #ifndef UWOT_TAUPRNG_H #define UWOT_TAUPRNG_H #include "Rcpp.h" #include // linked from dqrng #include "convert_seed.h" #include "pcg_random.hpp" // NOT THREAD SAFE // based on code in the dqsample package static uint64_t random64() { return R::runif(0, 1) * (std::numeric_limits::max)(); } // NOT THREAD SAFE static uint32_t random32() { return R::runif(0, 1) * (std::numeric_limits::max)(); } struct tau_prng { uint64_t state0; uint64_t state1; // technically this needs to always be > 7 uint64_t state2; // and this should be > 15 static constexpr uint64_t MAGIC0 = static_cast(4294967294); static constexpr uint64_t MAGIC1 = static_cast(4294967288); static constexpr uint64_t MAGIC2 = static_cast(4294967280); tau_prng(uint64_t state0, uint64_t state1, uint64_t state2) : state0(state0), state1(state1 > 7 ? state1 : 8), state2(state2 > 15 ? 
state2 : 16) {} int32_t operator()() { state0 = (((state0 & MAGIC0) << 12) & 0xffffffff) ^ ((((state0 << 13) & 0xffffffff) ^ state0) >> 19); state1 = (((state1 & MAGIC1) << 4) & 0xffffffff) ^ ((((state1 << 2) & 0xffffffff) ^ state1) >> 25); state2 = (((state2 & MAGIC2) << 17) & 0xffffffff) ^ ((((state2 << 3) & 0xffffffff) ^ state2) >> 11); return state0 ^ state1 ^ state2; } // return a value in (0, n] std::size_t operator()(const std::size_t n) { std::size_t result = (*this)() % n; return result; } }; struct tau_factory { uint64_t seed1; uint64_t seed2; tau_factory() : seed1(random64()), seed2(random64()) {} void reseed() { seed1 = random64(); seed2 = random64(); } tau_prng create(uint64_t seed) { return tau_prng(seed1, seed2, seed); } }; struct pcg_prng { pcg32 gen; pcg_prng(uint64_t seed) { gen.seed(seed); } // return a value in (0, n] std::size_t operator()(const std::size_t n) { std::size_t result = gen(n); return result; } }; struct pcg_factory { uint32_t seed1; pcg_factory() : seed1(random32()) {} void reseed() { seed1 = random32(); } pcg_prng create(uint32_t seed) { uint32_t seeds[2] = {seed1, seed}; return pcg_prng(dqrng::convert_seed(seeds, 2)); } }; #endif // UWOT_TAUPRNG_H uwot/src/sampler.h0000644000176200001440000000273013571657123013667 0ustar liggesusers// UWOT -- An R package for dimensionality reduction using UMAP // // Copyright (C) 2018 James Melville // // This file is part of UWOT // // UWOT is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // UWOT is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. 
// // You should have received a copy of the GNU General Public License // along with UWOT. If not, see . #ifndef UWOT_SAMPLER_H #define UWOT_SAMPLER_H #include // Weighted edge sampler class Sampler { public: Sampler(const std::vector &epochs_per_sample, const double negative_sample_rate); bool is_sample_edge(const std::size_t i, const std::size_t n) const; const std::size_t get_num_neg_samples(const std::size_t i, const std::size_t n) const; void next_sample(const std::size_t i, const std::size_t num_neg_samples); private: std::vector epochs_per_sample; std::vector epoch_of_next_sample; std::vector epochs_per_negative_sample; std::vector epoch_of_next_negative_sample; }; #endif // UWOT_SAMPLER_H uwot/src/Makevars.win0000644000176200001440000000043513420522262014327 0ustar liggesusers# Turn on C++11 support to get access to long long (guaranteed 64-bit ints) CXX_STD = CXX11 # RcppParallel PKG_CXXFLAGS += -DRCPP_PARALLEL_USE_TBB=1 -DSTRICT_R_HEADERS PKG_LIBS += $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" \ -e "RcppParallel::RcppParallelLibs()") uwot/src/smooth_knn.cpp0000644000176200001440000001542313571657123014741 0ustar liggesusers// UWOT -- An R package for dimensionality reduction using UMAP // // Copyright (C) 2018 James Melville // // This file is part of UWOT // // UWOT is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // UWOT is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with UWOT. If not, see . 
#include #include #include #include #include // [[Rcpp::depends(RcppParallel)]] #include struct SmoothKnnWorker : public RcppParallel::Worker { const RcppParallel::RMatrix nn_dist; const RcppParallel::RMatrix nn_idx; RcppParallel::RMatrix nn_weights; const unsigned int n_vertices; const unsigned int n_neighbors; const double target; const unsigned int n_iter; const double local_connectivity; const double bandwidth; const double tol; const double min_k_dist_scale; const double mean_distances; const double double_max = (std::numeric_limits::max)(); tthread::mutex mutex; std::size_t n_search_fails; SmoothKnnWorker(const Rcpp::NumericMatrix &nn_dist, const Rcpp::IntegerMatrix &nn_idx, Rcpp::NumericMatrix nn_weights, const unsigned int n_iter, const double local_connectivity, const double bandwidth, const double tol, const double min_k_dist_scale) : nn_dist(nn_dist), nn_idx(nn_idx), nn_weights(nn_weights), n_vertices(nn_dist.nrow()), n_neighbors(nn_dist.ncol()), target(std::log2(n_neighbors)), n_iter(n_iter), local_connectivity(local_connectivity), bandwidth(bandwidth), tol(tol), min_k_dist_scale(min_k_dist_scale), mean_distances(mean(nn_dist)), n_search_fails(0) {} void operator()(std::size_t begin, std::size_t end) { // number of binary search failures in this window std::size_t n_window_search_fails = 0; std::vector non_zero_distances; non_zero_distances.reserve(n_neighbors); for (std::size_t i = begin; i < end; i++) { double sigma = 1.0; non_zero_distances.clear(); double lo = 0.0; double hi = double_max; // best value seen is used only if binary search fails // NB there is already a safeguard against sigma getting too large // so this is less of a problem than with the perplexity search double sigma_best = sigma; double adiff_min = double_max; auto ith_distances = nn_dist.row(i); for (std::size_t k = 0; k < ith_distances.length(); k++) { if (ith_distances[k] > 0.0) { non_zero_distances.push_back(ith_distances[k]); } } // Find rho, the distance to the nearest 
neighbor (excluding zero distance // neighbors) double rho = 0.0; if (non_zero_distances.size() >= local_connectivity) { int index = static_cast(std::floor(local_connectivity)); double interpolation = local_connectivity - index; if (index > 0) { rho = non_zero_distances[index - 1]; if (interpolation >= tol) { rho += interpolation * (non_zero_distances[index] - non_zero_distances[index - 1]); } } else if (non_zero_distances.size() > 0) { rho = interpolation * non_zero_distances[0]; } } else if (non_zero_distances.size() > 0) { rho = *std::max_element(non_zero_distances.begin(), non_zero_distances.end()); } bool converged = false; for (unsigned int iter = 0; iter < n_iter; iter++) { double val = 0.0; // NB we iterate from 1, not 0: don't use the self-distance. // Makes using Rcpp sugar sufficiently awkward so do the explicit loop for (unsigned int k = 1; k < n_neighbors; k++) { double dist = (std::max)(0.0, ith_distances[k] - rho); val += std::exp(-dist / sigma); } const double adiff = std::abs(val - target); if (adiff < tol) { converged = true; break; } // store best sigma in case binary search fails (usually in the presence // of multiple degenerate distances) if (adiff < adiff_min) { adiff_min = adiff; sigma_best = sigma; } if (val > target) { hi = sigma; sigma = 0.5 * (lo + hi); } else { lo = sigma; if (hi == double_max) { sigma *= 2; } else { sigma = 0.5 * (lo + hi); } } } if (!converged) { ++n_window_search_fails; sigma = sigma_best; } if (rho > 0.0) { double mean = std::accumulate(ith_distances.begin(), ith_distances.end(), 0.0) / ith_distances.length(); sigma = (std::max)(min_k_dist_scale * mean, sigma); } else { sigma = (std::max)(min_k_dist_scale * mean_distances, sigma); } std::vector res(n_neighbors, 0.0); for (std::size_t k = 0; k < n_neighbors; k++) { double rk = ith_distances[k] - rho; if (rk <= 0) { res[k] = 1.0; } else { res[k] = std::exp(-rk / (sigma * bandwidth)); } } for (unsigned int k = 0; k < n_neighbors; k++) { nn_weights(i, k) = res[k]; } } 
// Update global count of failures { tthread::lock_guard guard(mutex); n_search_fails += n_window_search_fails; } } }; // [[Rcpp::export]] Rcpp::List smooth_knn_distances_parallel( const Rcpp::NumericMatrix &nn_dist, const Rcpp::IntegerMatrix &nn_idx, const unsigned int n_iter = 64, const double local_connectivity = 1.0, const double bandwidth = 1.0, const double tol = 1e-5, const double min_k_dist_scale = 1e-3, const bool parallelize = true, const std::size_t grain_size = 1, const bool verbose = false) { const unsigned int n_vertices = nn_dist.nrow(); Rcpp::NumericMatrix nn_weights(n_vertices, nn_idx.ncol()); SmoothKnnWorker worker(nn_dist, nn_idx, nn_weights, n_iter, local_connectivity, bandwidth, tol, min_k_dist_scale); if (parallelize) { RcppParallel::parallelFor(0, n_vertices, worker, grain_size); } else { worker(0, n_vertices); } return Rcpp::List::create(Rcpp::Named("matrix") = nn_weights, Rcpp::Named("n_failures") = worker.n_search_fails); } uwot/src/transform.cpp0000644000176200001440000001142713571657123014575 0ustar liggesusers// UWOT -- An R package for dimensionality reduction using UMAP // // Copyright (C) 2018 James Melville // // This file is part of UWOT // // UWOT is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // UWOT is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with UWOT. If not, see . 
#include #include #include // [[Rcpp::depends(RcppParallel)]] struct AverageWorker : public RcppParallel::Worker { const RcppParallel::RMatrix train_embedding; const RcppParallel::RMatrix nn_index; RcppParallel::RMatrix embedding; const std::size_t nc; const std::size_t nnbrs; const double one_over_n; AverageWorker(Rcpp::NumericMatrix train_embedding, Rcpp::IntegerMatrix nn_index, Rcpp::NumericMatrix embedding) : train_embedding(train_embedding), nn_index(nn_index), embedding(embedding), nc(train_embedding.ncol()), nnbrs(nn_index.ncol()), one_over_n(1.0 / nnbrs) {} void operator()(std::size_t begin, std::size_t end) { std::vector sumc(nc); for (std::size_t i = begin; i < end; i++) { std::fill(sumc.begin(), sumc.end(), 0.0); for (std::size_t j = 0; j < nnbrs; j++) { auto nbr = nn_index(i, j) - 1; for (std::size_t k = 0; k < nc; k++) { sumc[k] += train_embedding(nbr, k); } } for (std::size_t k = 0; k < nc; k++) { embedding(i, k) = sumc[k] * one_over_n; } } } }; // [[Rcpp::export]] Rcpp::NumericMatrix init_transform_av_parallel( Rcpp::NumericMatrix train_embedding, Rcpp::IntegerMatrix nn_index, bool parallelize = true, const std::size_t grain_size = 1) { Rcpp::NumericMatrix embedding(nn_index.nrow(), train_embedding.ncol()); AverageWorker worker(train_embedding, nn_index, embedding); if (parallelize) { RcppParallel::parallelFor(0, nn_index.nrow(), worker, grain_size); } else { worker(0, nn_index.nrow()); } return embedding; } struct WeightedAverageWorker : public RcppParallel::Worker { const RcppParallel::RMatrix train_embedding; const RcppParallel::RMatrix nn_index; const RcppParallel::RMatrix nn_weights; RcppParallel::RMatrix embedding; const std::size_t nc; const std::size_t nnbrs; WeightedAverageWorker(Rcpp::NumericMatrix train_embedding, Rcpp::IntegerMatrix nn_index, const Rcpp::NumericMatrix &nn_weights, Rcpp::NumericMatrix embedding) : train_embedding(train_embedding), nn_index(nn_index), nn_weights(nn_weights), embedding(embedding), nc(train_embedding.ncol()), 
nnbrs(nn_index.ncol()) {} void operator()(std::size_t begin, std::size_t end) { std::vector sumc(nc); for (std::size_t i = begin; i < end; i++) { std::fill(sumc.begin(), sumc.end(), 0.0); double sumw = 0.0; for (std::size_t j = 0; j < nnbrs; j++) { auto nbr = nn_index(i, j) - 1; double w = nn_weights(i, j); sumw += w; for (std::size_t k = 0; k < nc; k++) { sumc[k] += train_embedding(nbr, k) * w; } } for (std::size_t k = 0; k < nc; k++) { embedding(i, k) = sumc[k] / sumw; } } } }; // Initialize embedding as a weighted average of nearest neighbors of each point // train_embedding: n_train x dim matrix of final embedding coordinates // nn_index: n_test x n_nbrs matrix of indexes of neighbors in X_train that are // nearest neighbors of X_test // weights: n_test x n_nbrs weight matrix // Returns the n_test x dim matrix of initialized coordinates. // [[Rcpp::export]] Rcpp::NumericMatrix init_transform_parallel(Rcpp::NumericMatrix train_embedding, Rcpp::IntegerMatrix nn_index, Rcpp::NumericMatrix nn_weights, const std::size_t grain_size = 1, bool parallelize = true) { Rcpp::NumericMatrix embedding(nn_index.nrow(), train_embedding.ncol()); WeightedAverageWorker worker(train_embedding, nn_index, nn_weights, embedding); if (parallelize) { RcppParallel::parallelFor(0, nn_index.nrow(), worker, grain_size); } else { worker(0, nn_index.nrow()); } return embedding; } uwot/src/RcppExports.cpp0000644000176200001440000005403613571657304015057 0ustar liggesusers// Generated by using Rcpp::compileAttributes() -> do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #include using namespace Rcpp; // connected_components_undirected List connected_components_undirected(const unsigned long N, const IntegerVector indices1, const IntegerVector indptr1, const IntegerVector indices2, const IntegerVector indptr2); RcppExport SEXP _uwot_connected_components_undirected(SEXP NSEXP, SEXP indices1SEXP, SEXP indptr1SEXP, SEXP indices2SEXP, SEXP indptr2SEXP) { BEGIN_RCPP 
Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const unsigned long >::type N(NSEXP); Rcpp::traits::input_parameter< const IntegerVector >::type indices1(indices1SEXP); Rcpp::traits::input_parameter< const IntegerVector >::type indptr1(indptr1SEXP); Rcpp::traits::input_parameter< const IntegerVector >::type indices2(indices2SEXP); Rcpp::traits::input_parameter< const IntegerVector >::type indptr2(indptr2SEXP); rcpp_result_gen = Rcpp::wrap(connected_components_undirected(N, indices1, indptr1, indices2, indptr2)); return rcpp_result_gen; END_RCPP } // annoy_euclidean_nns Rcpp::List annoy_euclidean_nns(const std::string& index_name, const Rcpp::NumericMatrix& mat, std::size_t n_neighbors, std::size_t search_k, std::size_t grain_size, bool verbose); RcppExport SEXP _uwot_annoy_euclidean_nns(SEXP index_nameSEXP, SEXP matSEXP, SEXP n_neighborsSEXP, SEXP search_kSEXP, SEXP grain_sizeSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const std::string& >::type index_name(index_nameSEXP); Rcpp::traits::input_parameter< const Rcpp::NumericMatrix& >::type mat(matSEXP); Rcpp::traits::input_parameter< std::size_t >::type n_neighbors(n_neighborsSEXP); Rcpp::traits::input_parameter< std::size_t >::type search_k(search_kSEXP); Rcpp::traits::input_parameter< std::size_t >::type grain_size(grain_sizeSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); rcpp_result_gen = Rcpp::wrap(annoy_euclidean_nns(index_name, mat, n_neighbors, search_k, grain_size, verbose)); return rcpp_result_gen; END_RCPP } // annoy_cosine_nns Rcpp::List annoy_cosine_nns(const std::string& index_name, const Rcpp::NumericMatrix& mat, std::size_t n_neighbors, std::size_t search_k, std::size_t grain_size, bool verbose); RcppExport SEXP _uwot_annoy_cosine_nns(SEXP index_nameSEXP, SEXP matSEXP, SEXP n_neighborsSEXP, SEXP search_kSEXP, SEXP grain_sizeSEXP, SEXP 
verboseSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const std::string& >::type index_name(index_nameSEXP); Rcpp::traits::input_parameter< const Rcpp::NumericMatrix& >::type mat(matSEXP); Rcpp::traits::input_parameter< std::size_t >::type n_neighbors(n_neighborsSEXP); Rcpp::traits::input_parameter< std::size_t >::type search_k(search_kSEXP); Rcpp::traits::input_parameter< std::size_t >::type grain_size(grain_sizeSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); rcpp_result_gen = Rcpp::wrap(annoy_cosine_nns(index_name, mat, n_neighbors, search_k, grain_size, verbose)); return rcpp_result_gen; END_RCPP } // annoy_manhattan_nns Rcpp::List annoy_manhattan_nns(const std::string& index_name, const Rcpp::NumericMatrix& mat, std::size_t n_neighbors, std::size_t search_k, std::size_t grain_size, bool verbose); RcppExport SEXP _uwot_annoy_manhattan_nns(SEXP index_nameSEXP, SEXP matSEXP, SEXP n_neighborsSEXP, SEXP search_kSEXP, SEXP grain_sizeSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const std::string& >::type index_name(index_nameSEXP); Rcpp::traits::input_parameter< const Rcpp::NumericMatrix& >::type mat(matSEXP); Rcpp::traits::input_parameter< std::size_t >::type n_neighbors(n_neighborsSEXP); Rcpp::traits::input_parameter< std::size_t >::type search_k(search_kSEXP); Rcpp::traits::input_parameter< std::size_t >::type grain_size(grain_sizeSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); rcpp_result_gen = Rcpp::wrap(annoy_manhattan_nns(index_name, mat, n_neighbors, search_k, grain_size, verbose)); return rcpp_result_gen; END_RCPP } // annoy_hamming_nns Rcpp::List annoy_hamming_nns(const std::string& index_name, const Rcpp::NumericMatrix& mat, std::size_t n_neighbors, std::size_t search_k, std::size_t grain_size, bool verbose); RcppExport SEXP _uwot_annoy_hamming_nns(SEXP 
index_nameSEXP, SEXP matSEXP, SEXP n_neighborsSEXP, SEXP search_kSEXP, SEXP grain_sizeSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const std::string& >::type index_name(index_nameSEXP); Rcpp::traits::input_parameter< const Rcpp::NumericMatrix& >::type mat(matSEXP); Rcpp::traits::input_parameter< std::size_t >::type n_neighbors(n_neighborsSEXP); Rcpp::traits::input_parameter< std::size_t >::type search_k(search_kSEXP); Rcpp::traits::input_parameter< std::size_t >::type grain_size(grain_sizeSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); rcpp_result_gen = Rcpp::wrap(annoy_hamming_nns(index_name, mat, n_neighbors, search_k, grain_size, verbose)); return rcpp_result_gen; END_RCPP } // optimize_layout_umap Rcpp::NumericMatrix optimize_layout_umap(Rcpp::NumericMatrix head_embedding, Rcpp::Nullable tail_embedding, const std::vector positive_head, const std::vector positive_tail, unsigned int n_epochs, unsigned int n_vertices, const std::vector epochs_per_sample, double a, double b, double gamma, double initial_alpha, double negative_sample_rate, bool approx_pow, bool pcg_rand, bool parallelize, std::size_t grain_size, bool move_other, bool verbose); RcppExport SEXP _uwot_optimize_layout_umap(SEXP head_embeddingSEXP, SEXP tail_embeddingSEXP, SEXP positive_headSEXP, SEXP positive_tailSEXP, SEXP n_epochsSEXP, SEXP n_verticesSEXP, SEXP epochs_per_sampleSEXP, SEXP aSEXP, SEXP bSEXP, SEXP gammaSEXP, SEXP initial_alphaSEXP, SEXP negative_sample_rateSEXP, SEXP approx_powSEXP, SEXP pcg_randSEXP, SEXP parallelizeSEXP, SEXP grain_sizeSEXP, SEXP move_otherSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type head_embedding(head_embeddingSEXP); Rcpp::traits::input_parameter< Rcpp::Nullable >::type tail_embedding(tail_embeddingSEXP); Rcpp::traits::input_parameter< const 
std::vector >::type positive_head(positive_headSEXP); Rcpp::traits::input_parameter< const std::vector >::type positive_tail(positive_tailSEXP); Rcpp::traits::input_parameter< unsigned int >::type n_epochs(n_epochsSEXP); Rcpp::traits::input_parameter< unsigned int >::type n_vertices(n_verticesSEXP); Rcpp::traits::input_parameter< const std::vector >::type epochs_per_sample(epochs_per_sampleSEXP); Rcpp::traits::input_parameter< double >::type a(aSEXP); Rcpp::traits::input_parameter< double >::type b(bSEXP); Rcpp::traits::input_parameter< double >::type gamma(gammaSEXP); Rcpp::traits::input_parameter< double >::type initial_alpha(initial_alphaSEXP); Rcpp::traits::input_parameter< double >::type negative_sample_rate(negative_sample_rateSEXP); Rcpp::traits::input_parameter< bool >::type approx_pow(approx_powSEXP); Rcpp::traits::input_parameter< bool >::type pcg_rand(pcg_randSEXP); Rcpp::traits::input_parameter< bool >::type parallelize(parallelizeSEXP); Rcpp::traits::input_parameter< std::size_t >::type grain_size(grain_sizeSEXP); Rcpp::traits::input_parameter< bool >::type move_other(move_otherSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); rcpp_result_gen = Rcpp::wrap(optimize_layout_umap(head_embedding, tail_embedding, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, a, b, gamma, initial_alpha, negative_sample_rate, approx_pow, pcg_rand, parallelize, grain_size, move_other, verbose)); return rcpp_result_gen; END_RCPP } // optimize_layout_tumap Rcpp::NumericMatrix optimize_layout_tumap(Rcpp::NumericMatrix head_embedding, Rcpp::Nullable tail_embedding, const std::vector positive_head, const std::vector positive_tail, unsigned int n_epochs, unsigned int n_vertices, const std::vector epochs_per_sample, double initial_alpha, double negative_sample_rate, bool pcg_rand, bool parallelize, std::size_t grain_size, bool move_other, bool verbose); RcppExport SEXP _uwot_optimize_layout_tumap(SEXP head_embeddingSEXP, SEXP 
tail_embeddingSEXP, SEXP positive_headSEXP, SEXP positive_tailSEXP, SEXP n_epochsSEXP, SEXP n_verticesSEXP, SEXP epochs_per_sampleSEXP, SEXP initial_alphaSEXP, SEXP negative_sample_rateSEXP, SEXP pcg_randSEXP, SEXP parallelizeSEXP, SEXP grain_sizeSEXP, SEXP move_otherSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type head_embedding(head_embeddingSEXP); Rcpp::traits::input_parameter< Rcpp::Nullable >::type tail_embedding(tail_embeddingSEXP); Rcpp::traits::input_parameter< const std::vector >::type positive_head(positive_headSEXP); Rcpp::traits::input_parameter< const std::vector >::type positive_tail(positive_tailSEXP); Rcpp::traits::input_parameter< unsigned int >::type n_epochs(n_epochsSEXP); Rcpp::traits::input_parameter< unsigned int >::type n_vertices(n_verticesSEXP); Rcpp::traits::input_parameter< const std::vector >::type epochs_per_sample(epochs_per_sampleSEXP); Rcpp::traits::input_parameter< double >::type initial_alpha(initial_alphaSEXP); Rcpp::traits::input_parameter< double >::type negative_sample_rate(negative_sample_rateSEXP); Rcpp::traits::input_parameter< bool >::type pcg_rand(pcg_randSEXP); Rcpp::traits::input_parameter< bool >::type parallelize(parallelizeSEXP); Rcpp::traits::input_parameter< std::size_t >::type grain_size(grain_sizeSEXP); Rcpp::traits::input_parameter< bool >::type move_other(move_otherSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); rcpp_result_gen = Rcpp::wrap(optimize_layout_tumap(head_embedding, tail_embedding, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, initial_alpha, negative_sample_rate, pcg_rand, parallelize, grain_size, move_other, verbose)); return rcpp_result_gen; END_RCPP } // optimize_layout_largevis Rcpp::NumericMatrix optimize_layout_largevis(Rcpp::NumericMatrix head_embedding, const std::vector positive_head, const std::vector positive_tail, unsigned int 
n_epochs, unsigned int n_vertices, const std::vector epochs_per_sample, double gamma, double initial_alpha, double negative_sample_rate, bool pcg_rand, bool parallelize, std::size_t grain_size, bool verbose); RcppExport SEXP _uwot_optimize_layout_largevis(SEXP head_embeddingSEXP, SEXP positive_headSEXP, SEXP positive_tailSEXP, SEXP n_epochsSEXP, SEXP n_verticesSEXP, SEXP epochs_per_sampleSEXP, SEXP gammaSEXP, SEXP initial_alphaSEXP, SEXP negative_sample_rateSEXP, SEXP pcg_randSEXP, SEXP parallelizeSEXP, SEXP grain_sizeSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type head_embedding(head_embeddingSEXP); Rcpp::traits::input_parameter< const std::vector >::type positive_head(positive_headSEXP); Rcpp::traits::input_parameter< const std::vector >::type positive_tail(positive_tailSEXP); Rcpp::traits::input_parameter< unsigned int >::type n_epochs(n_epochsSEXP); Rcpp::traits::input_parameter< unsigned int >::type n_vertices(n_verticesSEXP); Rcpp::traits::input_parameter< const std::vector >::type epochs_per_sample(epochs_per_sampleSEXP); Rcpp::traits::input_parameter< double >::type gamma(gammaSEXP); Rcpp::traits::input_parameter< double >::type initial_alpha(initial_alphaSEXP); Rcpp::traits::input_parameter< double >::type negative_sample_rate(negative_sample_rateSEXP); Rcpp::traits::input_parameter< bool >::type pcg_rand(pcg_randSEXP); Rcpp::traits::input_parameter< bool >::type parallelize(parallelizeSEXP); Rcpp::traits::input_parameter< std::size_t >::type grain_size(grain_sizeSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); rcpp_result_gen = Rcpp::wrap(optimize_layout_largevis(head_embedding, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, gamma, initial_alpha, negative_sample_rate, pcg_rand, parallelize, grain_size, verbose)); return rcpp_result_gen; END_RCPP } // calc_row_probabilities_parallel Rcpp::List 
calc_row_probabilities_parallel(const Rcpp::NumericMatrix nn_dist, const Rcpp::IntegerMatrix nn_idx, const double perplexity, const unsigned int n_iter, const double tol, const bool parallelize, const std::size_t grain_size, const bool verbose); RcppExport SEXP _uwot_calc_row_probabilities_parallel(SEXP nn_distSEXP, SEXP nn_idxSEXP, SEXP perplexitySEXP, SEXP n_iterSEXP, SEXP tolSEXP, SEXP parallelizeSEXP, SEXP grain_sizeSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Rcpp::NumericMatrix >::type nn_dist(nn_distSEXP); Rcpp::traits::input_parameter< const Rcpp::IntegerMatrix >::type nn_idx(nn_idxSEXP); Rcpp::traits::input_parameter< const double >::type perplexity(perplexitySEXP); Rcpp::traits::input_parameter< const unsigned int >::type n_iter(n_iterSEXP); Rcpp::traits::input_parameter< const double >::type tol(tolSEXP); Rcpp::traits::input_parameter< const bool >::type parallelize(parallelizeSEXP); Rcpp::traits::input_parameter< const std::size_t >::type grain_size(grain_sizeSEXP); Rcpp::traits::input_parameter< const bool >::type verbose(verboseSEXP); rcpp_result_gen = Rcpp::wrap(calc_row_probabilities_parallel(nn_dist, nn_idx, perplexity, n_iter, tol, parallelize, grain_size, verbose)); return rcpp_result_gen; END_RCPP } // smooth_knn_distances_parallel Rcpp::List smooth_knn_distances_parallel(const Rcpp::NumericMatrix& nn_dist, const Rcpp::IntegerMatrix& nn_idx, const unsigned int n_iter, const double local_connectivity, const double bandwidth, const double tol, const double min_k_dist_scale, const bool parallelize, const std::size_t grain_size, const bool verbose); RcppExport SEXP _uwot_smooth_knn_distances_parallel(SEXP nn_distSEXP, SEXP nn_idxSEXP, SEXP n_iterSEXP, SEXP local_connectivitySEXP, SEXP bandwidthSEXP, SEXP tolSEXP, SEXP min_k_dist_scaleSEXP, SEXP parallelizeSEXP, SEXP grain_sizeSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope 
rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Rcpp::NumericMatrix& >::type nn_dist(nn_distSEXP); Rcpp::traits::input_parameter< const Rcpp::IntegerMatrix& >::type nn_idx(nn_idxSEXP); Rcpp::traits::input_parameter< const unsigned int >::type n_iter(n_iterSEXP); Rcpp::traits::input_parameter< const double >::type local_connectivity(local_connectivitySEXP); Rcpp::traits::input_parameter< const double >::type bandwidth(bandwidthSEXP); Rcpp::traits::input_parameter< const double >::type tol(tolSEXP); Rcpp::traits::input_parameter< const double >::type min_k_dist_scale(min_k_dist_scaleSEXP); Rcpp::traits::input_parameter< const bool >::type parallelize(parallelizeSEXP); Rcpp::traits::input_parameter< const std::size_t >::type grain_size(grain_sizeSEXP); Rcpp::traits::input_parameter< const bool >::type verbose(verboseSEXP); rcpp_result_gen = Rcpp::wrap(smooth_knn_distances_parallel(nn_dist, nn_idx, n_iter, local_connectivity, bandwidth, tol, min_k_dist_scale, parallelize, grain_size, verbose)); return rcpp_result_gen; END_RCPP } // fast_intersection_cpp NumericVector fast_intersection_cpp(const IntegerVector rows, const IntegerVector cols, NumericVector values, const IntegerVector target, double unknown_dist, double far_dist); RcppExport SEXP _uwot_fast_intersection_cpp(SEXP rowsSEXP, SEXP colsSEXP, SEXP valuesSEXP, SEXP targetSEXP, SEXP unknown_distSEXP, SEXP far_distSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const IntegerVector >::type rows(rowsSEXP); Rcpp::traits::input_parameter< const IntegerVector >::type cols(colsSEXP); Rcpp::traits::input_parameter< NumericVector >::type values(valuesSEXP); Rcpp::traits::input_parameter< const IntegerVector >::type target(targetSEXP); Rcpp::traits::input_parameter< double >::type unknown_dist(unknown_distSEXP); Rcpp::traits::input_parameter< double >::type far_dist(far_distSEXP); rcpp_result_gen = Rcpp::wrap(fast_intersection_cpp(rows, cols, values, 
target, unknown_dist, far_dist)); return rcpp_result_gen; END_RCPP } // general_sset_intersection_cpp NumericVector general_sset_intersection_cpp(const IntegerVector indptr1, const IntegerVector indices1, NumericVector data1, const IntegerVector indptr2, const IntegerVector indices2, NumericVector data2, const IntegerVector result_row, const IntegerVector result_col, NumericVector result_val, double mix_weight); RcppExport SEXP _uwot_general_sset_intersection_cpp(SEXP indptr1SEXP, SEXP indices1SEXP, SEXP data1SEXP, SEXP indptr2SEXP, SEXP indices2SEXP, SEXP data2SEXP, SEXP result_rowSEXP, SEXP result_colSEXP, SEXP result_valSEXP, SEXP mix_weightSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const IntegerVector >::type indptr1(indptr1SEXP); Rcpp::traits::input_parameter< const IntegerVector >::type indices1(indices1SEXP); Rcpp::traits::input_parameter< NumericVector >::type data1(data1SEXP); Rcpp::traits::input_parameter< const IntegerVector >::type indptr2(indptr2SEXP); Rcpp::traits::input_parameter< const IntegerVector >::type indices2(indices2SEXP); Rcpp::traits::input_parameter< NumericVector >::type data2(data2SEXP); Rcpp::traits::input_parameter< const IntegerVector >::type result_row(result_rowSEXP); Rcpp::traits::input_parameter< const IntegerVector >::type result_col(result_colSEXP); Rcpp::traits::input_parameter< NumericVector >::type result_val(result_valSEXP); Rcpp::traits::input_parameter< double >::type mix_weight(mix_weightSEXP); rcpp_result_gen = Rcpp::wrap(general_sset_intersection_cpp(indptr1, indices1, data1, indptr2, indices2, data2, result_row, result_col, result_val, mix_weight)); return rcpp_result_gen; END_RCPP } // init_transform_av_parallel Rcpp::NumericMatrix init_transform_av_parallel(Rcpp::NumericMatrix train_embedding, Rcpp::IntegerMatrix nn_index, bool parallelize, const std::size_t grain_size); RcppExport SEXP _uwot_init_transform_av_parallel(SEXP train_embeddingSEXP, 
SEXP nn_indexSEXP, SEXP parallelizeSEXP, SEXP grain_sizeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type train_embedding(train_embeddingSEXP); Rcpp::traits::input_parameter< Rcpp::IntegerMatrix >::type nn_index(nn_indexSEXP); Rcpp::traits::input_parameter< bool >::type parallelize(parallelizeSEXP); Rcpp::traits::input_parameter< const std::size_t >::type grain_size(grain_sizeSEXP); rcpp_result_gen = Rcpp::wrap(init_transform_av_parallel(train_embedding, nn_index, parallelize, grain_size)); return rcpp_result_gen; END_RCPP } // init_transform_parallel Rcpp::NumericMatrix init_transform_parallel(Rcpp::NumericMatrix train_embedding, Rcpp::IntegerMatrix nn_index, Rcpp::NumericMatrix nn_weights, const std::size_t grain_size, bool parallelize); RcppExport SEXP _uwot_init_transform_parallel(SEXP train_embeddingSEXP, SEXP nn_indexSEXP, SEXP nn_weightsSEXP, SEXP grain_sizeSEXP, SEXP parallelizeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type train_embedding(train_embeddingSEXP); Rcpp::traits::input_parameter< Rcpp::IntegerMatrix >::type nn_index(nn_indexSEXP); Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type nn_weights(nn_weightsSEXP); Rcpp::traits::input_parameter< const std::size_t >::type grain_size(grain_sizeSEXP); Rcpp::traits::input_parameter< bool >::type parallelize(parallelizeSEXP); rcpp_result_gen = Rcpp::wrap(init_transform_parallel(train_embedding, nn_index, nn_weights, grain_size, parallelize)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { {"_uwot_connected_components_undirected", (DL_FUNC) &_uwot_connected_components_undirected, 5}, {"_uwot_annoy_euclidean_nns", (DL_FUNC) &_uwot_annoy_euclidean_nns, 6}, {"_uwot_annoy_cosine_nns", (DL_FUNC) &_uwot_annoy_cosine_nns, 6}, {"_uwot_annoy_manhattan_nns", (DL_FUNC) 
&_uwot_annoy_manhattan_nns, 6}, {"_uwot_annoy_hamming_nns", (DL_FUNC) &_uwot_annoy_hamming_nns, 6}, {"_uwot_optimize_layout_umap", (DL_FUNC) &_uwot_optimize_layout_umap, 18}, {"_uwot_optimize_layout_tumap", (DL_FUNC) &_uwot_optimize_layout_tumap, 14}, {"_uwot_optimize_layout_largevis", (DL_FUNC) &_uwot_optimize_layout_largevis, 13}, {"_uwot_calc_row_probabilities_parallel", (DL_FUNC) &_uwot_calc_row_probabilities_parallel, 8}, {"_uwot_smooth_knn_distances_parallel", (DL_FUNC) &_uwot_smooth_knn_distances_parallel, 10}, {"_uwot_fast_intersection_cpp", (DL_FUNC) &_uwot_fast_intersection_cpp, 6}, {"_uwot_general_sset_intersection_cpp", (DL_FUNC) &_uwot_general_sset_intersection_cpp, 10}, {"_uwot_init_transform_av_parallel", (DL_FUNC) &_uwot_init_transform_av_parallel, 4}, {"_uwot_init_transform_parallel", (DL_FUNC) &_uwot_init_transform_parallel, 5}, {NULL, NULL, 0} }; RcppExport void R_init_uwot(DllInfo *dll) { R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); } uwot/src/nn_parallel.cpp0000644000176200001440000001136413571657123015051 0ustar liggesusers#include // [[Rcpp::depends(RcppParallel)]] #include #if defined(__MINGW32__) #undef Realloc #undef Free #endif #define __ERROR_PRINTER_OVERRIDE__ REprintf #include #include template struct NNWorker : public RcppParallel::Worker { std::string index_name; RcppParallel::RMatrix mat; RcppParallel::RMatrix dists; RcppParallel::RMatrix idx; std::size_t ncol; std::size_t n_neighbors; std::size_t search_k; NNWorker(const std::string &index_name, const Rcpp::NumericMatrix &mat, Rcpp::NumericMatrix &dists, Rcpp::IntegerMatrix &idx, std::size_t ncol, std::size_t n_neighbors, std::size_t search_k) : index_name(index_name), mat(mat), dists(dists), idx(idx), ncol(ncol), n_neighbors(n_neighbors), search_k(search_k) {} void operator()(std::size_t begin, std::size_t end) { AnnoyIndex index(ncol); index.load(index_name.c_str()); for (std::size_t i = begin; i < end; i++) { RcppParallel::RMatrix::Row 
row = mat.row(i); std::vector fv(row.length()); std::copy(row.begin(), row.end(), fv.begin()); std::vector result; std::vector distances; index.get_nns_by_vector(fv.data(), n_neighbors, search_k, &result, &distances); if (result.size() != n_neighbors || distances.size() != n_neighbors) { break; } for (std::size_t j = 0; j < n_neighbors; j++) { dists(i, j) = distances[j]; idx(i, j) = result[j]; } } } }; // [[Rcpp::export]] Rcpp::List annoy_euclidean_nns(const std::string &index_name, const Rcpp::NumericMatrix &mat, std::size_t n_neighbors, std::size_t search_k, std::size_t grain_size = 1, bool verbose = false) { std::size_t nrow = mat.rows(); std::size_t ncol = mat.cols(); Rcpp::NumericMatrix dist(nrow, n_neighbors); Rcpp::IntegerMatrix idx(nrow, n_neighbors); idx.fill(-1); NNWorker worker( index_name, mat, dist, idx, ncol, n_neighbors, search_k); RcppParallel::parallelFor(0, nrow, worker, grain_size); return Rcpp::List::create(Rcpp::Named("item") = idx, Rcpp::Named("distance") = dist); } // [[Rcpp::export]] Rcpp::List annoy_cosine_nns(const std::string &index_name, const Rcpp::NumericMatrix &mat, std::size_t n_neighbors, std::size_t search_k, std::size_t grain_size = 1, bool verbose = false) { std::size_t nrow = mat.rows(); std::size_t ncol = mat.cols(); Rcpp::NumericMatrix dist(nrow, n_neighbors); Rcpp::IntegerMatrix idx(nrow, n_neighbors); idx.fill(-1); NNWorker worker( index_name, mat, dist, idx, ncol, n_neighbors, search_k); RcppParallel::parallelFor(0, nrow, worker, grain_size); return Rcpp::List::create(Rcpp::Named("item") = idx, Rcpp::Named("distance") = dist); } // [[Rcpp::export]] Rcpp::List annoy_manhattan_nns(const std::string &index_name, const Rcpp::NumericMatrix &mat, std::size_t n_neighbors, std::size_t search_k, std::size_t grain_size = 1, bool verbose = false) { std::size_t nrow = mat.rows(); std::size_t ncol = mat.cols(); Rcpp::NumericMatrix dist(nrow, n_neighbors); Rcpp::IntegerMatrix idx(nrow, n_neighbors); idx.fill(-1); NNWorker worker( 
index_name, mat, dist, idx, ncol, n_neighbors, search_k); RcppParallel::parallelFor(0, nrow, worker, grain_size); return Rcpp::List::create(Rcpp::Named("item") = idx, Rcpp::Named("distance") = dist); } // [[Rcpp::export]] Rcpp::List annoy_hamming_nns(const std::string &index_name, const Rcpp::NumericMatrix &mat, std::size_t n_neighbors, std::size_t search_k, std::size_t grain_size = 1, bool verbose = false) { std::size_t nrow = mat.rows(); std::size_t ncol = mat.cols(); Rcpp::NumericMatrix dist(nrow, n_neighbors); Rcpp::IntegerMatrix idx(nrow, n_neighbors); idx.fill(-1); NNWorker worker( index_name, mat, dist, idx, ncol, n_neighbors, search_k); RcppParallel::parallelFor(0, nrow, worker, grain_size); return Rcpp::List::create(Rcpp::Named("item") = idx, Rcpp::Named("distance") = dist); } uwot/R/0000755000176200001440000000000013571664300011456 5ustar liggesusersuwot/R/uwot.R0000644000176200001440000030455113571660314012610 0ustar liggesusers#' Dimensionality Reduction with UMAP #' #' Carry out dimensionality reduction of a dataset using the Uniform Manifold #' Approximation and Projection (UMAP) method (McInnes & Healy, 2018). Some of #' the following help text is lifted verbatim from the Python reference #' implementation at \url{https://github.com/lmcinnes/umap}. #' #' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}}, #' \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}. A #' sparse matrix is interpreted as a distance matrix and both implicit and #' explicit zero entries are ignored. Set zero distances you want to keep to #' an arbitrarily small non-zero value (e.g. \code{1e-10}). Matrix and data #' frames should contain one observation per row. Data frames will have any #' non-numeric columns removed, although factor columns will be used if #' explicitly included via \code{metric} (see the help for \code{metric} for #' details). 
Can be \code{NULL} if precomputed nearest neighbor data is passed #' to \code{nn_method}, and \code{init} is not \code{"spca"} or \code{"pca"}. #' @param n_neighbors The size of local neighborhood (in terms of number of #' neighboring sample points) used for manifold approximation. Larger values #' result in more global views of the manifold, while smaller values result in #' more local data being preserved. In general values should be in the range #' \code{2} to \code{100}. #' @param n_components The dimension of the space to embed into. This defaults #' to \code{2} to provide easy visualization, but can reasonably be set to any #' integer value in the range \code{2} to \code{100}. #' @param metric Type of distance metric to use to find nearest neighbors. One #' of: #' \itemize{ #' \item \code{"euclidean"} (the default) #' \item \code{"cosine"} #' \item \code{"manhattan"} #' \item \code{"hamming"} #' \item \code{"categorical"} (see below) #' } #' Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the #' distance metric is always "euclidean"). #' #' If \code{X} is a data frame or matrix, then multiple metrics can be #' specified, by passing a list to this argument, where the name of each item in #' the list is one of the metric names above. The value of each list item should #' be a vector giving the names or integer ids of the columns to be included in #' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}. #' #' Each metric calculation results in a separate fuzzy simplicial set, which are #' intersected together to produce the final set. Metric names can be repeated. #' Because non-numeric columns are removed from the data frame, it is safer to #' use column names than integer ids. #' #' Factor columns can also be used by specifying the metric name #' \code{"categorical"}. 
Factor columns are treated different from numeric #' columns and although multiple factor columns can be specified in a vector, #' each factor column specified is processed individually. If you specify #' a non-factor column, it will be coerced to a factor. #' #' For a given data block, you may override the \code{pca} and \code{pca_center} #' arguments for that block, by providing a list with one unnamed item #' containing the column names or ids, and then any of the \code{pca} or #' \code{pca_center} overrides as named items, e.g. \code{metric = #' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This #' exists to allow mixed binary and real-valued data to be included and to have #' PCA applied to both, but with centering applied only to the real-valued data #' (it is typical not to apply centering to binary data before PCA is applied). #' @param n_epochs Number of epochs to use during the optimization of the #' embedded coordinates. By default, this value is set to \code{500} for datasets #' containing 10,000 vertices or less, and \code{200} otherwise. #' @param scale Scaling to apply to \code{X} if it is a data frame or matrix: #' \itemize{ #' \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling. #' \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to #' zero mean and variance 1. #' \item{\code{"maxabs"}} Center each column to mean 0, then divide each #' element by the maximum absolute value over the entire matrix. #' \item{\code{"range"}} Range scale the entire matrix, so the smallest #' element is 0 and the largest is 1. #' \item{\code{"colrange"}} Scale each column in the range (0,1). #' } #' For UMAP, the default is \code{"none"}. #' @param learning_rate Initial learning rate used in optimization of the #' coordinates. #' @param init Type of initialization for the coordinates. 
Options are: #' \itemize{ #' \item \code{"spectral"} Spectral embedding using the normalized Laplacian #' of the fuzzy 1-skeleton, with Gaussian noise added. #' \item \code{"normlaplacian"}. Spectral embedding using the normalized #' Laplacian of the fuzzy 1-skeleton, without noise. #' \item \code{"random"}. Coordinates assigned using a uniform random #' distribution between -10 and 10. #' \item \code{"lvrandom"}. Coordinates assigned using a Gaussian #' distribution with standard deviation 1e-4, as used in LargeVis #' (Tang et al., 2016) and t-SNE. #' \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap #' (Belkin and Niyogi, 2002). #' \item \code{"pca"}. The first two principal components from PCA of #' \code{X} if \code{X} is a data frame, and from a 2-dimensional classical #' MDS if \code{X} is of class \code{"dist"}. #' \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled #' so the standard deviation is 1e-4, to give a distribution similar to that #' used in t-SNE. This is an alias for \code{init = "pca", init_sdev = #' 1e-4}. #' \item \code{"agspectral"} An "approximate global" modification of #' \code{"spectral"} which all edges in the graph to a value of 1, and then #' sets a random number of edges (\code{negative_sample_rate} edges per #' vertex) to 0.1, to approximate the effect of non-local affinities. #' \item A matrix of initial coordinates. #' } #' For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"}, #' \code{"laplacian"}), if more than one connected component is identified, #' each connected component is initialized separately and the results are #' merged. If \code{verbose = TRUE} the number of connected components are #' logged to the console. The existence of multiple connected components #' implies that a global view of the data cannot be attained with this #' initialization. Either a PCA-based initialization or increasing the value of #' \code{n_neighbors} may be more appropriate. 
#' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized #' coordinates (including any user-supplied matrix) to this standard #' deviation. By default no scaling is carried out, except when \code{init = #' "spca"}, in which case the value is \code{0.0001}. Scaling the input may #' help if the unscaled versions result in initial coordinates with large #' inter-point distances or outliers. This usually results in small gradients #' during optimization and very little progress being made to the layout. #' Shrinking the initial embedding by rescaling can help under these #' circumstances. Scaling the result of \code{init = "pca"} is usually #' recommended and \code{init = "spca"} as an alias for \code{init = "pca", #' init_sdev = 1e-4} but for the spectral initializations the scaled versions #' usually aren't necessary unless you are using a large value of #' \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). #' @param spread The effective scale of embedded points. In combination with #' \code{min_dist}, this determines how clustered/clumped the embedded points #' are. #' @param min_dist The effective minimum distance between embedded points. #' Smaller values will result in a more clustered/clumped embedding where #' nearby points on the manifold are drawn closer together, while larger #' values will result on a more even dispersal of points. The value should be #' set relative to the \code{spread} value, which determines the scale at #' which embedded points will be spread out. #' @param set_op_mix_ratio Interpolate between (fuzzy) union and intersection as #' the set operation used to combine local fuzzy simplicial sets to obtain a #' global fuzzy simplicial sets. Both fuzzy set operations use the product #' t-norm. The value of this parameter should be between \code{0.0} and #' \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while #' \code{0.0} will use a pure fuzzy intersection. 
#' @param local_connectivity The local connectivity required -- i.e. the number #' of nearest neighbors that should be assumed to be connected at a local #' level. The higher this value the more connected the manifold becomes #' locally. In practice this should be not more than the local intrinsic #' dimension of the manifold. #' @param bandwidth The effective bandwidth of the kernel if we view the #' algorithm as similar to Laplacian Eigenmaps. Larger values induce more #' connectivity and a more global view of the data, smaller values concentrate #' more locally. #' @param repulsion_strength Weighting applied to negative samples in low #' dimensional embedding optimization. Values higher than one will result in #' greater weight being given to negative samples. #' @param negative_sample_rate The number of negative edge/1-simplex samples to #' use per positive edge/1-simplex sample in optimizing the low dimensional #' embedding. #' @param a More specific parameters controlling the embedding. If \code{NULL} #' these values are set automatically as determined by \code{min_dist} and #' \code{spread}. #' @param b More specific parameters controlling the embedding. If \code{NULL} #' these values are set automatically as determined by \code{min_dist} and #' \code{spread}. #' @param nn_method Method for finding nearest neighbors. Options are: #' \itemize{ #' \item \code{"fnn"}. Use exact nearest neighbors via the #' \href{https://cran.r-project.org/package=FNN}{FNN} package. #' \item \code{"annoy"} Use approximate nearest neighbors via the #' \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package. #' } #' By default, if \code{X} has less than 4,096 vertices, the exact nearest #' neighbors are found. Otherwise, approximate nearest neighbors are used. #' You may also pass precalculated nearest neighbor data to this argument. It #' must be a list consisting of two elements: #' \itemize{ #' \item \code{"idx"}. 
A \code{n_vertices x n_neighbors} matrix #' containing the integer indexes of the nearest neighbors in \code{X}. Each #' vertex is considered to be its own nearest neighbor, i.e. #' \code{idx[, 1] == 1:n_vertices}. #' \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix #' containing the distances of the nearest neighbors. #' } #' Multiple nearest neighbor data (e.g. from two different precomputed #' metrics) can be passed by passing a list containing the nearest neighbor #' data lists as items. #' The \code{n_neighbors} parameter is ignored when using precomputed #' nearest neighbor data. #' @param n_trees Number of trees to build when constructing the nearest #' neighbor index. The more trees specified, the larger the index, but the #' better the results. With \code{search_k}, determines the accuracy of the #' Annoy nearest neighbor search. Only used if the \code{nn_method} is #' \code{"annoy"}. Sensible values are between \code{10} to \code{100}. #' @param search_k Number of nodes to search during the neighbor retrieval. The #' larger k, the more the accurate results, but the longer the search takes. #' With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor #' search. Only used if the \code{nn_method} is \code{"annoy"}. #' @param approx_pow If \code{TRUE}, use an approximation to the power function #' in the UMAP gradient, from #' \url{https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/}. #' @param y Optional target data for supervised dimension reduction. Can be a #' vector, matrix or data frame. Use the \code{target_metric} parameter to #' specify the metrics to use, using the same syntax as \code{metric}. Usually #' either a single numeric or factor column is used, but more complex formats #' are possible. The following types are allowed: #' \itemize{ #' \item Factor columns with the same length as \code{X}. 
\code{NA} is #' allowed for any observation with an unknown level, in which case #' UMAP operates as a form of semi-supervised learning. Each column is #' treated separately. #' \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the #' parameter \code{target_n_neighbors} to set the number of neighbors used #' with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors, #' numeric columns are grouped into one block unless \code{target_metric} #' specifies otherwise. For example, if you wish columns \code{a} and #' \code{b} to be treated separately, specify #' \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise, #' the data will be effectively treated as a matrix with two columns. #' \item Nearest neighbor data, consisting of a list of two matrices, #' \code{idx} and \code{dist}. These represent the precalculated nearest #' neighbor indices and distances, respectively. This #' is the same format as that expected for precalculated data in #' \code{nn_method}. This format assumes that the underlying data was a #' numeric vector. Any user-supplied value of the \code{target_n_neighbors} #' parameter is ignored in this case, because the number of columns in #' the matrices is used for the value. Multiple nearest neighbor data using #' different metrics can be supplied by passing a list of these lists. #' } #' Unlike \code{X}, all factor columns included in \code{y} are automatically #' used. #' @param target_n_neighbors Number of nearest neighbors to use to construct the #' target simplicial set. Default value is \code{n_neighbors}. Applies only if #' \code{y} is non-\code{NULL} and \code{numeric}. #' @param target_metric The metric used to measure distance for \code{y} if #' using supervised dimension reduction. Used only if \code{y} is numeric. #' @param target_weight Weighting factor between data topology and target #' topology. A value of 0.0 weights entirely on data, a value of 1.0 weights #' entirely on target.
The default of 0.5 balances the weighting equally #' between data and target. Only applies if \code{y} is non-\code{NULL}. #' @param pca If set to a positive integer value, reduce data to this number of #' columns using PCA. Doesn't apply if the distance \code{metric} is #' \code{"hamming"}, or the dimensions of the data are not larger than the #' number specified (i.e. number of rows and columns must be larger than the #' value of this parameter). If you have > 100 columns in a data frame or #' matrix, reducing the number of columns in this way may substantially #' increase the performance of the nearest neighbor search at the cost of a #' potential decrease in accuracy. In many t-SNE applications, a value of 50 #' is recommended, although there's no guarantee that this is appropriate for #' all settings. #' @param pca_center If \code{TRUE}, center the columns of \code{X} before #' carrying out PCA. For binary data, it's recommended to set this to #' \code{FALSE}. #' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill, #' 2014) during optimization. Otherwise, use the faster (but probably less #' statistically good) Tausworthe "taus88" generator. The default is #' \code{TRUE}. #' @param fast_sgd If \code{TRUE}, then the following combination of parameters #' is set: \code{pcg_rand = TRUE}, \code{n_sgd_threads = "auto"} and #' \code{approx_pow = TRUE}. The default is \code{FALSE}. Setting this to #' \code{TRUE} will speed up the stochastic optimization phase, but give a #' potentially less accurate embedding, and which will not be exactly #' reproducible even with a fixed seed. For visualization, \code{fast_sgd = #' TRUE} will give perfectly good results. For more generic dimensionality #' reduction, it's safer to leave \code{fast_sgd = FALSE}. If \code{fast_sgd = #' TRUE}, then user-supplied values of \code{pcg_rand}, \code{n_sgd_threads}, #' and \code{approx_pow} are ignored.
#' @param ret_model If \code{TRUE}, then return extra data that can be used to #' add new data to an existing embedding via \code{\link{umap_transform}}. The #' embedded coordinates are returned as the list item \code{embedding}. If #' \code{FALSE}, just return the coordinates. This parameter can be used in #' conjunction with \code{ret_nn}. Note that some settings are incompatible #' with the production of a UMAP model: external neighbor data (passed via a #' list to \code{nn_method}), and factor columns that were included #' via the \code{metric} parameter. In the latter case, the model produced is #' based only on the numeric data. A transformation using new data is #' possible, but the factor columns in the new data are ignored. #' @param ret_nn If \code{TRUE}, then in addition to the embedding, also return #' nearest neighbor data that can be used as input to \code{nn_method} to #' avoid the overhead of repeatedly calculating the nearest neighbors when #' manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs}, #' \code{init}). See the "Value" section for the names of the list items. If #' \code{FALSE}, just return the coordinates. Note that the nearest neighbors #' could be sensitive to data scaling, so be wary of reusing nearest neighbor #' data if modifying the \code{scale} parameter. This parameter can be used in #' conjunction with \code{ret_model}. #' @param n_threads Number of threads to use (except during stochastic gradient #' descent). Default is half that recommended by RcppParallel. For #' nearest neighbor search, only applies if \code{nn_method = "annoy"}. If #' \code{n_threads > 1}, then the Annoy index will be temporarily written to #' disk in the location determined by \code{\link[base]{tempfile}}. #' @param n_sgd_threads Number of threads to use during stochastic gradient #' descent. If set to > 1, then results will not be reproducible, even if #' `set.seed` is called with a fixed seed before running. 
Set to #' \code{"auto"} to use the same value as \code{n_threads}. #' @param grain_size Minimum batch size for multithreading. If the number of #' items to process in a thread falls below this number, then no threads will #' be used. Used in conjunction with \code{n_threads} and #' \code{n_sgd_threads}. #' @param tmpdir Temporary directory to store nearest neighbor indexes during #' nearest neighbor search. Default is \code{\link{tempdir}}. The index is #' only written to disk if \code{n_threads > 1} and #' \code{nn_method = "annoy"}; otherwise, this parameter is ignored. #' @param verbose If \code{TRUE}, log details to the console. #' @return A matrix of optimized coordinates, or: #' \itemize{ #' \item if \code{ret_model = TRUE}, returns a #' list containing extra information that can be used to add new data to an #' existing embedding via \code{\link{umap_transform}}. In this case, the #' coordinates are available in the list item \code{embedding}. #' \item if \code{ret_nn = TRUE}, returns the nearest neighbor data as a #' list called \code{nn}. This contains one list for each \code{metric} #' calculated, itself containing a matrix \code{idx} with the integer ids of #' the neighbors; and a matrix \code{dist} with the distances. The \code{nn} #' list (or a sub-list) can be used as input to the \code{nn_method} #' parameter. #' } #' Both \code{ret_model} and \code{ret_nn} can be \code{TRUE}, in which case #' the returned list contains the combined data. #' @examples #' # Non-numeric columns are automatically removed so you can pass data frames #' # directly in a lot of cases without pre-processing #' iris_umap <- umap(iris, #' n_neighbors = 50, learning_rate = 0.5, #' init = "random" #' ) #' #' # Although not an issue for the iris dataset, for high dimensional data #' # (> 100 columns), using PCA to reduce dimensionality is highly #' # recommended to avoid nearest neighbor searches taking a long time #' # 50 dimensions is a good value to start with.
If there are fewer columns #' # in the input than the requested number of components, the parameter is #' # ignored. #' iris_umap <- umap(iris, pca = 50) #' #' # Faster approximation to the gradient #' iris_umap <- umap(iris, n_neighbors = 15, approx_pow = TRUE) #' #' # Can specify min_dist and spread parameters to control separation and size #' # of clusters #' iris_umap <- umap(iris, n_neighbors = 15, min_dist = 1, spread = 5) #' #' # Supervised dimension reduction using the 'Species' factor column #' iris_sumap <- umap(iris, #' n_neighbors = 15, min_dist = 0.001, #' y = iris$Species, target_weight = 0.5 #' ) #' \donttest{ #' # Calculate Petal and Sepal neighbors separately (uses intersection of the resulting sets): #' iris_umap <- umap(iris, metric = list( #' "euclidean" = c("Sepal.Length", "Sepal.Width"), #' "euclidean" = c("Petal.Length", "Petal.Width") #' )) #' #' # Can also use individual factor columns #' iris_umap <- umap(iris, metric = list( #' "euclidean" = c("Sepal.Length", "Sepal.Width"), #' "euclidean" = c("Petal.Length", "Petal.Width"), #' "categorical" = "Species" #' )) #' # Return NN info #' iris_umap <- umap(iris, ret_nn = TRUE) #' #' # Re-use NN info for greater efficiency #' # Here we use random initialization #' iris_umap_spca <- umap(iris, init = "rand", nn_method = iris_umap$nn) #' } #' #' @references #' Belkin, M., & Niyogi, P. (2002). #' Laplacian eigenmaps and spectral techniques for embedding and clustering. #' In \emph{Advances in neural information processing systems} #' (pp. 585-591). #' \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf} #' #' McInnes, L., & Healy, J. (2018). #' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction #' \emph{arXiv preprint} \emph{arXiv}:1802.03426. #' \url{https://arxiv.org/abs/1802.03426} #' #' O’Neill, M. E. (2014). 
#' \emph{PCG: A family of simple fast space-efficient statistically good
#' algorithms for random number generation}
#' (Report No. HMC-CS-2014-0905). Harvey Mudd College.
#'
#' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April).
#' Visualizing large-scale and high-dimensional data.
#' In \emph{Proceedings of the 25th International Conference on World Wide Web}
#' (pp. 287-297).
#' International World Wide Web Conferences Steering Committee.
#' \url{https://arxiv.org/abs/1602.00370}
#'
#' Van der Maaten, L., & Hinton, G. (2008).
#' Visualizing data using t-SNE.
#' \emph{Journal of Machine Learning Research}, \emph{9} (2579-2605).
#' \url{http://www.jmlr.org/papers/v9/vandermaaten08a.html}
#' @export
umap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
                 n_epochs = NULL, learning_rate = 1, scale = FALSE,
                 init = "spectral", init_sdev = NULL,
                 spread = 1, min_dist = 0.01, set_op_mix_ratio = 1.0,
                 local_connectivity = 1.0, bandwidth = 1.0,
                 repulsion_strength = 1.0, negative_sample_rate = 5.0,
                 a = NULL, b = NULL, nn_method = NULL, n_trees = 50,
                 search_k = 2 * n_neighbors * n_trees, approx_pow = FALSE,
                 y = NULL, target_n_neighbors = n_neighbors,
                 target_metric = "euclidean", target_weight = 0.5,
                 pca = NULL, pca_center = TRUE, pcg_rand = TRUE,
                 fast_sgd = FALSE, ret_model = FALSE, ret_nn = FALSE,
                 n_threads = max(1, RcppParallel::defaultNumThreads() / 2),
                 n_sgd_threads = 0, grain_size = 1, tmpdir = tempdir(),
                 verbose = getOption("verbose", TRUE)) {
  # Thin wrapper around the internal uwot() driver: translates the exported
  # argument names (learning_rate, repulsion_strength) to the internal ones
  # (alpha, gamma) and selects the UMAP gradient via method = "umap".
  uwot(
    X = X, n_neighbors = n_neighbors, n_components = n_components,
    metric = metric, n_epochs = n_epochs, alpha = learning_rate, scale = scale,
    init = init, init_sdev = init_sdev, spread = spread, min_dist = min_dist,
    set_op_mix_ratio = set_op_mix_ratio,
    local_connectivity = local_connectivity, bandwidth = bandwidth,
    gamma = repulsion_strength, negative_sample_rate = negative_sample_rate,
    a = a, b = b, nn_method = nn_method, n_trees = n_trees,
    search_k = search_k, method = "umap", approx_pow = approx_pow,
    n_threads = n_threads, n_sgd_threads = n_sgd_threads,
    grain_size = grain_size, y = y,
    target_n_neighbors = target_n_neighbors, target_weight = target_weight,
    target_metric = target_metric,
    pca = pca, pca_center = pca_center, pcg_rand = pcg_rand,
    fast_sgd = fast_sgd, ret_model = ret_model, ret_nn = ret_nn,
    # Bug fix: forward the user-supplied tmpdir. Previously this was
    # hard-coded to tempdir(), so the exported tmpdir argument was
    # silently ignored.
    tmpdir = tmpdir,
    verbose = verbose
  )
}

#' Dimensionality Reduction Using t-Distributed UMAP (t-UMAP)
#'
#' A faster (but less flexible) version of the UMAP gradient. For more detail on
#' UMAP, see the \code{\link{umap}} function.
#'
#' By setting the UMAP curve parameters \code{a} and \code{b} to \code{1}, you
#' get back the Cauchy distribution as used in t-SNE and LargeVis. It also
#' results in a substantially simplified gradient expression. This can give
#' a speed improvement of around 50\%.
#'
#' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}},
#' \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}. A
#' sparse matrix is interpreted as a distance matrix and both implicit and
#' explicit zero entries are ignored. Set zero distances you want to keep to
#' an arbitrarily small non-zero value (e.g. \code{1e-10}). Matrix and data
#' frames should contain one observation per row. Data frames will have any
#' non-numeric columns removed, although factor columns will be used if
#' explicitly included via \code{metric} (see the help for \code{metric} for
#' details). Can be \code{NULL} if precomputed nearest neighbor data is passed
#' to \code{nn_method}, and \code{init} is not \code{"spca"} or \code{"pca"}.
#' @param n_neighbors The size of local neighborhood (in terms of number of
#' neighboring sample points) used for manifold approximation. Larger values
#' result in more global views of the manifold, while smaller values result in
#' more local data being preserved. In general values should be in the range
#' \code{2} to \code{100}.
#' @param n_components The dimension of the space to embed into.
This defaults #' to \code{2} to provide easy visualization, but can reasonably be set to any #' integer value in the range \code{2} to \code{100}. #' @param metric Type of distance metric to use to find nearest neighbors. One #' of: #' \itemize{ #' \item \code{"euclidean"} (the default) #' \item \code{"cosine"} #' \item \code{"manhattan"} #' \item \code{"hamming"} #' \item \code{"categorical"} (see below) #' } #' Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the #' distance metric is always "euclidean"). #' #' If \code{X} is a data frame or matrix, then multiple metrics can be #' specified, by passing a list to this argument, where the name of each item in #' the list is one of the metric names above. The value of each list item should #' be a vector giving the names or integer ids of the columns to be included in #' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}. #' #' Each metric calculation results in a separate fuzzy simplicial set, which are #' intersected together to produce the final set. Metric names can be repeated. #' Because non-numeric columns are removed from the data frame, it is safer to #' use column names than integer ids. #' #' Factor columns can also be used by specifying the metric name #' \code{"categorical"}. Factor columns are treated different from numeric #' columns and although multiple factor columns can be specified in a vector, #' each factor column specified is processed individually. If you specify #' a non-factor column, it will be coerced to a factor. #' #' For a given data block, you may override the \code{pca} and \code{pca_center} #' arguments for that block, by providing a list with one unnamed item #' containing the column names or ids, and then any of the \code{pca} or #' \code{pca_center} overrides as named items, e.g. \code{metric = #' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. 
This #' exists to allow mixed binary and real-valued data to be included and to have #' PCA applied to both, but with centering applied only to the real-valued data #' (it is typical not to apply centering to binary data before PCA is applied). #' @param n_epochs Number of epochs to use during the optimization of the #' embedded coordinates. By default, this value is set to \code{500} for datasets #' containing 10,000 vertices or less, and \code{200} otherwise. #' @param learning_rate Initial learning rate used in optimization of the #' coordinates. #' @param scale Scaling to apply to \code{X} if it is a data frame or matrix: #' \itemize{ #' \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling. #' \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to #' zero mean and variance 1. #' \item{\code{"maxabs"}} Center each column to mean 0, then divide each #' element by the maximum absolute value over the entire matrix. #' \item{\code{"range"}} Range scale the entire matrix, so the smallest #' element is 0 and the largest is 1. #' \item{\code{"colrange"}} Scale each column in the range (0,1). #' } #' For t-UMAP, the default is \code{"none"}. #' @param init Type of initialization for the coordinates. Options are: #' \itemize{ #' \item \code{"spectral"} Spectral embedding using the normalized Laplacian #' of the fuzzy 1-skeleton, with Gaussian noise added. #' \item \code{"normlaplacian"}. Spectral embedding using the normalized #' Laplacian of the fuzzy 1-skeleton, without noise. #' \item \code{"random"}. Coordinates assigned using a uniform random #' distribution between -10 and 10. #' \item \code{"lvrandom"}. Coordinates assigned using a Gaussian #' distribution with standard deviation 1e-4, as used in LargeVis #' (Tang et al., 2016) and t-SNE. #' \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap #' (Belkin and Niyogi, 2002). #' \item \code{"pca"}. 
The first two principal components from PCA of #' \code{X} if \code{X} is a data frame, and from a 2-dimensional classical #' MDS if \code{X} is of class \code{"dist"}. #' \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled #' so the standard deviation is 1e-4, to give a distribution similar to that #' used in t-SNE. This is an alias for \code{init = "pca", init_sdev = #' 1e-4}. #' \item \code{"agspectral"} An "approximate global" modification of #' \code{"spectral"} which all edges in the graph to a value of 1, and then #' sets a random number of edges (\code{negative_sample_rate} edges per #' vertex) to 0.1, to approximate the effect of non-local affinities. #' \item A matrix of initial coordinates. #' } #' For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"}, #' \code{"laplacian"}), if more than one connected component is identified, #' each connected component is initialized separately and the results are #' merged. If \code{verbose = TRUE} the number of connected components are #' logged to the console. The existence of multiple connected components #' implies that a global view of the data cannot be attained with this #' initialization. Either a PCA-based initialization or increasing the value of #' \code{n_neighbors} may be more appropriate. #' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized #' coordinates (including any user-supplied matrix) to this standard #' deviation. By default no scaling is carried out, except when \code{init = #' "spca"}, in which case the value is \code{0.0001}. Scaling the input may #' help if the unscaled versions result in initial coordinates with large #' inter-point distances or outliers. This usually results in small gradients #' during optimization and very little progress being made to the layout. #' Shrinking the initial embedding by rescaling can help under these #' circumstances. 
Scaling the result of \code{init = "pca"} is usually #' recommended and \code{init = "spca"} as an alias for \code{init = "pca", #' init_sdev = 1e-4} but for the spectral initializations the scaled versions #' usually aren't necessary unless you are using a large value of #' \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). #' @param set_op_mix_ratio Interpolate between (fuzzy) union and intersection as #' the set operation used to combine local fuzzy simplicial sets to obtain a #' global fuzzy simplicial sets. Both fuzzy set operations use the product #' t-norm. The value of this parameter should be between \code{0.0} and #' \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while #' \code{0.0} will use a pure fuzzy intersection. #' @param local_connectivity The local connectivity required -- i.e. the number #' of nearest neighbors that should be assumed to be connected at a local #' level. The higher this value the more connected the manifold becomes #' locally. In practice this should be not more than the local intrinsic #' dimension of the manifold. #' @param bandwidth The effective bandwidth of the kernel if we view the #' algorithm as similar to Laplacian Eigenmaps. Larger values induce more #' connectivity and a more global view of the data, smaller values concentrate #' more locally. #' @param repulsion_strength Weighting applied to negative samples in low #' dimensional embedding optimization. Values higher than one will result in #' greater weight being given to negative samples. #' @param negative_sample_rate The number of negative edge/1-simplex samples to #' use per positive edge/1-simplex sample in optimizing the low dimensional #' embedding. #' @param nn_method Method for finding nearest neighbors. Options are: #' \itemize{ #' \item \code{"fnn"}. Use exact nearest neighbors via the #' \href{https://cran.r-project.org/package=FNN}{FNN} package. 
#' \item \code{"annoy"} Use approximate nearest neighbors via the #' \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package. #' } #' By default, if \code{X} has less than 4,096 vertices, the exact nearest #' neighbors are found. Otherwise, approximate nearest neighbors are used. #' You may also pass precalculated nearest neighbor data to this argument. It #' must be a list consisting of two elements: #' \itemize{ #' \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix #' containing the integer indexes of the nearest neighbors in \code{X}. Each #' vertex is considered to be its own nearest neighbor, i.e. #' \code{idx[, 1] == 1:n_vertices}. #' \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix #' containing the distances of the nearest neighbors. #' } #' Multiple nearest neighbor data (e.g. from two different precomputed #' metrics) can be passed by passing a list containing the nearest neighbor #' data lists as items. #' The \code{n_neighbors} parameter is ignored when using precalculated #' nearest neighbor data. #' @param n_trees Number of trees to build when constructing the nearest #' neighbor index. The more trees specified, the larger the index, but the #' better the results. With \code{search_k}, determines the accuracy of the #' Annoy nearest neighbor search. Only used if the \code{nn_method} is #' \code{"annoy"}. Sensible values are between \code{10} to \code{100}. #' @param search_k Number of nodes to search during the neighbor retrieval. The #' larger k, the more the accurate results, but the longer the search takes. #' With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor #' search. Only used if the \code{nn_method} is \code{"annoy"}. #' @param y Optional target data for supervised dimension reduction. Can be a #' vector, matrix or data frame. Use the \code{target_metric} parameter to #' specify the metrics to use, using the same syntax as \code{metric}. 
Usually #' either a single numeric or factor column is used, but more complex formats #' are possible. The following types are allowed: #' \itemize{ #' \item Factor columns with the same length as \code{X}. \code{NA} is #' allowed for any observation with an unknown level, in which case #' UMAP operates as a form of semi-supervised learning. Each column is #' treated separately. #' \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the #' parameter \code{target_n_neighbors} to set the number of neighbors used #' with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors, #' numeric columns are grouped into one block unless \code{target_metric} #' specifies otherwise. For example, if you wish columns \code{a} and #' \code{b} to be treated separately, specify #' \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise, #' the data will be effectively treated as a matrix with two columns. #' \item Nearest neighbor data, consisting of a list of two matrices, #' \code{idx} and \code{dist}. These represent the precalculated nearest #' neighbor indices and distances, respectively. This #' is the same format as that expected for precalculated data in #' \code{nn_method}. This format assumes that the underlying data was a #' numeric vector. Any user-supplied value of the \code{target_n_neighbors} #' parameter is ignored in this case, because the the number of columns in #' the matrices is used for the value. Multiple nearest neighbor data using #' different metrics can be supplied by passing a list of these lists. #' } #' Unlike \code{X}, all factor columns included in \code{y} are automatically #' used. #' @param target_n_neighbors Number of nearest neighbors to use to construct the #' target simplicial set. Default value is \code{n_neighbors}. Applies only if #' \code{y} is non-\code{NULL} and \code{numeric}. #' @param target_metric The metric used to measure distance for \code{y} if #' using supervised dimension reduction. 
Used only if \code{y} is numeric.
#' @param target_weight Weighting factor between data topology and target
#'   topology. A value of 0.0 weights entirely on data, a value of 1.0 weights
#'   entirely on target. The default of 0.5 balances the weighting equally
#'   between data and target. Only applies if \code{y} is non-\code{NULL}.
#' @param pca If set to a positive integer value, reduce data to this number of
#'   columns using PCA. Not applied if the distance \code{metric} is
#'   \code{"hamming"}, or the dimensions of the data are larger than the
#'   number specified (i.e. number of rows and columns must be larger than the
#'   value of this parameter). If you have > 100 columns in a data frame or
#'   matrix, reducing the number of columns in this way may substantially
#'   increase the performance of the nearest neighbor search at the cost of a
#'   potential decrease in accuracy. In many t-SNE applications, a value of 50
#'   is recommended, although there's no guarantee that this is appropriate for
#'   all settings.
#' @param pca_center If \code{TRUE}, center the columns of \code{X} before
#'   carrying out PCA. For binary data, it's recommended to set this to
#'   \code{FALSE}.
#' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill,
#'   2014) during optimization. Otherwise, use the faster (but probably less
#'   statistically good) Tausworthe "taus88" generator. The default is
#'   \code{TRUE}.
#' @param fast_sgd If \code{TRUE}, then the following combination of parameters
#'   is set: \code{pcg_rand = FALSE} and \code{n_sgd_threads = "auto"}. The
#'   default is \code{FALSE}. Setting this to \code{TRUE} will speed up the
#'   stochastic optimization phase, but give a potentially less accurate
#'   embedding, which will not be exactly reproducible even with a fixed
#'   seed. For visualization, \code{fast_sgd = TRUE} will give perfectly good
#'   results. For more generic dimensionality reduction, it's safer to leave
#'   \code{fast_sgd = FALSE}.
If \code{fast_sgd = TRUE}, then user-supplied #' values of \code{pcg_rand} and \code{n_sgd_threads}, are ignored. #' @param ret_model If \code{TRUE}, then return extra data that can be used to #' add new data to an existing embedding via \code{\link{umap_transform}}. The #' embedded coordinates are returned as the list item \code{embedding}. If #' \code{FALSE}, just return the coordinates. This parameter can be used in #' conjunction with \code{ret_nn}. Note that some settings are incompatible #' with the production of a UMAP model: external neighbor data (passed via a #' list to \code{nn_method}), and factor columns that were included #' via the \code{metric} parameter. In the latter case, the model produced is #' based only on the numeric data. A transformation using new data is #' possible, but the factor columns in the new data are ignored. #' @param ret_nn If \code{TRUE}, then in addition to the embedding, also return #' nearest neighbor data that can be used as input to \code{nn_method} to #' avoid the overhead of repeatedly calculating the nearest neighbors when #' manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs}, #' \code{init}). See the "Value" section for the names of the list items. If #' \code{FALSE}, just return the coordinates. Note that the nearest neighbors #' could be sensitive to data scaling, so be wary of reusing nearest neighbor #' data if modifying the \code{scale} parameter. This parameter can be used in #' conjunction with \code{ret_model}. #' @param n_threads Number of threads to use (except during stochastic gradient #' descent). Default is half that recommended by RcppParallel. For #' nearest neighbor search, only applies if \code{nn_method = "annoy"}. If #' \code{n_threads > 1}, then the Annoy index will be temporarily written to #' disk in the location determined by \code{\link[base]{tempfile}}. #' @param n_sgd_threads Number of threads to use during stochastic gradient #' descent. 
If set to > 1, then results will not be reproducible, even if
#'   `set.seed` is called with a fixed seed before running. Set to
#'   \code{"auto"} to use the same value as \code{n_threads}.
#' @param grain_size Minimum batch size for multithreading. If the number of
#'   items to process in a thread falls below this number, then no threads will
#'   be used. Used in conjunction with \code{n_threads} and
#'   \code{n_sgd_threads}.
#' @param tmpdir Temporary directory to store nearest neighbor indexes during
#'   nearest neighbor search. Default is \code{\link{tempdir}}. The index is
#'   only written to disk if \code{n_threads > 1} and
#'   \code{nn_method = "annoy"}; otherwise, this parameter is ignored.
#' @param verbose If \code{TRUE}, log details to the console.
#' @return A matrix of optimized coordinates, or:
#' \itemize{
#'   \item if \code{ret_model = TRUE}, returns a
#'   list containing extra information that can be used to add new data to an
#'   existing embedding via \code{\link{umap_transform}}. In this case, the
#'   coordinates are available in the list item \code{embedding}.
#'   \item if \code{ret_nn = TRUE}, returns the nearest neighbor data as a
#'   list called \code{nn}. This contains one list for each \code{metric}
#'   calculated, itself containing a matrix \code{idx} with the integer ids of
#'   the neighbors; and a matrix \code{dist} with the distances. The \code{nn}
#'   list (or a sub-list) can be used as input to the \code{nn_method}
#'   parameter.
#' }
#' Both \code{ret_model} and \code{ret_nn} can be \code{TRUE}, in which case
#' the returned list contains the combined data.
#'
#' @examples
#' iris_tumap <- tumap(iris, n_neighbors = 50, learning_rate = 0.5)
#' @export
tumap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
                  n_epochs = NULL, learning_rate = 1, scale = FALSE,
                  init = "spectral", init_sdev = NULL, set_op_mix_ratio = 1.0,
                  local_connectivity = 1.0, bandwidth = 1.0,
                  repulsion_strength = 1.0, negative_sample_rate = 5.0,
                  nn_method = NULL, n_trees = 50,
                  search_k = 2 * n_neighbors * n_trees,
                  n_threads = max(1, RcppParallel::defaultNumThreads() / 2),
                  n_sgd_threads = 0, grain_size = 1, y = NULL,
                  target_n_neighbors = n_neighbors, target_metric = "euclidean",
                  target_weight = 0.5, pca = NULL, pca_center = TRUE,
                  pcg_rand = TRUE, fast_sgd = FALSE, ret_model = FALSE,
                  ret_nn = FALSE, tmpdir = tempdir(),
                  verbose = getOption("verbose", TRUE)) {
  # Thin wrapper around the internal uwot() driver with method = "tumap".
  # The t-UMAP output weight curve has no tunable parameters, so a, b,
  # spread and min_dist are all forwarded as NULL.
  uwot(
    X = X,
    method = "tumap",
    # nearest neighbor search
    n_neighbors = n_neighbors, metric = metric, nn_method = nn_method,
    n_trees = n_trees, search_k = search_k,
    # input preprocessing
    scale = scale, pca = pca, pca_center = pca_center,
    # initialization and output dimension
    init = init, init_sdev = init_sdev, n_components = n_components,
    # graph construction
    set_op_mix_ratio = set_op_mix_ratio,
    local_connectivity = local_connectivity, bandwidth = bandwidth,
    # optimization
    n_epochs = n_epochs, alpha = learning_rate, gamma = repulsion_strength,
    negative_sample_rate = negative_sample_rate,
    pcg_rand = pcg_rand, fast_sgd = fast_sgd,
    # no curve parameters for t-UMAP
    a = NULL, b = NULL, spread = NULL, min_dist = NULL,
    # supervised embedding
    y = y, target_n_neighbors = target_n_neighbors,
    target_metric = target_metric, target_weight = target_weight,
    # threading and temporary storage
    n_threads = n_threads, n_sgd_threads = n_sgd_threads,
    grain_size = grain_size, tmpdir = tmpdir,
    # return values and logging
    ret_model = ret_model, ret_nn = ret_nn, verbose = verbose
  )
}
#' Dimensionality Reduction with a LargeVis-like method
#'
#' Carry out dimensionality reduction of a dataset using a method similar to
#' LargeVis (Tang et al., 2016).
#'
#' \code{lvish} differs from the official LargeVis implementation in the
#' following:
#'
#' \itemize{
#'   \item Only the nearest-neighbor index search phase is multi-threaded.
#' \item Matrix input data is not normalized. #' \item The \code{n_trees} parameter cannot be dynamically chosen based on #' data set size. #' \item Nearest neighbor results are not refined via the #' neighbor-of-my-neighbor method. The \code{search_k} parameter is twice #' as large than default to compensate. #' \item Gradient values are clipped to \code{4.0} rather than \code{5.0}. #' \item Negative edges are generated by uniform sampling of vertexes rather #' than their degree ^ 0.75. #' \item The default number of samples is much reduced. The default number of #' epochs, \code{n_epochs}, is set to \code{5000}, much larger than for #' \code{\link{umap}}, but may need to be increased further depending on your #' dataset. Using \code{init = "spectral"} can help. #' } #' #' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}}, #' \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}. A #' sparse matrix is interpreted as a distance matrix and both implicit and #' explicit zero entries are ignored. Set zero distances you want to keep to #' an arbitrarily small non-zero value (e.g. \code{1e-10}). Matrix and data #' frames should contain one observation per row. Data frames will have any #' non-numeric columns removed, although factor columns will be used if #' explicitly included via \code{metric} (see the help for \code{metric} for #' details). Can be \code{NULL} if precomputed nearest neighbor data is passed #' to \code{nn_method}, and \code{init} is not \code{"spca"} or \code{"pca"}. #' @param perplexity Controls the size of the local neighborhood used for #' manifold approximation. This is the analogous to \code{n_neighbors} in #' \code{\link{umap}}. Change this, rather than \code{n_neighbors}. #' @param n_neighbors The number of neighbors to use when calculating the #' \code{perplexity}. Usually set to three times the value of the #' \code{perplexity}. Must be at least as large as \code{perplexity}. 
#' @param n_components The dimension of the space to embed into. This defaults #' to \code{2} to provide easy visualization, but can reasonably be set to any #' integer value in the range \code{2} to \code{100}. #' @param metric Type of distance metric to use to find nearest neighbors. One #' of: #' \itemize{ #' \item \code{"euclidean"} (the default) #' \item \code{"cosine"} #' \item \code{"manhattan"} #' \item \code{"hamming"} #' \item \code{"categorical"} (see below) #' } #' Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the #' distance metric is always "euclidean"). #' #' If \code{X} is a data frame or matrix, then multiple metrics can be #' specified, by passing a list to this argument, where the name of each item in #' the list is one of the metric names above. The value of each list item should #' be a vector giving the names or integer ids of the columns to be included in #' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}. #' #' Each metric calculation results in a separate fuzzy simplicial set, which are #' intersected together to produce the final set. Metric names can be repeated. #' Because non-numeric columns are removed from the data frame, it is safer to #' use column names than integer ids. #' #' Factor columns can also be used by specifying the metric name #' \code{"categorical"}. Factor columns are treated different from numeric #' columns and although multiple factor columns can be specified in a vector, #' each factor column specified is processed individually. If you specify #' a non-factor column, it will be coerced to a factor. #' #' For a given data block, you may override the \code{pca} and \code{pca_center} #' arguments for that block, by providing a list with one unnamed item #' containing the column names or ids, and then any of the \code{pca} or #' \code{pca_center} overrides as named items, e.g. \code{metric = #' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. 
This #' exists to allow mixed binary and real-valued data to be included and to have #' PCA applied to both, but with centering applied only to the real-valued data #' (it is typical not to apply centering to binary data before PCA is applied). #' @param n_epochs Number of epochs to use during the optimization of the #' embedded coordinates. The default is calculate the number of epochs #' dynamically based on dataset size, to give the same number of edge samples #' as the LargeVis defaults. This is usually substantially larger than the #' UMAP defaults. #' @param learning_rate Initial learning rate used in optimization of the #' coordinates. #' @param scale Scaling to apply to \code{X} if it is a data frame or matrix: #' \itemize{ #' \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling. #' \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to #' zero mean and variance 1. #' \item{\code{"maxabs"}} Center each column to mean 0, then divide each #' element by the maximum absolute value over the entire matrix. #' \item{\code{"range"}} Range scale the entire matrix, so the smallest #' element is 0 and the largest is 1. #' \item{\code{"colrange"}} Scale each column in the range (0,1). #' } #' For lvish, the default is \code{"maxabs"}, for consistency with LargeVis. #' @param init Type of initialization for the coordinates. Options are: #' \itemize{ #' \item \code{"spectral"} Spectral embedding using the normalized Laplacian #' of the fuzzy 1-skeleton, with Gaussian noise added. #' \item \code{"normlaplacian"}. Spectral embedding using the normalized #' Laplacian of the fuzzy 1-skeleton, without noise. #' \item \code{"random"}. Coordinates assigned using a uniform random #' distribution between -10 and 10. #' \item \code{"lvrandom"}. Coordinates assigned using a Gaussian #' distribution with standard deviation 1e-4, as used in LargeVis #' (Tang et al., 2016) and t-SNE. #' \item \code{"laplacian"}. 
Spectral embedding using the Laplacian Eigenmap #' (Belkin and Niyogi, 2002). #' \item \code{"pca"}. The first two principal components from PCA of #' \code{X} if \code{X} is a data frame, and from a 2-dimensional classical #' MDS if \code{X} is of class \code{"dist"}. #' \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled #' so the standard deviation is 1e-4, to give a distribution similar to that #' used in t-SNE and LargeVis. This is an alias for \code{init = "pca", #' init_sdev = 1e-4}. #' \item \code{"agspectral"} An "approximate global" modification of #' \code{"spectral"} which all edges in the graph to a value of 1, and then #' sets a random number of edges (\code{negative_sample_rate} edges per #' vertex) to 0.1, to approximate the effect of non-local affinities. #' \item A matrix of initial coordinates. #' } #' For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"}, #' \code{"laplacian"}), if more than one connected component is identified, #' each connected component is initialized separately and the results are #' merged. If \code{verbose = TRUE} the number of connected components are #' logged to the console. The existence of multiple connected components #' implies that a global view of the data cannot be attained with this #' initialization. Either a PCA-based initialization or increasing the value of #' \code{n_neighbors} may be more appropriate. #' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized #' coordinates (including any user-supplied matrix) to this standard #' deviation. By default no scaling is carried out, except when \code{init = #' "spca"}, in which case the value is \code{0.0001}. Scaling the input may #' help if the unscaled versions result in initial coordinates with large #' inter-point distances or outliers. This usually results in small gradients #' during optimization and very little progress being made to the layout. 
#' Shrinking the initial embedding by rescaling can help under these #' circumstances. Scaling the result of \code{init = "pca"} is usually #' recommended and \code{init = "spca"} as an alias for \code{init = "pca", #' init_sdev = 1e-4} but for the spectral initializations the scaled versions #' usually aren't necessary unless you are using a large value of #' \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). #' @param repulsion_strength Weighting applied to negative samples in low #' dimensional embedding optimization. Values higher than one will result in #' greater weight being given to negative samples. #' @param negative_sample_rate The number of negative edge/1-simplex samples to #' use per positive edge/1-simplex sample in optimizing the low dimensional #' embedding. #' @param nn_method Method for finding nearest neighbors. Options are: #' \itemize{ #' \item \code{"fnn"}. Use exact nearest neighbors via the #' \href{https://cran.r-project.org/package=FNN}{FNN} package. #' \item \code{"annoy"} Use approximate nearest neighbors via the #' \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package. #' } #' By default, if \code{X} has less than 4,096 vertices, the exact nearest #' neighbors are found. Otherwise, approximate nearest neighbors are used. #' You may also pass precalculated nearest neighbor data to this argument. It #' must be a list consisting of two elements: #' \itemize{ #' \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix #' containing the integer indexes of the nearest neighbors in \code{X}. Each #' vertex is considered to be its own nearest neighbor, i.e. #' \code{idx[, 1] == 1:n_vertices}. #' \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix #' containing the distances of the nearest neighbors. #' } #' Multiple nearest neighbor data (e.g. from two different precomputed #' metrics) can be passed by passing a list containing the nearest neighbor #' data lists as items. 
#' The \code{n_neighbors} parameter is ignored when using precomputed
#'   nearest neighbor data.
#' @param n_trees Number of trees to build when constructing the nearest
#'   neighbor index. The more trees specified, the larger the index, but the
#'   better the results. With \code{search_k}, determines the accuracy of the
#'   Annoy nearest neighbor search. Only used if the \code{nn_method} is
#'   \code{"annoy"}. Sensible values are between \code{10} and \code{100}.
#' @param search_k Number of nodes to search during the neighbor retrieval. The
#'   larger k, the more accurate the results, but the longer the search takes.
#'   With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor
#'   search. Only used if the \code{nn_method} is \code{"annoy"}.
#' @param n_threads Number of threads to use (except during stochastic gradient
#'   descent). Default is half that recommended by RcppParallel. For
#'   nearest neighbor search, only applies if \code{nn_method = "annoy"}. If
#'   \code{n_threads > 1}, then the Annoy index will be temporarily written to
#'   disk in the location determined by \code{\link[base]{tempfile}}.
#' @param n_sgd_threads Number of threads to use during stochastic gradient
#'   descent. If set to > 1, then results will not be reproducible, even if
#'   `set.seed` is called with a fixed seed before running. Set to
#'   \code{"auto"} to use the same value as \code{n_threads}.
#' @param grain_size Minimum batch size for multithreading. If the number of
#'   items to process in a thread falls below this number, then no threads will
#'   be used. Used in conjunction with \code{n_threads} and
#'   \code{n_sgd_threads}.
#' @param kernel Type of kernel function to create input probabilities. Can be
#'   one of \code{"gauss"} (the default) or \code{"knn"}. \code{"gauss"} uses
#'   the usual Gaussian weighted similarities. \code{"knn"} assigns equal
#'   probabilities to every edge in the nearest neighbor graph, and zero
#'   otherwise, using \code{perplexity} nearest neighbors.
The \code{n_neighbors}
#'   parameter is ignored in this case.
#' @param pca If set to a positive integer value, reduce data to this number of
#'   columns using PCA. Not applied if the distance \code{metric} is
#'   \code{"hamming"}, or the dimensions of the data are larger than the
#'   number specified (i.e. number of rows and columns must be larger than the
#'   value of this parameter). If you have > 100 columns in a data frame or
#'   matrix, reducing the number of columns in this way may substantially
#'   increase the performance of the nearest neighbor search at the cost of a
#'   potential decrease in accuracy. In many t-SNE applications, a value of 50
#'   is recommended, although there's no guarantee that this is appropriate for
#'   all settings.
#' @param pca_center If \code{TRUE}, center the columns of \code{X} before
#'   carrying out PCA. For binary data, it's recommended to set this to
#'   \code{FALSE}.
#' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill,
#'   2014) during optimization. Otherwise, use the faster (but probably less
#'   statistically good) Tausworthe "taus88" generator. The default is
#'   \code{TRUE}.
#' @param fast_sgd If \code{TRUE}, then the following combination of parameters
#'   is set: \code{pcg_rand = FALSE} and \code{n_sgd_threads = "auto"}. The
#'   default is \code{FALSE}. Setting this to \code{TRUE} will speed up the
#'   stochastic optimization phase, but give a potentially less accurate
#'   embedding, which will not be exactly reproducible even with a fixed
#'   seed. For visualization, \code{fast_sgd = TRUE} will give perfectly good
#'   results. For more generic dimensionality reduction, it's safer to leave
#'   \code{fast_sgd = FALSE}. If \code{fast_sgd = TRUE}, then user-supplied
#'   values of \code{pcg_rand} and \code{n_sgd_threads} are ignored.
#' @param ret_nn If \code{TRUE}, then in addition to the embedding, also return #' nearest neighbor data that can be used as input to \code{nn_method} to #' avoid the overhead of repeatedly calculating the nearest neighbors when #' manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs}, #' \code{init}). See the "Value" section for the names of the list items. If #' \code{FALSE}, just return the coordinates. Note that the nearest neighbors #' could be sensitive to data scaling, so be wary of reusing nearest neighbor #' data if modifying the \code{scale} parameter. #' @param tmpdir Temporary directory to store nearest neighbor indexes during #' nearest neighbor search. Default is \code{\link{tempdir}}. The index is #' only written to disk if \code{n_threads > 1} and #' \code{nn_method = "annoy"}; otherwise, this parameter is ignored. #' @param verbose If \code{TRUE}, log details to the console. #' @return A matrix of optimized coordinates, or if \code{ret_nn = TRUE}, #' returns the nearest neighbor data as a list containing a matrix \code{idx} #' with the integer ids of the neighbors; and a matrix \code{dist} with the #' distances. This list can be used as input to the \code{nn_method} #' parameter. #' @references #' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). #' Visualizing large-scale and high-dimensional data. #' In \emph{Proceedings of the 25th International Conference on World Wide Web} #' (pp. 287-297). #' International World Wide Web Conferences Steering Committee. 
#' \url{https://arxiv.org/abs/1602.00370}
#'
#' @examples
#' # Default number of epochs is much larger than for UMAP, assumes random
#' # initialization
#' # If using a more global initialization, can use fewer epochs
#' iris_lvish_short <- lvish(iris,
#'   perplexity = 50, n_epochs = 200,
#'   init = "pca"
#' )
#'
#' # Use perplexity rather than n_neighbors to control the size of the local
#' # neighborhood
#' # 200 epochs may be too small for a random initialization
#' iris_lvish <- lvish(iris,
#'   perplexity = 50, learning_rate = 0.5,
#'   init = "random", n_epochs = 200
#' )
#' @export
lvish <- function(X, perplexity = 50, n_neighbors = perplexity * 3,
                  n_components = 2, metric = "euclidean", n_epochs = -1,
                  learning_rate = 1, scale = "maxabs", init = "lvrandom",
                  init_sdev = NULL, repulsion_strength = 7,
                  negative_sample_rate = 5.0, nn_method = NULL, n_trees = 50,
                  search_k = 2 * n_neighbors * n_trees,
                  n_threads = max(1, RcppParallel::defaultNumThreads() / 2),
                  n_sgd_threads = 0, grain_size = 1, kernel = "gauss",
                  pca = NULL, pca_center = TRUE, pcg_rand = TRUE,
                  fast_sgd = FALSE, ret_nn = FALSE, tmpdir = tempdir(),
                  verbose = getOption("verbose", TRUE)) {
  # Delegate to the shared internal uwot() driver with method = "largevis".
  # Note the argument renames on the way through: learning_rate -> alpha and
  # repulsion_strength -> gamma. perplexity (not n_neighbors) controls the
  # size of the local neighborhood for the LargeVis input probabilities.
  uwot(X,
    n_neighbors = n_neighbors, n_components = n_components, metric = metric,
    n_epochs = n_epochs, alpha = learning_rate, scale = scale, init = init,
    init_sdev = init_sdev, gamma = repulsion_strength,
    negative_sample_rate = negative_sample_rate, nn_method = nn_method,
    n_trees = n_trees, search_k = search_k, method = "largevis",
    perplexity = perplexity, pca = pca, pca_center = pca_center,
    n_threads = n_threads, n_sgd_threads = n_sgd_threads,
    grain_size = grain_size, kernel = kernel, ret_nn = ret_nn,
    pcg_rand = pcg_rand, fast_sgd = fast_sgd, tmpdir = tmpdir,
    verbose = verbose
  )
}

# Internal workhorse shared by the exported embedding functions (umap, tumap,
# lvish): validates arguments, reads/preprocesses the input, builds the
# neighbor graph and runs the embedding optimization. Not exported.
uwot <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
                 n_epochs = NULL, alpha = 1, scale = FALSE, init = "spectral",
                 init_sdev = NULL, spread = 1, min_dist = 0.01,
                 set_op_mix_ratio =
1.0, local_connectivity = 1.0, bandwidth = 1.0, gamma = 1.0, negative_sample_rate = 5.0, a = NULL, b = NULL, nn_method = NULL, n_trees = 50, search_k = 2 * n_neighbors * n_trees, method = "umap", perplexity = 50, approx_pow = FALSE, y = NULL, target_n_neighbors = n_neighbors, target_metric = "euclidean", target_weight = 0.5, n_threads = max(1, RcppParallel::defaultNumThreads() / 2), n_sgd_threads = 0, grain_size = 1, kernel = "gauss", ret_model = FALSE, ret_nn = FALSE, pca = NULL, pca_center = TRUE, pcg_rand = TRUE, fast_sgd = FALSE, tmpdir = tempdir(), verbose = getOption("verbose", TRUE)) { if (method == "umap" && (is.null(a) || is.null(b))) { ab_res <- find_ab_params(spread = spread, min_dist = min_dist) a <- ab_res[1] b <- ab_res[2] tsmessage("UMAP embedding parameters a = ", formatC(a), " b = ", formatC(b)) } if (n_neighbors < 2) { stop("n_neighbors must be >= 2") } if (set_op_mix_ratio < 0.0 || set_op_mix_ratio > 1.0) { stop("set_op_mix_ratio must be between 0.0 and 1.0") } if (local_connectivity < 1.0) { stop("local_connectivity cannot be < 1.0") } if (!is.null(y) && is.numeric(y) && any(is.na(y))) { stop("numeric y cannot contain NA") } if (!is.numeric(n_components) || n_components < 1) { stop("'n_components' must be a positive integer") } if (!is.null(pca)) { if (!is.numeric(pca) || pca < 1) { stop("'pca' must be a positive integer") } if (pca < n_components) { stop("'pca' must be >= n_components") } } if (fast_sgd) { n_sgd_threads <- "auto" pcg_rand <- FALSE approx_pow <- TRUE } if (n_threads < 0) { stop("n_threads cannot be < 0") } if (n_threads %% 1 != 0) { n_threads <- round(n_threads) tsmessage("Non-integer 'n_threads' provided. Setting to ", n_threads) } if (n_sgd_threads == "auto") { n_sgd_threads <- n_threads } if (n_sgd_threads < 0) { stop("n_sgd_threads cannot be < 0") } if (n_sgd_threads %% 1 != 0) { n_sgd_threads <- round(n_sgd_threads) tsmessage("Non-integer 'n_sgd_threads' provided. 
Setting to ", n_sgd_threads) } if (n_threads > 0) { RcppParallel::setThreadOptions(numThreads = n_threads) } # Store categorical columns to be used to generate the graph Xcat <- NULL # number of original columns in data frame (or matrix) # will be used only if using df or matrix and ret_model = TRUE norig_col <- NULL if (is.null(X)) { if (!is.list(nn_method)) { stop("If X is NULL, must provide NN data in nn_method") } if (is.character(init) && tolower(init) %in% c("spca", "pca")) { stop("init = 'pca' and 'spca' can't be used with X = NULL") } n_vertices <- x2nv(nn_method) } else if (methods::is(X, "dist")) { if (ret_model) { stop("Can only create models with dense matrix or data frame input") } n_vertices <- attr(X, "Size") tsmessage("Read ", n_vertices, " rows") } else if (methods::is(X, "sparseMatrix")) { if (ret_model) { stop("Can only create models with dense matrix or data frame input") } n_vertices <- nrow(X) if (ncol(X) != n_vertices) { stop("Sparse matrices are only supported as distance matrices") } tsmessage("Read ", n_vertices, " rows of sparse distance matrix") } else { cat_ids <- NULL norig_col <- ncol(X) if (methods::is(X, "data.frame") || methods::is(X, "matrix")) { if (methods::is(X, "matrix")) { X <- data.frame(X) } cat_res <- find_categoricals(metric) metric <- cat_res$metrics cat_ids <- cat_res$categoricals # Convert categorical columns to factors if they aren't already if (!is.null(cat_ids)) { X[, cat_ids] <- lapply(X[, cat_ids, drop = FALSE], factor) Xcat <- X[, cat_ids, drop = FALSE] } indexes <- which(vapply(X, is.numeric, logical(1))) if (length(indexes) == 0) { stop("No numeric columns found") } X <- as.matrix(X[, indexes]) } n_vertices <- nrow(X) tsmessage( "Read ", n_vertices, " rows and found ", ncol(X), " numeric columns", appendLF = is.null(cat_ids) ) if (length(cat_ids) > 0) { tsmessage(" and ", pluralize("categorical column", length(cat_ids)), time_stamp = FALSE ) } X <- scale_input(X, scale_type = scale, ret_model = ret_model, 
verbose = verbose ) } if (method == "largevis" && kernel == "knn") { n_neighbors <- perplexity } if (n_neighbors > n_vertices) { # If nn_method is a list, we will determine n_neighbors later if (!is.list(nn_method)) { # Otherwise,for LargeVis, n_neighbors normally determined from perplexity # not an error to be too large if (method == "largevis") { tsmessage("Setting n_neighbors to ", n_vertices) n_neighbors <- n_vertices } else { stop("n_neighbors must be smaller than the dataset size") } } } if (!is.list(metric)) { metrics <- list(c()) names(metrics) <- metric } else { metrics <- metric } # For typical case of numeric matrix X and not using hamming distance, save # PCA results here in case initialization uses PCA too pca_models <- NULL pca_shortcut <- FALSE if (!is.null(pca) && length(metric) == 1 && metric != "hamming" && is.matrix(X) && ncol(X) > pca) { tsmessage("Reducing X column dimension to ", pca, " via PCA") pca_res <- pca_scores(X, ncol = pca, center = pca_center, ret_extra = ret_model, verbose = verbose ) if (ret_model) { X <- pca_res$scores pca_models[["1"]] <- pca_res[c("center", "rotation")] pca_res <- NULL } else { X <- pca_res } pca_shortcut <- TRUE } d2sr <- data2set(X, Xcat, n_neighbors, metrics, nn_method, n_trees, search_k, method, set_op_mix_ratio, local_connectivity, bandwidth, perplexity, kernel, n_threads, grain_size, ret_model, pca = pca, pca_center = pca_center, n_vertices = n_vertices, tmpdir = tmpdir, verbose = verbose ) V <- d2sr$V nns <- d2sr$nns if (is.null(pca_models)) { pca_models <- d2sr$pca_models } if (!is.null(y)) { tsmessage("Processing y data") if (!is.list(target_metric)) { target_metrics <- list(c()) names(target_metrics) <- target_metric } else { target_metrics <- target_metric } ycat <- NULL ycat_ids <- NULL if (methods::is(y, "data.frame")) { ycat_res <- find_categoricals(target_metric) target_metric <- ycat_res$metrics ycat_ids <- ycat_res$categoricals if (!is.null(ycat_ids)) { ycat <- y[, ycat_ids, drop = FALSE] } else 
{ ycindexes <- which(vapply(y, is.factor, logical(1))) if (length(ycindexes) > 0) { ycat <- (y[, ycindexes, drop = FALSE]) } } yindexes <- which(vapply(y, is.numeric, logical(1))) if (length(yindexes) > 0) { y <- as.matrix(y[, yindexes]) } else { y <- NULL } } else if (is.list(y)) { nn_method <- y } else if (is.numeric(y)) { y <- as.matrix(y) } else if (is.factor(y)) { ycat <- data.frame(y) y <- NULL } if (!is.null(y)) { yd2sr <- data2set(y, ycat, target_n_neighbors, target_metrics, nn_method, n_trees, search_k, method, set_op_mix_ratio = 1.0, local_connectivity = 1.0, bandwidth = 1.0, perplexity = perplexity, kernel = kernel, n_threads = n_threads, grain_size = grain_size, ret_model = FALSE, pca = pca, n_vertices = n_vertices, tmpdir = tmpdir, verbose = verbose ) tsmessage( "Intersecting X and Y sets with target weight = ", formatC(target_weight) ) V <- set_intersect(V, yd2sr$V, target_weight, reset = TRUE) yd2sr$V <- NULL yd2sr$nns <- NULL } else if (!is.null(ycat)) { V <- categorical_intersection_df(ycat, V, weight = target_weight, verbose = verbose ) } } if (!(ret_model || ret_nn)) { nns <- NULL gc() } if (methods::is(init, "matrix")) { if (nrow(init) != n_vertices || ncol(init) != n_components) { stop("init matrix does not match necessary configuration for X") } tsmessage("Initializing from user-supplied matrix") embedding <- init } else { init <- match.arg(tolower(init), c( "spectral", "random", "lvrandom", "normlaplacian", "laplacian", "spca", "pca", "inormlaplacian", "ispectral", "agspectral" )) if (init_is_spectral(init)) { connected <- connected_components(V) if (connected$n_components > 1) { tsmessage( "Found ", connected$n_components, " connected components, ", "falling back to 'spca' initialization with init_sdev = 1" ) init <- "spca" init_sdev <- 1 } } # Don't repeat PCA initialization if we've already done it once if (pca_shortcut && init %in% c("spca", "pca") && pca >= n_components) { embedding <- X[, 1:n_components] if (init == "spca") { 
tsmessage("Initializing from scaled PCA") } else { tsmessage("Initializing from PCA") } } else { embedding <- switch(init, spectral = spectral_init(V, ndim = n_components, verbose = verbose), random = rand_init(n_vertices, n_components, verbose = verbose), lvrandom = rand_init_lv(n_vertices, n_components, verbose = verbose), normlaplacian = normalized_laplacian_init(V, ndim = n_components, verbose = verbose ), laplacian = laplacian_eigenmap(V, ndim = n_components, verbose = verbose), # we handle scaling pca below spca = pca_init(X, ndim = n_components, verbose = verbose), pca = pca_init(X, ndim = n_components, verbose = verbose), ispectral = irlba_spectral_init(V, ndim = n_components, verbose = verbose), inormlaplacian = irlba_normalized_laplacian_init(V, ndim = n_components, verbose = verbose ), agspectral = agspectral_init(V, n_neg_nbrs = negative_sample_rate, ndim = n_components, verbose = verbose ), stop("Unknown initialization method: '", init, "'") ) } if (!is.null(init_sdev) || init == "spca") { if (is.null(init_sdev)) { init_sdev <- 1e-4 } embedding <- shrink_coords(embedding, init_sdev) } } if (is.null(n_epochs) || n_epochs <= 0) { if (method == "largevis") { n_epochs <- lvish_epochs(n_vertices, V) } else { if (n_vertices <= 10000) { n_epochs <- 500 } else { n_epochs <- 200 } } } V@x[V@x < max(V@x) / n_epochs] <- 0 V <- Matrix::drop0(V) epochs_per_sample <- make_epochs_per_sample(V@x, n_epochs) positive_head <- V@i positive_tail <- Matrix::which(V != 0, arr.ind = TRUE)[, 2] - 1 tsmessage( "Commencing optimization for ", n_epochs, " epochs, with ", length(positive_head), " positive edges", pluralize("thread", n_sgd_threads, " using") ) parallelize <- n_sgd_threads > 0 if (n_sgd_threads > 0) { RcppParallel::setThreadOptions(numThreads = n_sgd_threads) } embedding <- t(embedding) if (tolower(method) == "umap") { embedding <- optimize_layout_umap( head_embedding = embedding, tail_embedding = NULL, positive_head = positive_head, positive_tail = positive_tail, 
n_epochs = n_epochs, n_vertices = n_vertices, epochs_per_sample = epochs_per_sample, a = a, b = b, gamma = gamma, initial_alpha = alpha, negative_sample_rate, approx_pow = approx_pow, pcg_rand = pcg_rand, parallelize = parallelize, grain_size = grain_size, move_other = TRUE, verbose = verbose ) } else if (method == "tumap") { embedding <- optimize_layout_tumap( head_embedding = embedding, tail_embedding = NULL, positive_head = positive_head, positive_tail = positive_tail, n_epochs = n_epochs, n_vertices = n_vertices, epochs_per_sample = epochs_per_sample, initial_alpha = alpha, negative_sample_rate = negative_sample_rate, pcg_rand = pcg_rand, parallelize = parallelize, grain_size = grain_size, move_other = TRUE, verbose = verbose ) } else { embedding <- optimize_layout_largevis( head_embedding = embedding, positive_head = positive_head, positive_tail = positive_tail, n_epochs = n_epochs, n_vertices = n_vertices, epochs_per_sample = epochs_per_sample, gamma = gamma, initial_alpha = alpha, negative_sample_rate = negative_sample_rate, pcg_rand = pcg_rand, parallelize = parallelize, grain_size = grain_size, verbose = verbose ) } embedding <- t(embedding) gc() # Center the points before returning embedding <- scale(embedding, center = TRUE, scale = FALSE) tsmessage("Optimization finished") if (ret_model || ret_nn) { nblocks <- length(nns) res <- list(embedding = embedding) if (ret_model) { res <- append(res, list( scale_info = attr_to_scale_info(X), n_neighbors = n_neighbors, # Can't use nn descent during transform, so if used in training, # double the Annoy search parameter to compensate search_k = search_k, local_connectivity = local_connectivity, n_epochs = n_epochs, alpha = alpha, negative_sample_rate = negative_sample_rate, method = method, a = a, b = b, gamma = gamma, approx_pow = approx_pow, metric = metrics, norig_col = norig_col, pcg_rand = pcg_rand )) if (nblocks > 1) { res$nn_index <- list() for (i in 1:nblocks) { res$nn_index[[i]] <- nns[[i]]$index } } else 
{ res$nn_index <- nns[[1]]$index } if (!is.null(pca_models)) { res$pca_models <- pca_models } } if (ret_nn) { res$nn <- list() for (i in 1:nblocks) { res$nn[[i]] <- list(idx = nns[[i]]$idx, dist = nns[[i]]$dist) } names(res$nn) <- names(nns) } } else { res <- embedding } res } #' Save or Load a Model #' #' Functions to write a UMAP model to a file, and to restore. #' #' @param model a UMAP model create by \code{\link{umap}}. #' @param file name of the file where the model is to be saved or read from. #' #' @examples #' # create model #' model <- umap(iris[1:100, ], ret_model = TRUE) #' #' # save #' model_file <- tempfile("iris_umap") #' save_uwot(model, file = model_file) #' #' # restore #' model2 <- load_uwot(file = model_file) #' #' identical(model, model2) #' #' unlink(model_file) #' @export save_uwot <- function(model, file) { wd <- getwd() tryCatch( { # create directory to store files in mod_dir <- tempfile(pattern = "dir") dir.create(mod_dir) uwot_dir <- file.path(mod_dir, "uwot") dir.create(uwot_dir) # save model model_tmpfname <- file.path(uwot_dir, "model") saveRDS(model, file = model_tmpfname) # save each nn index metrics <- names(model$metric) n_metrics <- length(metrics) for (i in 1:n_metrics) { nn_tmpfname <- file.path(uwot_dir, paste0("nn", i)) if (n_metrics == 1) { model$nn_index$save(nn_tmpfname) model$nn_index$unload() model$nn_index$load(nn_tmpfname) } else { model$nn_index[[i]]$save(nn_tmpfname) model$nn_index[[i]]$unload() model$nn_index[[i]]$load(nn_tmpfname) } } # archive the files under the temp dir into the single target file # change directory so the archive only contains one directory setwd(mod_dir) utils::tar(tarfile = file, files = "uwot/") }, finally = { setwd(wd) if (file.exists(mod_dir)) { unlink(mod_dir, recursive = TRUE) } } ) } #' Save or Load a Model #' #' Functions to write a UMAP model to a file, and to restore. #' #' @param file name of the file where the model is to be saved or read from. 
#' #' @examples #' # create model #' model <- umap(iris[1:100, ], ret_model = TRUE) #' #' # save #' model_file <- tempfile("iris_umap") #' save_uwot(model, file = model_file) #' #' # restore #' model2 <- load_uwot(file = model_file) #' #' identical(model, model2) #' #' unlink(model_file) #' @export load_uwot <- function(file) { model <- NULL tryCatch( { # create directory to store files in mod_dir <- tempfile(pattern = "dir") dir.create(mod_dir) utils::untar(file, exdir = mod_dir) model_fname <- file.path(mod_dir, "uwot/model") if (!file.exists(model_fname)) { stop("Can't find model in ", file) } model <- readRDS(file = model_fname) metrics <- names(model$metric) n_metrics <- length(metrics) for (i in 1:n_metrics) { nn_fname <- file.path(mod_dir, paste0("uwot/nn", i)) if (!file.exists(nn_fname)) { stop("Can't find nearest neighbor index ", nn_fname, " in ", file) } metric <- metrics[[i]] # 31: need to specify the index dimensionality when creating the index ann <- create_ann(metric, ndim = length(model$metric[[i]])) ann$load(nn_fname) if (n_metrics == 1) { model$nn_index <- ann } else { model$nn_index[[i]] <- ann } } }, finally = { if (file.exists(mod_dir)) { unlink(mod_dir, recursive = TRUE) } } ) model } # Get the number of vertices in X x2nv <- function(X) { if (is.list(X)) { if (!is.null(X$idx)) { n_vertices <- x2nv(X$idx) } else { if (length(X) > 0) { n_vertices <- x2nv(X[[1]]) } else { stop("Can't find n_vertices for list X") } } } else if (methods::is(X, "dist")) { n_vertices <- attr(X, "Size") } else if (methods::is(X, "sparseMatrix")) { n_vertices <- nrow(X) } else if (methods::is(X, "data.frame") || methods::is(X, "matrix")) { n_vertices <- nrow(X) } else if (is.numeric(X)) { n_vertices <- length(X) } else { stop("Can't find number of vertices for X of type '", class(X)[1], "'") } n_vertices } data2set <- function(X, Xcat, n_neighbors, metrics, nn_method, n_trees, search_k, method, set_op_mix_ratio, local_connectivity, bandwidth, perplexity, kernel, 
n_threads, grain_size, ret_model, n_vertices = x2nv(X), tmpdir = tempdir(), pca = NULL, pca_center = TRUE, verbose = FALSE) { V <- NULL nns <- list() nblocks <- length(metrics) # Check for precalculated NN data in nn_method if (is.list(nn_method)) { if (is.null(nn_method$idx)) { nblocks <- length(nn_method) if (nblocks == 0) { stop("Incorrect format for precalculated neighbor data") } } else { nblocks <- 1 # wrap nn data in a list so data is always a list of lists nn_method <- list(nn_method) } metrics <- replicate(nblocks, NULL, simplify = FALSE) names(metrics) <- rep("precomputed", nblocks) } if (nblocks > 1) { tsmessage("Found ", nblocks, " blocks of data") } mnames <- tolower(names(metrics)) if (is.null(nn_method)) { if (n_vertices < 4096 && !ret_model && all(mnames == "euclidean")) { tsmessage("Using FNN for neighbor search, n_neighbors = ", n_neighbors) nn_method <- "fnn" } else { tsmessage("Using Annoy for neighbor search, n_neighbors = ", n_neighbors) nn_method <- "annoy" } } pca_models <- list() for (i in 1:nblocks) { metric <- mnames[[i]] metric <- match.arg(metric, c( "euclidean", "cosine", "manhattan", "hamming", "precomputed" )) # Defaults for this block which can be overridden pca_i <- pca pca_center_i <- pca_center subset <- metrics[[i]] if (is.null(subset)) { Xsub <- X } else if (is.list(subset)) { # e.g. 
"euclidean" = list(1:10, pca_center = FALSE), lsres <- lsplit_unnamed(subset) if (is.null(lsres$unnamed)) { stop("Error: no subset provided for block ", i) } if (length(lsres$unnamed) != 1) { stop("Error: only one unnamed item should be provided for block ", i) } subset <- lsres$unnamed[[1]] # possible overrides if (!is.null(lsres$named)) { lsnamed <- lsres$named lsnames <- names(lsnamed) if (!is.null(lsnamed$pca_center)) { pca_center_i <- lsnamed$pca_center } # PCA argument can be NULL, so need to check if it was explicitly provided if ("pca" %in% lsnames) { pca_i <- lsnamed$pca } } Xsub <- X[, subset, drop = FALSE] } else { Xsub <- X[, subset, drop = FALSE] } if (!is.null(X) && is.matrix(X)) { block_size <- ncol(Xsub) if (block_size == 0) { stop("Block ", i, " has zero size") } if (nblocks > 1) { tsmessage( "Processing block ", i, " of ", nblocks, " with size ", block_size, " using metric '", metric, "'" ) } } else { # X is NULL or dist or something like that if (nblocks > 1) { tsmessage( "Processing block ", i, " of ", nblocks, " using metric '", metric, "'" ) } } if (!is.null(pca_i) && is.matrix(X) && metric != "hamming" && ncol(X) > pca_i && nrow(X) > pca_i) { tsmessage("Reducing column dimension to ", pca_i, " via PCA") pca_res <- pca_scores(Xsub, pca_i, ret_extra = ret_model, center = pca_center_i, verbose = verbose ) if (ret_model) { Xsub <- pca_res$scores pca_models[[as.character(i)]] <- pca_res[c("center", "rotation")] pca_res <- NULL } else { Xsub <- pca_res } } nn_sub <- nn_method # Extract this block of nn data from list of lists if (metric == "precomputed") { nn_sub <- nn_method[[i]] if (i == 1) { n_neighbors <- NULL } else { n_neighbors <- ncol(nn_method[[1]]$idx) } } x2set_res <- x2set(Xsub, n_neighbors, metric, nn_method = nn_sub, n_trees, search_k, method, set_op_mix_ratio, local_connectivity, bandwidth, perplexity, kernel, n_threads, grain_size, ret_model, n_vertices = n_vertices, tmpdir = tmpdir, verbose = verbose ) Vblock <- x2set_res$V nn <- 
x2set_res$nn
    nns[[i]] <- nn
    names(nns)[[i]] <- metric
    # first block fixes n_neighbors for any subsequent precomputed blocks
    n_neighbors <- ncol(nn$idx)
    if (is.null(V)) {
      V <- Vblock
    } else {
      # merge this block's fuzzy set with the running intersection
      V <- set_intersect(V, Vblock, weight = 0.5, reset = TRUE)
    }
  }
  if (!is.null(Xcat)) {
    V <- categorical_intersection_df(Xcat, V, weight = 0.5, verbose = verbose)
  }
  list(V = V, nns = nns, pca_models = pca_models)
}

# Find the nearest neighbors of X, either by taking user-supplied
# precalculated data in nn_method (validated against n_vertices), or by
# running an Annoy/FNN search. Returns a list with idx and dist matrices
# (and an index when ret_model = TRUE).
x2nn <- function(X, n_neighbors, metric, nn_method,
                 n_trees, search_k,
                 tmpdir = tempdir(),
                 n_threads, grain_size,
                 ret_model,
                 n_vertices = x2nv(X),
                 verbose = FALSE) {
  if (is.list(nn_method)) {
    # on first iteration n_neighbors is NULL
    # on subsequent iterations ensure n_neighbors is consistent for all data
    validate_nn(nn_method, n_vertices, n_neighbors = n_neighbors)
    nn <- nn_method
  } else {
    nn_method <- match.arg(tolower(nn_method), c("annoy", "fnn"))
    if (nn_method == "fnn" && metric != "euclidean") {
      stop(
        "nn_method = 'FNN' is only compatible with distance metric ",
        "'euclidean'"
      )
    }
    if (nn_method == "fnn" && ret_model) {
      # FNN does not build a reusable index, so it can't support transform
      stop("nn_method = 'FNN' is incompatible with ret_model = TRUE")
    }
    nn <- find_nn(X, n_neighbors,
      method = nn_method, metric = metric,
      n_trees = n_trees, search_k = search_k,
      tmpdir = tmpdir,
      n_threads = n_threads, grain_size = grain_size,
      ret_index = ret_model, verbose = verbose
    )
  }
  nn
}

# Validate user-supplied precalculated neighbor data: nn_method must contain
# an 'idx' and a 'dist' matrix, each with n_vertices rows and matching column
# counts. Stops with an informative error on any mismatch.
validate_nn <- function(nn_method, n_vertices, n_neighbors = NULL) {
  if (!is.matrix(nn_method$idx)) {
    stop("Couldn't find precalculated 'idx' matrix")
  }
  if (nrow(nn_method$idx) != n_vertices) {
    stop(
      "Precalculated 'idx' matrix must have ", n_vertices,
      " rows, but found ", nrow(nn_method$idx)
    )
  }
  # set n_neighbors from these matrices if it hasn't been already set
  if (is.null(n_neighbors)) {
    n_neighbors <- ncol(nn_method$idx)
  }
  if (!is.matrix(nn_method$dist)) {
    stop("Couldn't find precalculated 'dist' matrix")
  }
  # BUG FIX: this previously re-tested nrow(nn_method$idx) (a dead duplicate
  # of the check above), so a 'dist' matrix with the wrong number of rows
  # slipped through validation. Check the 'dist' matrix itself.
  if (nrow(nn_method$dist) != n_vertices) {
    stop(
      "Precalculated 'dist' matrix must have ", n_vertices,
      " rows, but found ", nrow(nn_method$dist)
    )
  }
  if (ncol(nn_method$dist) != n_neighbors) {
    stop(
      "Precalculated 'dist' matrix must have ", n_neighbors,
      " cols, but found ", ncol(nn_method$dist)
    )
  }
}

# Convert nearest neighbor data to a sparse affinity matrix: perplexity
# calibration for LargeVis, fuzzy simplicial set construction for UMAP/t-UMAP.
nn2set <- function(method, nn,
                   set_op_mix_ratio, local_connectivity, bandwidth,
                   perplexity, kernel,
                   n_threads, grain_size,
                   verbose = FALSE) {
  if (method == "largevis") {
    n_vertices <- nrow(nn$dist)
    if (perplexity >= n_vertices) {
      stop("perplexity can be no larger than ", n_vertices - 1)
    }
    V <- perplexity_similarities(
      nn = nn, perplexity = perplexity,
      n_threads = n_threads, grain_size = grain_size,
      kernel = kernel, verbose = verbose
    )
  } else {
    V <- fuzzy_simplicial_set(
      nn = nn,
      set_op_mix_ratio = set_op_mix_ratio,
      local_connectivity = local_connectivity,
      bandwidth = bandwidth,
      n_threads = n_threads, grain_size = grain_size,
      verbose = verbose
    )
  }
}

# Find nearest neighbors for X and convert them to the sparse affinity
# matrix used for optimization. Returns list(nn = ..., V = ...).
# Stops if non-finite distances or affinities are encountered.
x2set <- function(X, n_neighbors, metric, nn_method,
                  n_trees, search_k,
                  method,
                  set_op_mix_ratio, local_connectivity, bandwidth,
                  perplexity, kernel,
                  n_threads, grain_size,
                  ret_model,
                  n_vertices = x2nv(X),
                  tmpdir = tempdir(),
                  verbose = FALSE) {
  nn <- x2nn(X,
    n_neighbors = n_neighbors,
    metric = metric,
    nn_method = nn_method,
    n_trees = n_trees, search_k = search_k,
    tmpdir = tmpdir,
    n_threads = n_threads, grain_size = grain_size,
    ret_model = ret_model,
    n_vertices = n_vertices,
    verbose = verbose
  )
  if (any(is.infinite(nn$dist))) {
    stop("Infinite distances found in nearest neighbors")
  }
  gc()
  V <- nn2set(method, nn,
    set_op_mix_ratio, local_connectivity, bandwidth,
    perplexity, kernel,
    n_threads, grain_size,
    verbose = verbose
  )
  if (any(is.na(V))) {
    stop("Non-finite entries in the input matrix")
  }
  gc()
  list(
    nn = nn,
    V = V
  )
}

# Fuzzy set intersection of two affinity matrices with the given weight,
# optionally resetting local connectivity afterwards (currently always
# requested by callers).
set_intersect <- function(A, B, weight = 0.5, reset = TRUE) {
  A <- general_simplicial_set_intersection(
    A, B, weight
  )
  # https://github.com/lmcinnes/umap/issues/58#issuecomment-437633658
  # For now always reset
  if (reset) {
    A <- reset_local_connectivity(Matrix::drop0(A))
  }
  A
}

# Intersect the affinity matrix V with each categorical column of the data
# frame X in turn, logging only for the first column.
categorical_intersection_df <- function(X, V, weight = 0.5, verbose = FALSE) {
  tsmessage(
    "Carrying out categorical intersection for ",
    pluralize("column", ncol(X))
  )
  for (i
in 1:ncol(X)) {
    V <- categorical_intersection(X[, i], V,
      weight = weight,
      verbose = (verbose && i == 1)
    )
  }
  V
}

# Intersect the affinity matrix V with one categorical vector x.
# weight < 1 maps to a finite "far" distance between unmatched labels;
# weight >= 1 makes unmatched labels effectively infinitely far apart.
categorical_intersection <- function(x, V, weight, verbose = FALSE) {
  if (is.null(V)) {
    stop("V cannot be null for categorical intersection")
  }
  if (weight < 1.0) {
    far_dist <- 2.5 * (1.0 / (1.0 - weight))
  } else {
    far_dist <- 1.0e12
  }
  tsmessage(
    "Applying categorical set intersection, weight = ", formatC(weight),
    " far distance = ", formatC(far_dist)
  )
  V <- categorical_simplicial_set_intersection(V, x,
    far_dist = far_dist,
    verbose = verbose
  )
  V
}

# Creates the number of epochs per sample for each weight
# weights are the non-zero input affinities (1-simplex)
# n_epoch the total number of epochs
# There is an inverse relationship between the weights and the return vector.
make_epochs_per_sample <- function(weights, n_epochs) {
  result <- rep(-1, length(weights))
  n_samples <- n_epochs * (weights / max(weights))
  result[n_samples > 0] <- n_epochs / n_samples[n_samples > 0]
  result
}

# Create the a/b parameters from spread and min_dist by a non-linear least
# squares fit of the UMAP weight curve 1 / (1 + a * x^(2b)) to an
# exponential decay starting at min_dist.
find_ab_params <- function(spread = 1, min_dist = 0.001) {
  xv <- seq(from = 0, to = spread * 3, length.out = 300)
  yv <- rep(0, length(xv))
  yv[xv < min_dist] <- 1
  yv[xv >= min_dist] <- exp(-(xv[xv >= min_dist] - min_dist) / spread)
  result <- try(
    {
      stats::nls(yv ~ 1 / (1 + a * xv^(2 * b)),
        start = list(a = 1, b = 1)
      )$m$getPars()
    },
    silent = TRUE
  )
  # BUG FIX: was `class(result) == "try-error"`. Comparing class() with ==
  # breaks when an object has a multi-element class (and a length > 1
  # condition is an error from R 4.2); inherits() is the robust test.
  if (inherits(result, "try-error")) {
    stop(
      "Can't find a, b for provided spread = ", spread,
      " min_dist = ", min_dist
    )
  }
  result
}

# The default number of edge samples used by LargeVis
lvish_samples <- function(n_vertices) {
  n_samples <- 0

  if (n_vertices < 10000) {
    n_samples <- 1000
  } else if (n_vertices < 1000000) {
    n_samples <- (n_vertices - 10000) * 9000 / (1000000 - 10000) + 1000
  } else {
    n_samples <- n_vertices / 100
  }

  round(n_samples * 1000000)
}

# Returns the number of epochs required to generate the default number of edge samples
# used in LargeVis
lvish_epochs <-
function(n_vertices, V) {
  n_samples <- lvish_samples(n_vertices)
  round(n_samples * max(V) / sum(V))
}

# Scale X according to various strategies:
#   "none"            - return X unchanged
#   "scale"/"z"/TRUE  - drop zero-variance columns, then center and scale to
#                       unit variance
#   "range"           - scale the whole matrix into [0, 1]
#   "colrange"        - scale each column into [0, 1]
#   "maxabs"          - center, then divide by the maximum absolute value
# When ret_model = TRUE the scaling constants are stored as attributes on the
# returned matrix so the same transform can be applied to new data.
scale_input <- function(X, scale_type, ret_model = FALSE, verbose = FALSE) {
  if (is.null(scale_type)) {
    scale_type <- "none"
  } else if (is.logical(scale_type)) {
    scale_type <- ifelse(scale_type, "scale", "none")
  } else if (tolower(scale_type) == "z") {
    scale_type <- "scale"
  }

  scale_type <- match.arg(
    tolower(scale_type),
    c("none", "scale", "range", "colrange", "maxabs")
  )
  switch(scale_type,
    range = {
      tsmessage("Range scaling X")
      min_X <- min(X)
      X <- X - min_X

      max_X <- max(X)
      X <- X / max_X

      if (ret_model) {
        attr(X, "scaled:range:min") <- min_X
        attr(X, "scaled:range:max") <- max_X
      }
    },
    colrange = {
      tsmessage("Column range scaling X")
      min_X <- apply(X, 2, min)
      X <- sweep(X, 2, min_X)

      max_X <- apply(X, 2, max)
      X <- sweep(X, 2, max_X, `/`)

      if (ret_model) {
        attr(X, "scaled:colrange:min") <- min_X
        attr(X, "scaled:colrange:max") <- max_X
      }
    },
    maxabs = {
      tsmessage("Normalizing by max-abs")
      X <- base::scale(X, scale = FALSE)
      max_abs <- max(abs(X))
      X <- X / max_abs

      if (ret_model) {
        attr(X, "scaled:maxabs") <- max_abs
      }
    },
    scale = {
      tsmessage("Scaling to zero mean and unit variance")

      varf <- function(x) {
        sum((x - sum(x) / length(x))^2)
      }
      non_zero_var_cols <- apply(X, 2, varf) >= .Machine$double.xmin
      # BUG FIX: was `length(non_zero_var_cols) == 0`, but this is a logical
      # vector of length ncol(X), so its length is zero only for an empty
      # matrix and the zero-variance guard could never fire: an all-constant
      # matrix silently lost every column. Test whether any column survives.
      if (!any(non_zero_var_cols)) {
        stop("Matrix has zero variance")
      }
      X <- X[, non_zero_var_cols]
      tsmessage("Kept ", ncol(X), " non-zero-variance columns")

      X <- base::scale(X, scale = TRUE)
      if (ret_model) {
        attr(X, "scaled:nzvcols") <- which(non_zero_var_cols)
      }
    }
  )
  X
}

# Extract the "scaled:" attributes written by scale_input (if any) so they
# can be stored in a returned model; NULL when X carries no scaling info.
attr_to_scale_info <- function(X) {
  Xattr <- attributes(X)
  Xattr <- Xattr[startsWith(names(Xattr), "scaled:")]
  if (length(Xattr) == 0) {
    Xattr <- NULL
  }
  Xattr
}

#' @useDynLib uwot, .registration=TRUE
#' @importFrom Rcpp sourceCpp
#' @importFrom RcppParallel RcppParallelLibs
.onUnload <- function(libpath) {
  library.dynam.unload("uwot", libpath)
}
uwot/R/neighbors.R0000644000176200001440000002032113571660256013565 0ustar liggesusersfind_nn <- function(X, k, include_self = TRUE, method = "fnn", metric = "euclidean", n_trees = 50, search_k = 2 * k * n_trees, tmpdir = tempdir(), n_threads = max(1, RcppParallel::defaultNumThreads() / 2), grain_size = 1, ret_index = FALSE, verbose = FALSE) { if (methods::is(X, "dist")) { res <- dist_nn(X, k, include_self = include_self) } else if (methods::is(X, "sparseMatrix")) { # sparse distance matrix res <- sparse_nn(X, k, include_self = include_self) } else { # normal matrix if (method == "fnn") { res <- FNN_nn(X, k = k, include_self = include_self) } else { res <- annoy_nn(X, k = k, metric = metric, n_trees = n_trees, search_k = search_k, tmpdir = tmpdir, n_threads = n_threads, ret_index = ret_index, verbose = verbose ) } } res } # n_trees - number of trees to build when constructing the index. The more trees # specified, the larger the index, but the better the results. largeVis uses 10 # trees for datasets with N = 10,000 observations, 20 trees for datasets up to N # = 1,000,000, 50 trees for N up to 5,000,000 and 100 trees otherwise # search_k - the number of nodes to search during the neighbor retrieval. The # larger k, the more accurate results, but the longer the search takes. Default # is k * n_trees. 
#' @importFrom methods new annoy_nn <- function(X, k = 10, metric = "euclidean", n_trees = 50, search_k = 2 * k * n_trees, tmpdir = tempdir(), n_threads = max(1, RcppParallel::defaultNumThreads() / 2), grain_size = 1, ret_index = FALSE, verbose = FALSE) { ann <- annoy_build(X, metric = metric, n_trees = n_trees, verbose = verbose ) res <- annoy_search(X, k = k, ann = ann, search_k = search_k, tmpdir = tmpdir, n_threads = n_threads, grain_size = grain_size, verbose = verbose ) nn_acc <- sum(res$idx == 1:nrow(X)) / nrow(X) tsmessage("Annoy recall = ", formatC(nn_acc * 100.0), "%") res <- list(idx = res$idx, dist = res$dist, recall = nn_acc) if (ret_index) { res$index <- ann } res } annoy_build <- function(X, metric = "euclidean", n_trees = 50, verbose = FALSE) { nr <- nrow(X) nc <- ncol(X) ann <- create_ann(metric, nc) tsmessage( "Building Annoy index with metric = ", metric, ", n_trees = ", n_trees ) progress <- Progress$new(max = nr, display = verbose) # Add items for (i in 1:nr) { ann$addItem(i - 1, X[i, ]) progress$increment() } # Build index ann$build(n_trees) ann } # create RcppAnnoy class from metric name with ndim dimensions create_ann <- function(name, ndim) { ann <- switch(name, cosine = methods::new(RcppAnnoy::AnnoyAngular, ndim), manhattan = methods::new(RcppAnnoy::AnnoyManhattan, ndim), euclidean = methods::new(RcppAnnoy::AnnoyEuclidean, ndim), hamming = methods::new(RcppAnnoy::AnnoyHamming, ndim), stop("BUG: unknown Annoy metric '", name, "'") ) ann } # Search a pre-built Annoy index for neighbors of X annoy_search <- function(X, k, ann, search_k = 100 * k, tmpdir = tempdir(), n_threads = max(1, RcppParallel::defaultNumThreads() / 2), grain_size = 1, verbose = FALSE) { if (n_threads > 0) { annoy_res <- annoy_search_parallel( X = X, k = k, ann = ann, search_k = search_k, tmpdir = tmpdir, n_threads = n_threads, grain_size = grain_size, verbose = verbose ) res <- list(idx = annoy_res$item + 1, dist = annoy_res$distance) } else { res <- annoy_search_serial( 
X = X, k = k, ann = ann, search_k = search_k, verbose = verbose ) } # Convert from Angular to Cosine distance if (methods::is(ann, "Rcpp_AnnoyAngular")) { res$dist <- 0.5 * (res$dist * res$dist) } res } annoy_search_serial <- function(X, k, ann, search_k = 100 * k, verbose = FALSE) { tsmessage("Searching Annoy index, search_k = ", search_k) nr <- nrow(X) search_progress <- Progress$new(max = nr, display = verbose) idx <- matrix(nrow = nr, ncol = k) dist <- matrix(nrow = nr, ncol = k) for (i in 1:nr) { res <- ann$getNNsByVectorList(X[i, ], k, search_k, TRUE) if (length(res$item) != k) { stop( "search_k/n_trees settings were unable to find ", k, " neighbors for item ", i ) } idx[i, ] <- res$item dist[i, ] <- res$distance search_progress$increment() } list(idx = idx + 1, dist = dist) } annoy_search_parallel <- function(X, k, ann, search_k = 100 * k, tmpdir = tempdir(), n_threads = max(1, RcppParallel::defaultNumThreads() / 2), grain_size = 1, verbose = FALSE) { index_file <- tempfile(tmpdir = tmpdir) tsmessage("Writing NN index file to temp file ", index_file) ann$save(index_file) fsize <- file.size(index_file) tsmessage( "Searching Annoy index using ", pluralize("thread", n_threads), ", search_k = ", search_k ) ann_class <- class(ann) search_nn_func <- switch(ann_class, Rcpp_AnnoyAngular = annoy_cosine_nns, Rcpp_AnnoyManhattan = annoy_manhattan_nns, Rcpp_AnnoyEuclidean = annoy_euclidean_nns, Rcpp_AnnoyHamming = annoy_hamming_nns, stop("BUG: unknown Annoy class '", ann_class, "'") ) res <- search_nn_func(index_file, X, k, search_k, grain_size = grain_size, verbose = verbose ) unlink(index_file) if (any(res$item == -1)) { msg <- paste0( "search_k/n_trees settings were unable to find ", k, " neighbors for all items." ) if (fsize > 2147483647) { msg <- paste0( msg, " Index file may have been too large to process.", " Try repeating with n_threads = 0, reducing n_trees,", " or reducing to a smaller dimensionality, e.g. 
pca = 50" ) } stop(msg) } res } FNN_nn <- function(X, k = 10, include_self = TRUE) { if (include_self) { k <- k - 1 } fnn <- FNN::get.knn(X, k) idx <- fnn$nn.index dist <- fnn$nn.dist if (include_self) { idx <- cbind(seq_len(nrow(X)), idx) dist <- cbind(rep(0, nrow(X)), dist) } list(idx = idx, dist = dist) } dist_nn <- function(X, k, include_self = TRUE) { X <- as.matrix(X) if (!include_self) { k <- k + 1 } nn_idx <- t(apply(X, 2, order))[, 1:k] nn_dist <- matrix(0, nrow = nrow(X), ncol = k) for (i in seq_len(nrow(nn_idx))) { nn_dist[i, ] <- X[i, nn_idx[i, ]] } if (!include_self) { nn_idx <- nn_idx[, 2:ncol(nn_idx)] nn_dist <- nn_dist[, 2:ncol(nn_dist)] } attr(nn_idx, "dimnames") <- NULL attr(nn_dist, "dimnames") <- NULL list(idx = nn_idx, dist = nn_dist) } sparse_nn <- function(X, k, include_self = TRUE) { if (include_self) { k <- k - 1 } n <- nrow(X) nn_idx <- matrix(0, nrow = n, ncol = k) nn_dist <- matrix(0, nrow = n, ncol = k) for (i in 1:n) { dists <- X[, i] is_nonzero <- dists != 0 dist_nonzero <- dists[is_nonzero] if (length(dist_nonzero) < k) { stop( "Row ", i, " of distance matrix has only ", length(dist_nonzero), " defined distances" ) } k_order <- order(dist_nonzero)[1:k] idx_nonzero <- which(is_nonzero, arr.ind = TRUE) nn_idx[i, ] <- idx_nonzero[k_order] nn_dist[i, ] <- dist_nonzero[k_order] } if (include_self) { nn_idx <- cbind(1:n, nn_idx) nn_dist <- cbind(rep(0, n), nn_dist) } list(idx = nn_idx, dist = nn_dist) } uwot/R/util.R0000644000176200001440000000503113571660267012565 0ustar liggesusersstime <- function() { format(Sys.time(), "%T") } # message with a time stamp # appears only if called from an environment where a logical verbose = TRUE # OR force = TRUE tsmessage <- function(..., domain = NULL, appendLF = TRUE, force = FALSE, time_stamp = TRUE) { verbose <- get0("verbose", envir = sys.parent()) if (force || (!is.null(verbose) && verbose)) { msg <- "" if (time_stamp) { msg <- paste0(stime(), " ") } message(msg, ..., domain = domain, appendLF = 
appendLF) utils::flush.console() } } # log vector information summarize <- function(X, msg = "") { summary_X <- summary(X, digits = max(3, getOption("digits") - 3)) tsmessage(msg, ": ", paste(names(summary_X), ":", summary_X, "|", collapse = "" ), force = get0("verbose", envir = sys.parent()) ) } # pluralize("thread", 1) => "1 thread" # pluralize("thread", 2) => "2 threads" pluralize <- function(str, n, prefix = NULL, inc_num = TRUE) { if (n == 0) { return("") } ret <- paste0(str, ifelse(n != 1, "s", "")) if (inc_num) { ret <- paste0(n, " ", ret) } if (!is.null(prefix)) { ret <- paste0(prefix, " ", ret) } ret } # convert data frame to matrix using numeric columns x2m <- function(X) { if (!methods::is(X, "matrix")) { m <- as.matrix(X[, which(vapply(X, is.numeric, logical(1)))]) } else { m <- X } m } # given a metric argument, returns a list containing: # metrics - the input list with any members called "categorical" removed # categoricals - a vector of the categorical ids find_categoricals <- function(metrics) { res <- list( metrics = metrics ) if (is.list(metrics)) { cat_pos <- grep("categorical", names(metrics)) if (length(cat_pos) > 0) { cat_ids <- unlist(metrics[cat_pos]) names(cat_ids) <- NULL res <- list( metrics = metrics[-cat_pos], categoricals = cat_ids ) } } res } # Splits a list into its named and unnamed components: # > lsplit_unnamed(list(1:10, pca_center = FALSE)) # $named # $named$pca_center # [1] FALSE # # # $unnamed # $unnamed[[1]] # [1] 1 2 3 4 5 6 7 8 9 10 lsplit_unnamed <- function(l) { lnames <- names(l) if (is.null(lnames)) { return(list(unnamed = l)) } is_named <- lnames != "" nids <- which(is_named) uids <- which(!is_named) if (length(uids) == 0) { return(list(named = l[nids])) } list( named = l[nids], unnamed = l[uids] ) } uwot/R/supervised.R0000644000176200001440000001215713555104657014006 0ustar liggesusers# Combine a fuzzy simplicial set with another fuzzy simplicial set # generated from categorical data using categorical distances. 
The target # data is assumed to be categorical label data (a vector of labels), # and this will update the fuzzy simplicial set to respect that label data. # TODO: optional category cardinality based weighting of distance # simplicial_set The input fuzzy simplicial set. # target The categorical labels to use in the intersection. # unknown_dist The distance an unknown label (-1) is assumed to be from any point. # far_dist The distance between unmatched labels. # Return The resulting intersected fuzzy simplicial set. categorical_simplicial_set_intersection <- function( simplicial_set, target, unknown_dist = 1.0, far_dist = 5.0, verbose = FALSE) { # Convert to dgTMatrix to get to the j indices simplicial_set <- methods::as(simplicial_set, "dgTMatrix") simplicial_set@x <- fast_intersection_cpp( simplicial_set@i, simplicial_set@j, simplicial_set@x, target, unknown_dist, far_dist ) # drop0 converts back to dgCMatrix reset_local_connectivity(Matrix::drop0(simplicial_set)) } # Reset the local connectivity requirement -- each data sample should # have complete confidence in at least one 1-simplex in the simplicial set. # We can enforce this by locally rescaling confidences, and then remerging the # different local simplicial sets together. reset_local_connectivity <- function(simplicial_set) { fuzzy_set_union(row_max_normalize(simplicial_set)) } # Under the assumption of categorical distance for the intersecting # simplicial set perform a fast intersection. 
# This is not at all fast in R, use fast_intersection_cpp instead fast_intersection <- function(rows, cols, values, target, unknown_dist = 1.0, far_dist = 5.0) { ex_unknown <- exp(-unknown_dist) ex_far <- exp(-far_dist) for (nz in seq_len(length(values))) { i <- rows[nz] j <- cols[nz] if (is.na(target[i]) || is.na(target[j])) { values[nz] <- values[nz] * ex_unknown } else if (target[i] != target[j]) { values[nz] <- values[nz] * ex_far } } values } general_simplicial_set_intersection <- function(left, right, weight) { result <- methods::as(left + right, "dgTMatrix") result@x <- general_sset_intersection_cpp( left@p, left@i, left@x, right@p, right@i, right@x, result@i, result@j, result@x, weight ) result } # An R translation of the Python function. Not very fast, # so use the C++ version instead general_sset_intersection <- function(indptr1, indices1, data1, indptr2, indices2, data2, result_row, result_col, result_val, mix_weight = 0.5) { left_min <- max(min(data1) / 2.0, 1.0e-8) right_min <- max(min(data2) / 2.0, 1.0e-8) for (idx in seq_len(length(result_row))) { i <- result_col[idx] + 1 j <- result_row[idx] left_val <- left_min for (k in (indptr1[i]):(indptr1[i + 1] - 1)) { if (indices1[k + 1] == j) { left_val <- data1[k + 1] } } right_val <- right_min for (k in (indptr2[i]):(indptr2[i + 1] - 1)) { if (indices2[k + 1] == j) { right_val <- data2[k + 1] } } if (left_val > left_min || right_val > right_min) { if (mix_weight < 0.5) { result_val[idx] <- left_val * right_val^(mix_weight / (1.0 - mix_weight)) } else { result_val[idx] <- right_val * left_val^(((1.0 - mix_weight) / mix_weight)) } } } result_val } # Sparse Matrix functions ------------------------------------------------- # normalize each column of a dgCMatrix by its maximum # https://stackoverflow.com/questions/39284774/column-rescaling-for-a-very-large-sparse-matrix-in-r col_max_normalize <- function(X) { X@x <- X@x / rep.int(colMaxs(X), diff(X@p)) X } # normalize each row of a dgCMatrix by its maximum 
row_max_normalize <- function(X) { Matrix::t(col_max_normalize(Matrix::t(X))) } col_sum_normalize <- function(X) { X@x <- X@x / rep.int(Matrix::colSums(X), diff(X@p)) X } row_sum_normalize <- function(X) { Matrix::t(col_sum_normalize(Matrix::t(X))) } # column maximums of a dgCMatrix colMaxs <- function(X) { nc <- ncol(X) result <- rep(0, nc) dX <- diff(X@p) for (i in 1:nc) { if (dX[i] > 0) { result[i] <- max(X@x[(X@p[i] + 1):X@p[i + 1]]) } } result } # row maximums of a dgCMatrix rowMaxs <- function(X) { colMaxs(Matrix::t(X)) } uwot/R/affinity.R0000644000176200001440000001255713571660243013426 0ustar liggesusers# set_op_mix_ratio = between 0 and 1 mixes in fuzzy set intersection # set to 0 for intersection only #' @import Matrix fuzzy_set_union <- function(X, set_op_mix_ratio = 1) { XX <- X * Matrix::t(X) if (set_op_mix_ratio == 0) { Matrix::drop0(XX) } else if (set_op_mix_ratio == 1) { Matrix::drop0(X + Matrix::t(X) - XX) } else { Matrix::drop0( set_op_mix_ratio * (X + Matrix::t(X) - XX) + (1 - set_op_mix_ratio) * XX ) } } # Abstracts over whether the smooth knn distances uses the multithreaded code # or not smooth_knn <- function(nn, local_connectivity = 1.0, bandwidth = 1.0, n_threads = max( 1, RcppParallel::defaultNumThreads() / 2 ), grain_size = 1, verbose = FALSE) { tsmessage( "Commencing smooth kNN distance calibration", pluralize("thread", n_threads, " using") ) parallelize <- n_threads > 0 affinity_matrix_res <- smooth_knn_distances_parallel( nn_dist = nn$dist, nn_idx = nn$idx, n_iter = 64, local_connectivity = local_connectivity, bandwidth = bandwidth, tol = 1e-5, min_k_dist_scale = 1e-3, parallelize = parallelize, grain_size = grain_size, verbose = verbose ) if (verbose && affinity_matrix_res$n_failures > 0) { tsmessage(affinity_matrix_res$n_failures, " smooth knn distance failures") } affinity_matrix_res$matrix } # Given nearest neighbor data and a measure of distance compute # the fuzzy simplicial set (here represented as a fuzzy graph in the form of a 
# sparse matrix) associated to the data. This is done by locally approximating # geodesic distance at each point, creating a fuzzy simplicial set for each such # point, and then combining all the local fuzzy simplicial sets into a global # one via a fuzzy union fuzzy_simplicial_set <- function(nn, set_op_mix_ratio = 1.0, local_connectivity = 1.0, bandwidth = 1.0, n_threads = max( 1, RcppParallel::defaultNumThreads() / 2 ), grain_size = 1, verbose = FALSE) { affinity_matrix <- smooth_knn(nn, local_connectivity = local_connectivity, bandwidth = bandwidth, n_threads = n_threads, grain_size = grain_size, verbose = verbose ) affinity_matrix <- nn_to_sparse(nn$idx, as.vector(affinity_matrix), self_nbr = TRUE, max_nbr_id = nrow(nn$idx) ) fuzzy_set_union(affinity_matrix, set_op_mix_ratio = set_op_mix_ratio) } symmetrize <- function(P) { 0.5 * (P + Matrix::t(P)) } perplexity_similarities <- function(nn, perplexity = NULL, n_threads = max( 1, RcppParallel::defaultNumThreads() / 2 ), grain_size = 1, kernel = "gauss", verbose = FALSE) { if (is.null(perplexity) && kernel != "knn") { stop("Must provide perplexity") } if (kernel == "gauss") { tsmessage( "Commencing calibration for perplexity = ", formatC(perplexity), pluralize("thread", n_threads, " using") ) parallelize <- n_threads > 0 affinity_matrix_res <- calc_row_probabilities_parallel( nn_dist = nn$dist, nn_idx = nn$idx, perplexity = perplexity, parallelize = parallelize, grain_size = grain_size, verbose = verbose ) affinity_matrix <- affinity_matrix_res$matrix if (verbose && affinity_matrix_res$n_failures > 0) { tsmessage(affinity_matrix_res$n_failures, " perplexity failures") } affinity_matrix <- nn_to_sparse(nn$idx, as.vector(affinity_matrix), self_nbr = TRUE, max_nbr_id = nrow(nn$idx) ) } else { # knn kernel tsmessage("Using knn graph for input weights with k = ", ncol(nn$idx)) # Make each row sum to 1, ignoring the self-index # i.e. 
diagonal will be zero affinity_matrix <- nn_to_sparse(nn$idx, val = 1 / (ncol(nn$idx) - 1)) Matrix::diag(affinity_matrix) <- 0 affinity_matrix <- Matrix::drop0(affinity_matrix) } symmetrize(affinity_matrix) } # Convert the matrix of NN indices to a sparse asymmetric matrix where each # edge has a weight of val (scalar or vector) # return a sparse matrix with dimensions of nrow(nn_idx) x max_nbr_id nn_to_sparse <- function(nn_idx, val = 1, self_nbr = FALSE, max_nbr_id = ifelse(self_nbr, nrow(nn_idx), max(nn_idx) )) { nd <- nrow(nn_idx) k <- ncol(nn_idx) if (length(val) == 1) { xs <- rep(val, nd * k) } else { xs <- val } is <- rep(1:nd, times = k) js <- as.vector(nn_idx) dims <- c(nrow(nn_idx), max_nbr_id) res <- sparseMatrix(i = is, j = js, x = xs, dims = dims) if (self_nbr) { Matrix::diag(res) <- 0 res <- Matrix::drop0(res) } res } uwot/R/RcppExports.R0000644000176200001440000001033113571663322014073 0ustar liggesusers# Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 connected_components_undirected <- function(N, indices1, indptr1, indices2, indptr2) { .Call(`_uwot_connected_components_undirected`, N, indices1, indptr1, indices2, indptr2) } annoy_euclidean_nns <- function(index_name, mat, n_neighbors, search_k, grain_size = 1L, verbose = FALSE) { .Call(`_uwot_annoy_euclidean_nns`, index_name, mat, n_neighbors, search_k, grain_size, verbose) } annoy_cosine_nns <- function(index_name, mat, n_neighbors, search_k, grain_size = 1L, verbose = FALSE) { .Call(`_uwot_annoy_cosine_nns`, index_name, mat, n_neighbors, search_k, grain_size, verbose) } annoy_manhattan_nns <- function(index_name, mat, n_neighbors, search_k, grain_size = 1L, verbose = FALSE) { .Call(`_uwot_annoy_manhattan_nns`, index_name, mat, n_neighbors, search_k, grain_size, verbose) } annoy_hamming_nns <- function(index_name, mat, n_neighbors, search_k, grain_size = 1L, verbose = FALSE) { .Call(`_uwot_annoy_hamming_nns`, index_name, 
mat, n_neighbors, search_k, grain_size, verbose) } optimize_layout_umap <- function(head_embedding, tail_embedding, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, a, b, gamma, initial_alpha, negative_sample_rate, approx_pow, pcg_rand = TRUE, parallelize = TRUE, grain_size = 1L, move_other = TRUE, verbose = FALSE) { .Call(`_uwot_optimize_layout_umap`, head_embedding, tail_embedding, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, a, b, gamma, initial_alpha, negative_sample_rate, approx_pow, pcg_rand, parallelize, grain_size, move_other, verbose) } optimize_layout_tumap <- function(head_embedding, tail_embedding, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, initial_alpha, negative_sample_rate, pcg_rand = TRUE, parallelize = TRUE, grain_size = 1L, move_other = TRUE, verbose = FALSE) { .Call(`_uwot_optimize_layout_tumap`, head_embedding, tail_embedding, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, initial_alpha, negative_sample_rate, pcg_rand, parallelize, grain_size, move_other, verbose) } optimize_layout_largevis <- function(head_embedding, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, gamma, initial_alpha, negative_sample_rate, pcg_rand = TRUE, parallelize = TRUE, grain_size = 1L, verbose = FALSE) { .Call(`_uwot_optimize_layout_largevis`, head_embedding, positive_head, positive_tail, n_epochs, n_vertices, epochs_per_sample, gamma, initial_alpha, negative_sample_rate, pcg_rand, parallelize, grain_size, verbose) } calc_row_probabilities_parallel <- function(nn_dist, nn_idx, perplexity, n_iter = 200L, tol = 1e-5, parallelize = TRUE, grain_size = 1L, verbose = FALSE) { .Call(`_uwot_calc_row_probabilities_parallel`, nn_dist, nn_idx, perplexity, n_iter, tol, parallelize, grain_size, verbose) } smooth_knn_distances_parallel <- function(nn_dist, nn_idx, n_iter = 64L, local_connectivity = 1.0, bandwidth = 1.0, tol = 1e-5, min_k_dist_scale = 1e-3, 
parallelize = TRUE, grain_size = 1L, verbose = FALSE) { .Call(`_uwot_smooth_knn_distances_parallel`, nn_dist, nn_idx, n_iter, local_connectivity, bandwidth, tol, min_k_dist_scale, parallelize, grain_size, verbose) } fast_intersection_cpp <- function(rows, cols, values, target, unknown_dist = 1.0, far_dist = 5.0) { .Call(`_uwot_fast_intersection_cpp`, rows, cols, values, target, unknown_dist, far_dist) } general_sset_intersection_cpp <- function(indptr1, indices1, data1, indptr2, indices2, data2, result_row, result_col, result_val, mix_weight = 0.5) { .Call(`_uwot_general_sset_intersection_cpp`, indptr1, indices1, data1, indptr2, indices2, data2, result_row, result_col, result_val, mix_weight) } init_transform_av_parallel <- function(train_embedding, nn_index, parallelize = TRUE, grain_size = 1L) { .Call(`_uwot_init_transform_av_parallel`, train_embedding, nn_index, parallelize, grain_size) } init_transform_parallel <- function(train_embedding, nn_index, nn_weights, grain_size = 1L, parallelize = TRUE) { .Call(`_uwot_init_transform_parallel`, train_embedding, nn_index, nn_weights, grain_size, parallelize) } uwot/R/progress.R0000644000176200001440000000416113400360122013431 0ustar liggesusers# An RC class to produce a progress bar like RcppProgress does for C++ code. # create: # progress <- Progress$new(max, display) # max is the final value (e.g. last index in a loop) # display is a logical for whether to actually display the bar # use: # progress$increment() # increments the counter towards max. A new star will be displayed on the bar if sufficient # progress was made. Nothing happens if the bar was created with display = FALSE. # # Typical usage: # do_stuff_ntimes <- function(n, verbose = FALSE, ...) # progress <- Progress$new(max = n, display = verbose) # for (i in 1:n) { # do_more_stuff(i, ...) 
# progress$increment() # } # } # Progress <- setRefClass("Progress", fields = list( value = "numeric", max = "numeric", curr_stars = "numeric", max_stars = "numeric", display = "logical" ), methods = list( initialize = function(max, display = TRUE) { max_stars <<- 51 # length of the progress bar value <<- 0 curr_stars <<- 0 max <<- max display <<- display if (display) { message("0% 10 20 30 40 50 60 70 80 90 100%") message("[----|----|----|----|----|----|----|----|----|----|") } }, increment = function() { if (display && curr_stars < max_stars) { value <<- value + 1 num_stars <- round(max_stars * value / max) if (num_stars > curr_stars) { # Number of new stars to print num_new_stars <- num_stars - curr_stars # If we are going to reach the end of the progress bar # save space for the terminal "|" if (curr_stars + num_new_stars >= max_stars) { num_new_stars <- num_new_stars - 1 } new_stars <- paste(rep("*", num_new_stars), collapse = "") message(new_stars, appendLF = FALSE) flush.console() curr_stars <<- num_stars } if (curr_stars >= max_stars) { # The terminal "|" character that appears instead of a * message("|") } } } ) ) uwot/R/init.R0000644000176200001440000003333513571660252012555 0ustar liggesusers# Laplacian Eigenmap (Belkin & Niyogi, 2002) # Original formulation solves the generalized eigenvalue problem of the # unnormalized graph Laplacian: L v = lambda D v, where L = D - A # and uses the bottom eigenvectors v that result # (ignoring the constant eigenvector associated with the smallest eigenvalue). # # This is equivalent to using the top eigenvectors from the usual # eigendecomposition of a row-normalized Laplacian P = D^-1 A: P v = lambda' v # so we don't need to depend on an external package for generalized eigenvalues. # Note that while the eigenvectors are the same, the eigenvalues are # different: lambda' = 1 - lambda, but we don't use them with Laplacian # Eigenmaps anyway. # # As we only need to calculate the top ndim + 1 eigenvectors (i.e. 
normally 3) # it's incredibly wasteful to calculate all of them. # A must be symmetric and positive semi definite, but not necessarily # normalized in any specific way. #' @import Matrix laplacian_eigenmap <- function(A, ndim = 2, verbose = FALSE) { if (nrow(A) < 3) { tsmessage("Graph too small, using random initialization instead") return(rand_init(nrow(A), ndim)) } tsmessage("Initializing from Laplacian Eigenmap") # Equivalent to: D <- diag(colSums(A)); M <- solve(D) %*% A # This effectively row-normalizes A: colSums is normally faster than rowSums # and because A is symmetric, they're equivalent M <- A / colSums(A) connected <- connected_components(M) if (connected$n_components > 1) { tsmessage( "Found ", connected$n_components, " connected components, ", "initializing each component separately" ) fn_name <- as.character(match.call()[[1]]) return(subgraph_init(fn_name, connected, A = A, ndim = ndim, verbose = verbose )) } res <- NULL k <- ndim + 1 n <- nrow(M) suppressWarnings( res <- tryCatch(RSpectra::eigs(M, k = k, which = "LM", opt = list(tol = 1e-4) ), error = function(c) { NULL } ) ) if (is.null(res) || ncol(res$vectors) < ndim) { message( "Laplacian Eigenmap failed to converge, ", "using random initialization instead" ) return(rand_init(n, ndim)) } vecs <- as.matrix(res$vectors[, 2:(ndim + 1)]) Re(vecs) } # Use a normalized Laplacian. 
normalized_laplacian_init <- function(A, ndim = 2, verbose = FALSE) { if (nrow(A) < 3) { tsmessage("Graph too small, using random initialization instead") return(rand_init(nrow(A), ndim)) } tsmessage("Initializing from normalized Laplacian") connected <- connected_components(A) if (connected$n_components > 1) { tsmessage( "Found ", connected$n_components, " connected components, ", "initializing each component separately" ) fn_name <- as.character(match.call()[[1]]) return(subgraph_init(fn_name, connected, A = A, ndim = ndim, verbose = verbose )) } n <- nrow(A) # Normalized Laplacian: clear and close to UMAP code, but very slow in R # I <- diag(1, nrow = n, ncol = n) # D <- diag(1 / sqrt(colSums(A))) # L <- I - D %*% A %*% D # A lot faster (order of magnitude when n = 1000) Dsq <- sqrt(Matrix::colSums(A)) L <- -Matrix::t(A / Dsq) / Dsq Matrix::diag(L) <- 1 + Matrix::diag(L) k <- ndim + 1 opt <- list(tol = 1e-4) suppressWarnings( res <- tryCatch(RSpectra::eigs_sym(L, k = k, which = "SM", opt = opt), error = function(c) { NULL } ) ) if (is.null(res) || ncol(res$vectors) < ndim) { suppressWarnings( res <- tryCatch(RSpectra::eigs_sym(L, k = k, which = "LM", sigma = 0, opt = opt ), error = function(c) { NULL } ) ) if (is.null(res) || ncol(res$vectors) < ndim) { message( "Spectral initialization failed to converge, ", "using random initialization instead" ) return(rand_init(n, ndim)) } } vec_indices <- rev(order(res$values, decreasing = TRUE)[1:ndim]) as.matrix(Re(res$vectors[, vec_indices])) } # Use irlba's partial_eigen instead of RSpectra irlba_normalized_laplacian_init <- function(A, ndim = 2, verbose = FALSE) { if (nrow(A) < 3) { tsmessage("Graph too small, using random initialization instead") return(rand_init(nrow(A), ndim)) } tsmessage("Initializing from normalized Laplacian (using irlba)") n <- nrow(A) Dsq <- sqrt(Matrix::colSums(A)) L <- -Matrix::t(A / Dsq) / Dsq Matrix::diag(L) <- 1 + Matrix::diag(L) k <- ndim + 1 suppressWarnings( res <- tryCatch(res <- 
irlba::partial_eigen(L, n = k, symmetric = TRUE, smallest = TRUE, tol = 1e-3, maxit = 1000, verbose = TRUE ), error = function(c) { NULL } ) ) if (is.null(res) || ncol(res$vectors) < ndim) { message( "Spectral initialization failed to converge, ", "using random initialization instead" ) return(rand_init(n, ndim)) } vec_indices <- rev(order(res$values, decreasing = TRUE)[1:ndim]) as.matrix(Re(res$vectors[, vec_indices])) } # Default UMAP initialization # spectral decomposition of the normalized Laplacian + some noise spectral_init <- function(A, ndim = 2, verbose = FALSE) { if (nrow(A) < 3) { tsmessage("Graph too small, using random initialization instead") return(rand_init(nrow(A), ndim)) } tsmessage("Initializing from normalized Laplacian + noise") connected <- connected_components(A) if (connected$n_components > 1) { tsmessage( "Found ", connected$n_components, " connected components, ", "initializing each component separately" ) fn_name <- as.character(match.call()[[1]]) return(subgraph_init(fn_name, connected, A = A, ndim = ndim, verbose = verbose )) } coords <- normalized_laplacian_init(A, ndim, verbose = FALSE) expansion <- 10.0 / max(abs(coords)) (coords * expansion) + matrix(stats::rnorm(n = prod(dim(coords)), sd = 0.0001), ncol = ndim ) } irlba_spectral_init <- function(A, ndim = 2, verbose = FALSE) { if (nrow(A) < 3) { tsmessage("Graph too small, using random initialization instead") return(rand_init(nrow(A), ndim)) } tsmessage("Initializing from normalized Laplacian (using irlba) + noise") coords <- irlba_normalized_laplacian_init(A, ndim, verbose = FALSE) expansion <- 10.0 / max(coords) (coords * expansion) + matrix(stats::rnorm(n = prod(dim(coords)), sd = 0.001), ncol = ndim ) } # Recursively calls the spectral initialization function named fn_name # for each subgraph specified by connected subgraph_init <- function(fn_name, connected, A, ndim = 2, verbose = FALSE) { init <- NULL for (i in 1:connected$n_components) { subg_idx <- connected$labels == i - 
1 subg <- A[subg_idx, subg_idx] tsmessage("Initializing subcomponent of size ", nrow(subg)) init_conn <- do.call(fn_name, list( A = subg, ndim = ndim, verbose = verbose )) if (is.null(init)) { init <- init_conn } else { init <- rbind(init, init_conn) } } init } # Return the number of connected components in a graph (respresented as a # sparse matrix). connected_components <- function(X) { Xt <- Matrix::t(X) connected_components_undirected(nrow(X), Xt@i, Xt@p, X@i, X@p) } # UMAP random initialization: uniform between +10 and -10 along each axis rand_init <- function(n, ndim, verbose = FALSE) { tsmessage("Initializing from uniform random") matrix(stats::runif(n = n * ndim, min = -10, max = 10), ncol = ndim) } # LargeVis random initialization: Gaussian with sd 1e-4 (like t-SNE) rand_init_lv <- function(n, ndim, verbose = FALSE) { tsmessage("Initializing from random Gaussian with sd = 1e-4") matrix(stats::rnorm(ndim * n, sd = 1e-4), n) } # Rescale embedding so that the standard deviation is the specified value. # Default gives initialization like t-SNE, but not random. Large initial # distances lead to small gradients, and hence small updates, so should be # avoided shrink_coords <- function(X, sdev = 1e-4) { scale(X, scale = apply(X, 2, stats::sd) / sdev) } # PCA pca_init <- function(X, ndim = 2, center = TRUE, verbose = FALSE) { tsmessage("Initializing from PCA") pca_scores(X, ncol = ndim, center = center, verbose = verbose) } # Calculates a matrix containing the first ncol columns of the PCA scores. 
# Returns the score matrix unless ret_extra is TRUE, in which case a list # is returned also containing the eigenvalues pca_scores <- function(X, ncol = min(dim(X)), center = TRUE, ret_extra = FALSE, verbose = FALSE) { if (methods::is(X, "dist")) { res_mds <- stats::cmdscale(X, x.ret = TRUE, eig = TRUE, k = ncol) if (ret_extra || verbose) { lambda <- res_mds$eig varex <- sum(lambda[1:ncol]) / sum(lambda) tsmessage( "PCA (using classical MDS): ", ncol, " components explained ", formatC(varex * 100), "% variance" ) } scores <- res_mds$points return(scores) } # irlba warns about using too large a percentage of total singular value # so don't use if dataset is small compared to ncol if (ncol < 0.5 * min(dim(X))) { return(irlba_scores(X, ncol = ncol, center = center, ret_extra = ret_extra, verbose = verbose )) } svd_scores(X = X, ncol = ncol, center = center, ret_extra = ret_extra, verbose = verbose) } # Get scores by SVD svd_scores <- function(X, ncol = min(dim(X)), center = TRUE, ret_extra = FALSE, verbose = FALSE) { # need extra data if we want to re-apply PCA to new points in umap_transform rotation <- NULL xcenter <- NULL X <- scale(X, center = center, scale = FALSE) # do SVD on X directly rather than forming covariance matrix s <- svd(X, nu = ncol, nv = ifelse(ret_extra, ncol, 0)) D <- diag(c(s$d[1:ncol]), ncol, ncol) if (verbose || ret_extra) { # calculate eigenvalues of covariance matrix from singular values lambda <- (s$d^2) / (nrow(X) - 1) varex <- sum(lambda[1:ncol]) / sum(lambda) tsmessage( "PCA: ", ncol, " components explained ", formatC(varex * 100), "% variance" ) } scores <- s$u %*% D if (ret_extra) { rotation <- s$v xcenter <- attr(X, "scaled:center") } if (ret_extra) { list( scores = scores, lambda = lambda[1:ncol], rotation = rotation, center = xcenter ) } else { scores } } # Get PCA scores via irlba irlba_scores <- function(X, ncol, center = TRUE, ret_extra = FALSE, verbose = FALSE) { res <- irlba::prcomp_irlba(X, n = ncol, retx = TRUE, center = 
center, scale = FALSE ) if (verbose) { varex <- sum(res$sdev[1:ncol]^2) / res$totalvar tsmessage( "PCA: ", ncol, " components explained ", formatC(varex * 100), "% variance" ) } if (ret_extra) { list(scores = res$x, rotation = res$rotation, center = res$center) } else { res$x } } init_is_spectral <- function(init) { res <- pmatch(tolower(init), c( "normlaplacian", "spectral", "laplacian", "inormlaplacian", "ispectral" )) length(res) > 0 && !is.na(res) } rand_nbr_graph <- function(n_vertices, n_nbrs, val) { nn_to_sparse(rand_nbr_idx(n_vertices, n_nbrs), val = val, max_nbr_id = n_vertices ) } rand_nbr_idx <- function(n_vertices, n_nbrs) { idx <- matrix(nrow = n_vertices, ncol = n_nbrs) nv1 <- n_vertices - 1 for (i in 1:n_vertices) { ids <- sample.int(nv1, n_nbrs) id_sel <- ids >= 1 ids[id_sel] <- ids[id_sel] + 1 idx[i, ] <- ids } idx } # V: the current affinity graph # n_pos: number of neighbors to retain per item # n_neg: number of "negative" (i.e. non-)neighbors per item # pos_affinity: value for the positive affinity (associated with nbrs) # neg_affinity: value for the negative affinity (associated with neg nbrs) approx_affinity_graph <- function(V, n_neg, pos_affinity = 1, neg_affinity = 0.1, verbose = FALSE) { pos_V <- V pos_V@x <- rep(pos_affinity, length(pos_V@x)) pos_V <- 0.5 * (pos_V + Matrix::t(pos_V)) neg_V <- rand_nbr_graph(nrow(pos_V), n_nbrs = n_neg, val = neg_affinity) neg_V <- 0.5 * (neg_V + Matrix::t(neg_V)) # the cleanup below will ensure that where the same value got a pos and neg # affinity it will end up positive graph <- pos_V + neg_V # clamp small values to neg_affinity graph@x[graph@x < pos_affinity] <- neg_affinity # and large values to pos_affinity graph@x <- pmin(graph@x, pos_affinity) Matrix::drop0(graph) } # Initialize using a spectral decomposition of an "approximate global" graph # Uses the same graph as standard UMAP, but with each entry set to 1. 
A measure # of global structure is added by randomly setting some of the remaining zero # to a smaller value (0.1 in this case). # This routine is inspired by some ideas in # 2-D Embedding of Large and High-dimensional Data with Minimal Memory and Computational Time Requirements # Witold Dzwinel, Rafal Wcislo, Stan Matwin # https://arxiv.org/abs/1902.01108 # # Randomized Near Neighbor Graphs, Giant Components, and Applications in Data Science # George C. Linderman, Gal Mishne, Yuval Kluger, Stefan Steinerberger # https://arxiv.org/abs/1711.04712 agspectral_init <- function(V, n_neg_nbrs, pos_affinity = 1, neg_affinity = 0.1, ndim = 2, verbose = FALSE) { graph <- approx_affinity_graph(V, n_neg_nbrs, pos_affinity = pos_affinity, neg_affinity = neg_affinity, verbose = verbose ) spectral_init(graph, ndim = ndim, verbose = verbose) } uwot/R/transform.R0000644000176200001440000002723713571660265013635 0ustar liggesusers#' Add New Points to an Existing Embedding #' #' Carry out an embedding of new data using an existing embedding. Requires #' using the result of calling \code{\link{umap}} or \code{\link{tumap}} with #' \code{ret_model = TRUE}. #' #' Note that some settings are incompatible with the production of a UMAP model #' via \code{\link{umap}}: external neighbor data (passed via a list to the #' argument of the \code{nn_method} parameter), and factor columns that were #' included in the UMAP calculation via the \code{metric} parameter. In the #' latter case, the model produced is based only on the numeric data. #' A transformation is possible, but factor columns in the new data are ignored. #' #' @param X The new data to be transformed, either a matrix of data frame. Must #' have the same columns in the same order as the input data used to generate #' the \code{model}. #' @param model Data associated with an existing embedding. 
#' @param init_weighted If \code{TRUE}, then initialize the embedded coordinates #' of \code{X} using a weighted average of the coordinates of the nearest #' neighbors from the original embedding in \code{model}, where the weights #' used are the edge weights from the UMAP smoothed knn distances. Otherwise, #' use an unweighted average. #' @param search_k Number of nodes to search during the neighbor retrieval. The #' larger k, the more the accurate results, but the longer the search takes. #' Default is the value used in building the \code{model} is used. #' @param tmpdir Temporary directory to store nearest neighbor indexes during #' nearest neighbor search. Default is \code{\link{tempdir}}. The index is #' only written to disk if \code{n_threads > 1}; otherwise, this parameter is #' ignored. #' @param n_epochs Number of epochs to use during the optimization of the #' embedded coordinates. A value between \code{30 - 100} is a reasonable trade #' off between speed and thoroughness. By default, this value is set to one #' third the number of epochs used to build the \code{model}. #' @param n_threads Number of threads to use, (except during stochastic gradient #' descent). Default is half that recommended by RcppParallel. #' @param n_sgd_threads Number of threads to use during stochastic gradient #' descent. If set to > 1, then results will not be reproducible, even if #' `set.seed` is called with a fixed seed before running. #' @param grain_size Minimum batch size for multithreading. If the number of #' items to process in a thread falls below this number, then no threads will #' be used. Used in conjunction with \code{n_threads} and #' \code{n_sgd_threads}. #' @param verbose If \code{TRUE}, log details to the console. #' @return A matrix of coordinates for \code{X} transformed into the space #' of the \code{model}. 
#' @examples
#'
#' iris_train <- iris[1:100, ]
#' iris_test <- iris[101:150, ]
#'
#' # You must set ret_model = TRUE to return extra data needed
#' iris_train_umap <- umap(iris_train, ret_model = TRUE)
#' iris_test_umap <- umap_transform(iris_test, iris_train_umap)
#' @export
umap_transform <- function(X, model, init_weighted = TRUE, search_k = NULL,
                           tmpdir = tempdir(), n_epochs = NULL,
                           n_threads = max(1, RcppParallel::defaultNumThreads() / 2),
                           n_sgd_threads = 0, grain_size = 1, verbose = FALSE) {
  # Fall back to the settings stored in the model when not supplied explicitly
  if (is.null(n_epochs)) {
    n_epochs <- model$n_epochs
  }
  if (is.null(search_k)) {
    search_k <- model$search_k
  }
  # Unpack the components of the trained model
  nn_index <- model$nn_index
  n_neighbors <- model$n_neighbors
  local_connectivity <- model$local_connectivity
  train_embedding <- model$embedding
  method <- model$method
  scale_info <- model$scale_info
  metric <- model$metric
  pca_models <- model$pca_models
  a <- model$a
  b <- model$b
  gamma <- model$gamma
  alpha <- model$alpha
  negative_sample_rate <- model$negative_sample_rate
  approx_pow <- model$approx_pow
  norig_col <- model$norig_col
  pcg_rand <- model$pcg_rand
  # Models saved without a pcg_rand field default to using PCG
  if (is.null(pcg_rand)) {
    tsmessage("Using PCG for random number generation")
    pcg_rand <- TRUE
  }

  # New data must have the same number of columns as the training data
  if (ncol(X) != norig_col) {
    stop("Incorrect dimensions: X must have ", norig_col, " columns")
  }
  # Only numeric columns of a data frame take part in the transform
  if (methods::is(X, "data.frame")) {
    indexes <- which(vapply(X, is.numeric, logical(1)))
    if (length(indexes) == 0) {
      stop("No numeric columns found")
    }
    X <- as.matrix(X[, indexes])
  }
  n_vertices <- nrow(X)
  tsmessage(
    "Read ", n_vertices, " rows and found ", ncol(X),
    " numeric columns"
  )

  # Re-apply any scaling that was carried out on the training data
  if (!is.null(scale_info)) {
    X <- apply_scaling(X, scale_info = scale_info, verbose = verbose)
  }

  if (n_threads > 0) {
    RcppParallel::setThreadOptions(numThreads = n_threads)
  }

  # NOTE(review): local connectivity is reduced by one when transforming new
  # points -- presumably to mirror Python UMAP's transform behavior; confirm
  adjusted_local_connectivity <- max(0, local_connectivity - 1.0)

  # One block per metric: find neighbors, smooth the knn distances, and
  # initialize coordinates. Multiple blocks are averaged (coordinates) and
  # intersected (graphs) as the loop proceeds.
  nblocks <- length(metric)
  graph <- NULL
  embedding <- NULL
  for (i in 1:nblocks) {
    tsmessage("Processing block ", i, " of ", nblocks)

    if (nblocks == 1) {
      ann <- nn_index
      Xsub <- X
    } else {
      ann <- nn_index[[i]]
      subset <- metric[[i]]
      if (is.list(subset)) {
        # A list metric entry mixes named parameter overrides with the
        # unnamed vector of column ids; extract the column ids
        subset <- lsplit_unnamed(subset)$unnamed[[1]]
      }
      Xsub <- X[, subset, drop = FALSE]
    }

    # Re-apply the PCA rotation calculated for this block during training
    if (!is.null(pca_models) && !is.null(pca_models[[as.character(i)]])) {
      Xsub <- apply_pca(
        X = Xsub, pca_res = pca_models[[as.character(i)]],
        verbose = verbose
      )
    }

    # Nearest training-set neighbors of the new points, via the stored
    # Annoy index
    nn <- annoy_search(Xsub,
      k = n_neighbors, ann = ann, search_k = search_k,
      tmpdir = tmpdir,
      n_threads = n_threads, grain_size = grain_size,
      verbose = verbose
    )

    graph_block <- smooth_knn(nn,
      local_connectivity = adjusted_local_connectivity,
      n_threads = n_threads, grain_size = grain_size, verbose = verbose
    )

    embedding_block <- init_new_embedding(train_embedding, nn, graph_block,
      weighted = init_weighted, n_threads = n_threads,
      grain_size = grain_size, verbose = verbose
    )
    # Sum per-block coordinates here; divided by nblocks after the loop
    if (is.null(embedding)) {
      embedding <- embedding_block
    } else {
      embedding <- embedding + embedding_block
    }

    graph_block <- nn_to_sparse(nn$idx, as.vector(graph_block),
      self_nbr = FALSE,
      max_nbr_id = nrow(train_embedding)
    )
    # Combine per-block graphs by fuzzy set intersection
    if (is.null(graph)) {
      graph <- graph_block
    } else {
      graph <- set_intersect(graph, graph_block, weight = 0.5, reset = TRUE)
    }
  }
  if (nblocks > 1) {
    embedding <- embedding / nblocks
  }

  # n_epochs is still NULL here only if the model stored no epoch count either
  if (is.null(n_epochs)) {
    if (ncol(graph) <= 10000) {
      n_epochs <- 100
    } else {
      n_epochs <- 30
    }
  } else {
    # NOTE(review): both a user-supplied n_epochs and the model's stored value
    # are scaled down to one third (minimum 2) for the transform
    n_epochs <- max(2, round(n_epochs / 3))
  }

  if (n_epochs > 0) {
    # Prune edges too weak to ever be sampled within n_epochs epochs
    graph@x[graph@x < max(graph@x) / n_epochs] <- 0
    graph <- Matrix::drop0(graph)

    epochs_per_sample <- make_epochs_per_sample(graph@x, n_epochs)
    # Zero-based head (new point) and tail (training point) indices of the
    # positive edges; @i of a dgCMatrix is already 0-based
    positive_head <- graph@i
    positive_tail <- Matrix::which(graph != 0, arr.ind = TRUE)[, 2] - 1

    tsmessage(
      "Commencing optimization for ", n_epochs, " epochs, with ",
      length(positive_head), " positive edges",
      pluralize("thread", n_sgd_threads, " using")
    )

    parallelize <- n_sgd_threads > 0
    if (n_sgd_threads > 0) {
      RcppParallel::setThreadOptions(numThreads = n_sgd_threads)
    }

    # The C++ optimizers expect coordinates column-wise (one point per column)
    embedding <- t(embedding)
    train_embedding <- t(train_embedding)
    # move_other = FALSE: only the new points move; the training embedding
    # stays fixed as the tail of every edge
    if (tolower(method) == "umap") {
      embedding <- optimize_layout_umap(
        head_embedding = embedding,
        tail_embedding = train_embedding,
        positive_head = positive_head,
        positive_tail = positive_tail, n_epochs = n_epochs,
        n_vertices = n_vertices,
        epochs_per_sample = epochs_per_sample, a = a, b = b, gamma = gamma,
        initial_alpha = alpha, negative_sample_rate, approx_pow = approx_pow,
        pcg_rand = pcg_rand, parallelize = parallelize,
        grain_size = grain_size, move_other = FALSE, verbose = verbose
      )
    } else {
      embedding <- optimize_layout_tumap(
        head_embedding = embedding,
        tail_embedding = train_embedding,
        positive_head = positive_head,
        positive_tail = positive_tail, n_epochs = n_epochs, n_vertices,
        epochs_per_sample, initial_alpha = alpha,
        negative_sample_rate = negative_sample_rate,
        pcg_rand = pcg_rand, parallelize = parallelize,
        grain_size = grain_size, move_other = FALSE, verbose = verbose
      )
    }
  }
  tsmessage("Finished")
  # Transpose back to one point per row before returning
  t(embedding)
}

# Initialize coordinates of new points from the coordinates of their nearest
# neighbors in the training embedding: either a weighted average (weights
# taken from the smoothed knn graph) or a plain average, delegated to the
# parallel C++ implementations.
init_new_embedding <- function(train_embedding, nn, graph, weighted = TRUE,
                               n_threads = max(1, RcppParallel::defaultNumThreads() / 2),
                               grain_size = 1,
                               verbose = FALSE) {
  parallelize <- n_threads > 0
  if (weighted) {
    tsmessage(
      "Initializing by weighted average of neighbor coordinates",
      pluralize("thread", n_threads, " using")
    )
    embedding <- init_transform_parallel(train_embedding, nn$idx, graph,
      parallelize = parallelize, grain_size = grain_size
    )
  } else {
    tsmessage(
      "Initializing by average of neighbor coordinates",
      pluralize("thread", n_threads, " using")
    )
    embedding <- init_transform_av_parallel(train_embedding, nn$idx,
      parallelize = parallelize, grain_size = grain_size
    )
  }

  embedding
}

# Pure R implementation of (weighted) average.
# Superseded by C++ implementations
# Initialize each new point as the (weighted) average of the coordinates of
# its nearest neighbors in the training embedding.
#
# train_embedding: matrix of training coordinates, one point per row.
# nn_index: matrix of training-set neighbor indices, one row per new point.
# weights: optional matrix of edge weights indexed as [neighbor, point];
#   if NULL, an unweighted mean is used.
# Returns a matrix with one row per new point.
init_transform <- function(train_embedding, nn_index, weights = NULL) {
  nr <- nrow(nn_index)
  nc <- ncol(train_embedding)

  embedding <- matrix(nrow = nr, ncol = nc)
  if (is.null(weights)) {
    for (i in 1:nr) {
      nbr_embedding <- train_embedding[nn_index[i, ], ]
      embedding[i, ] <- apply(nbr_embedding, 2, mean)
    }
  } else {
    for (i in 1:nr) {
      nbr_embedding <- train_embedding[nn_index[i, ], ]
      nbr_weights <- weights[nn_index[i, ], i]
      embedding[i, ] <- apply(
        nbr_embedding, 2,
        function(x) {
          stats::weighted.mean(x, nbr_weights)
        }
      )
    }
  }

  embedding
}

# Re-apply the scaling carried out on the training data, using the statistics
# stored in the model's scale_info. The type of scaling is detected from
# which keys are present in scale_info.
apply_scaling <- function(X, scale_info, verbose = FALSE) {
  if (!is.null(scale_info[["scaled:range:min"]])) {
    # Whole-matrix range scaling: subtract min, divide by max
    tsmessage("Applying training data range scaling")
    X <- X - scale_info[["scaled:range:min"]]
    X <- X / scale_info[["scaled:range:max"]]
  } else if (!is.null(scale_info[["scaled:maxabs"]])) {
    # Center columns, then divide by the stored maximum absolute value
    tsmessage("Applying training data max-abs scaling")
    X <- scale(X, center = scale_info[["scaled:center"]], scale = FALSE)
    X <- X / scale_info[["scaled:maxabs"]]
  } else if (!is.null(scale_info[["scaled:colrange:min"]])) {
    # Per-column range scaling (scale = "colrange")
    tsmessage("Applying training data column range scaling")
    X <- sweep(X, 2, scale_info[["scaled:colrange:min"]])
    X <- sweep(X, 2, scale_info[["scaled:colrange:max"]], `/`)
  } else {
    # Z-scaling: keep the columns retained during training, then center
    # and scale with the stored statistics
    tsmessage("Applying training data column filtering/scaling")
    X <- X[, scale_info[["scaled:nzvcols"]]]
    X <- scale(X,
      center = scale_info[["scaled:center"]],
      scale = scale_info[["scaled:scale"]]
    )
  }
  X
}

# Apply a previously calculated set of PCA rotations.
#
# X: matrix of data to project.
# pca_res: list with the rotation matrix in $rotation and, optionally, the
#   column centers in $center.
# Returns X projected onto the rotation (ncol(pca_res$rotation) columns).
apply_pca <- function(X, pca_res, verbose = FALSE) {
  # Log the *reduced* dimension: previously this reported ncol(X), which at
  # this point is the input dimension, not the dimension being reduced to
  tsmessage("Applying PCA reducing to ", ncol(pca_res$rotation), " dimensions")
  if (!is.null(pca_res$center)) {
    X <- sweep(X, 2, pca_res$center)
  }
  X %*% pca_res$rotation
}
uwot/NEWS.md0000644000176200001440000003274313571520503012360 0ustar liggesusers# uwot 0.1.5 ## Bug fixes and minor improvements * The R API was being accessed from inside multi-threaded code to seed the (non-R) random number generators.
Probably this was causing users in downstream projects (seurat and monocle) to experience strange RcppParallel-related crashes. Thanks to [aldojongejan](https://github.com/aldojongejan) for reporting this (). * Passing a floating point value smaller than one to `n_threads` caused a crash. This was particularly insidious if running with a system with only one default thread available as the default `n_threads` becomes `0.5`. Now `n_threads` (and `n_sgd_threads`) are rounded to the nearest integer. * Initialization of supervised UMAP should now be faster (). Contributed by [Aaron Lun](https://github.com/LTLA). # uwot 0.1.4 ## Bug fixes and minor improvements * Fixed incorrect loading of Annoy indexes to be compatible with newer versions of RcppAnnoy (). My thanks to Dirk Eddelbuettel and Erik Bernhardsson for aid in identifying the problem. * Fix for `ERROR: there is already an InterruptableProgressMonitor instance defined`. * If `verbose = TRUE`, the `a`, `b` curve parameters are now logged. # uwot 0.1.3 ## Bug fixes and minor improvements * Fixed an issue where the session would crash if the Annoy nearest neighbor search was unable to find k neighbors for an item. ## Known issue Even with a fix for the bug mentioned above, if the nearest neighbor index file is larger than 2GB in size, Annoy may not be able to read the data back in. This should only occur with very large or high-dimensional datasets. The nearest neighbor search will fail under these conditions. A work-around is to set `n_threads = 0`, because the index will not be written to disk and re-loaded under these circumstances, at the cost of a longer search time. Alternatively, set the `pca` parameter to reduce the dimensionality or lower `n_trees`, both of which will reduce the size of the index on disk. However, either may lower the accuracy of the nearest neighbor results. # uwot 0.1.2 Initial CRAN release. 
## New features * New parameter, `tmpdir`, which allows the user to specify the temporary directory where nearest neighbor indexes will be written during Annoy nearest neighbor search. The default is `base::tempdir()`. Only used if `n_threads > 1` and `nn_method = "annoy"`. ## Bug fixes and minor improvements * Fixed an issue with `lvish` where there was an off-by-one error when calculating input probabilities. * Added a safe-guard to `lvish` to prevent the gaussian precision, beta, becoming overly large when the binary search fails during perplexity calibration. * The `lvish` perplexity calibration uses the log-sum-exp trick to avoid numeric underflow if beta becomes large. # uwot 0.0.0.9010 (31 March 2019) ## New features * New parameter: `pcg_rand`. If `TRUE` (the default), then a random number generator from [the PCG family](http://www.pcg-random.org/) is used during the stochastic optimization phase. The old PRNG, a direct translation of an implementation of the Tausworthe "taus88" PRNG used in the Python version of UMAP, can be obtained by setting `pcg_rand = FALSE`. The new PRNG is slower, but is likely superior in its statistical randomness. This change in behavior will be break backwards compatibility: you will now get slightly different results even with the same seed. * New parameter: `fast_sgd`. If `TRUE`, then the following combination of parameters are set: `n_sgd_threads = "auto"`, `pcg_rand = FALSE` and `approx_pow = TRUE`. These will result in a substantially faster optimization phase, at the cost of being slightly less accurate and results not being exactly repeatable. `fast_sgd = FALSE` by default but if you are only interested in visualization, then `fast_sgd` gives perfectly good results. For more generic dimensionality reduction and reproducibility, keep `fast_sgd = FALSE`. * New parameter: `init_sdev` which specifies how large the standard deviation of each column of the initial coordinates should be. 
This will scale any input coordinates (including user-provided matrix coordinates). `init = "spca"` can now be thought of as an alias of `init = "pca", init_sdev = 1e-4`. This may be too aggressive scaling for some datasets. The typical UMAP spectral initializations tend to result in standard deviations of around `2` to `5`, so this might be more appropriate in some cases. If spectral initialization detects multiple components in the affinity graph and falls back to scaled PCA, it uses `init_sdev = 1`. * As a result of adding `init_sdev`, the `init` options `sspectral`, `slaplacian` and `snormlaplacian` have been removed (they weren't around for very long anyway). You can get the same behavior by e.g. `init = "spectral", init_sdev = 1e-4`. `init = "spca"` is sticking around because I use it a lot. ## Bug fixes and minor improvements * Spectral initialization (the default) was sometimes generating coordinates that had too large a range, due to an erroneous scale factor that failed to account for negative coordinate values. This could give rise to embeddings with very noticeable outliers distant from the main clusters. * Also during spectral initialization, the amount of noise being added had a standard deviation an order of magnitude too large compared to the Python implementation (this probably didn't make any difference though). * If requesting a spectral initialization, but multiple disconnected components are present, fall back to `init = "spca"`. * Removed dependency on C++ `` header. This breaks backwards compatibility even if you set `pcg_rand = FALSE`. * `metric = "cosine"` results were incorrectly using the unmodified Annoy angular distance. * Numeric matrix columns can be specified as the target for the `categorical` metric (fixes ). # uwot 0.0.0.9009 (1 January 2019) * Data is now stored column-wise during optimization, which should result in an increase in performance for larger values of `n_components` (e.g. 
approximately 50% faster optimization time with MNIST and `n_components = 50`). * New parameter: `pca_center`, which controls whether to center the data before applying PCA. It would be typical to set this to `FALSE` if you are applying PCA to binary data (although note you can't use this with setting with `metric = "hamming"`) * PCA will now be used when the `metric` is `"manhattan"` and `"cosine"`. It's still *not* applied when using `"hamming"` (data still needs to be in binary format, not real-valued). * If using mixed datatypes, you may override the `pca` and `pca_center` parameter values for a given data block by using a list for the value of the metric, with the column ids/names as an unnamed item and the overriding values as named items, e.g. instead of `manhattan = 1:100`, use `manhattan = list(1:100, pca_center = FALSE)` to turn off PCA centering for just that block. This functionality exists mainly for the case where you have mixed binary and real-valued data and want to apply PCA to both data types. It's normal to apply centering to real-valued data but not to binary data. ## Bug fixes and minor improvements * Fixed bug that affected `umap_transform`, where negative sampling was over the size of the test data (should be the training data). * Some other performance improvements (around 10% faster for the optimization stage with MNIST). * When `verbose = TRUE`, log the Annoy recall accuracy, which may help tune values of `n_trees` and `search_k`. # uwot 0.0.0.9008 (December 23 2018) ## New features * New parameter: `n_sgd_threads`, which controls the number of threads used in the stochastic gradient descent. By default this is now single-threaded and should result in reproducible results when using `set.seed`. To get back the old, less consistent, but faster settings, set `n_sgd_threads = "auto"`. * API change for consistency with Python UMAP: * `alpha` is now `learning_rate`. * `gamma` is now `repulsion_strength`. 
* Default spectral initialization now looks for disconnected components and initializes them separately (also applies to `laplacian` and `normlaplacian`). * New `init` options: `sspectral`, `snormlaplacian` and `slaplacian`. These are like `spectral`, `normlaplacian`, `laplacian` respectively, but scaled so that each dimension has a standard deviation of 1e-4. This is like the difference between the `pca` and `spca` options. ## Bug fixes and minor improvements * Hamming distance support (was actually using Euclidean distance). * Smooth knn/perplexity calibration results had a small dependency on the number of threads used. * Anomalously long spectral initialization times should now be reduced. * Internal changes and fixes thanks to a code review by [Aaron Lun](https://github.com/ltla). # uwot 0.0.0.9007 (December 9 2018) ## New features * New parameter `pca`: set this to a positive integer to reduce matrix of data frames to that number of columns using PCA. Only works if `metric = "euclidean"`. If you have > 100 columns, this can substantially improve the speed of the nearest neighbor search. t-SNE implementations often set this value to 50. ## Bug fixes and minor improvements * Laplacian Eigenmap initialization convergence failure is now correctly detected. * C++ code was over-writing data passed from R as a function argument. # uwot 0.0.0.9006 (December 5 2018) ## New features * Highly experimental mixed data type support for `metric`: instead of specifying a single metric name (e.g. `metric = "euclidean"`), you can pass a list, where the name of each item is the metric to use and the value is a vector of the names of the columns to use with that metric, e.g. `metric = list("euclidean" = c("A1", "A2"), "cosine" = c("B1", "B2", "B3"))` treats columns `A1` and `A2` as one block, using the Euclidean distance to find nearest neighbors, whereas `B1`, `B2` and `B3` are treated as a second block, using the cosine distance. 
* Factor columns can also be used in the metric, using the metric name `categorical`. * `y` may now be a data frame or matrix if multiple target data is available. * New parameter `target_metric`, to specify the distance metric to use with numerical `y`. This has the same capabilities as `metric`. * Multiple external nearest neighbor data sources are now supported. Instead of passing a list of two matrices, pass a list of lists, one for each external metric. * More details on mixed data types can be found at . * Compatibility with older versions of RcppParallel (contributed by [sirusb](https://github.com/sirusb)). * `scale = "Z"` To Z-scale each column of input (synonym for `scale = TRUE` or `scale = "scale"`). * New scaling option, `scale = "colrange"` to scale columns in the range (0, 1). # uwot 0.0.0.9005 (November 4 2018) ## New features * Hamming distance is now supported, due to upgrade to RcppAnnoy 0.0.11. # uwot 0.0.0.9004 (October 21 2018) ## New features * For supervised UMAP with numeric `y`, you may pass nearest neighbor data directly, in the same format as that supported by `X`-related nearest neighbor data. This may be useful if you don't want to use Euclidean distances for the `y` data, or if you have missing data (and have a way to assign nearest neighbors for those cases, obviously). See the [Nearest Neighbor Data Format](https://github.com/jlmelville/uwot#nearest-neighbor-data-format) section for details. # uwot 0.0.0.9003 (September 22 2018) ## New features * New parameter `ret_nn`: when `TRUE` returns nearest neighbor matrices as a `nn` list: indices in item `idx` and distances in item `dist`. Embedded coordinates are in `embedding`. Both `ret_nn` and `ret_model` can be `TRUE`, and should not cause any compatibility issues with supervised embeddings. * `nn_method` can now take precomputed nearest neighbor data. Must be a list of two matrices: `idx`, containing integer indexes, and `dist` containing distances. 
By no coincidence, this is the format return by `ret_nn`. ## Bug fixes and minor improvements * Embedding to `n_components = 1` was broken () * User-supplied matrices to `init` parameter were being modified, in defiance of basic R pass-by-copy semantics. # uwot 0.0.0.9002 (August 14 2018) ## Bug fixes and minor improvements * `metric = "cosine"` is working again for `n_threads` greater than `0` () # uwot 0.0.0.9001 ## New features * *August 5 2018*. You can now use an existing embedding to add new points via `umap_transform`. See the example section below. * *August 1 2018*. Numerical vectors are now supported for supervised dimension reduction. * *July 31 2018*. (Very) initial support for supervised dimension reduction: categorical data only at the moment. Pass in a factor vector (use `NA` for unknown labels) as the `y` parameter and edges with bad (or unknown) labels are down-weighted, hopefully leading to better separation of classes. This works remarkably well for the Fashion MNIST dataset. * *July 22 2018*. You can now use the cosine and Manhattan distances with the Annoy nearest neighbor search, via `metric = "cosine"` and `metric = "manhattan"`, respectively. Hamming distance is not supported because RcppAnnoy doesn't yet support it. 
uwot/MD50000644000176200001440000000556113571756262011605 0ustar liggesusers17c783de6a096cec14508033d3ee985f *DESCRIPTION cac5006a55ff422e898deb373ca1acb5 *NAMESPACE fe74ad7f11938178b7c2c06256397935 *NEWS.md 30b89a123e22deaf8dbfed03ee317f96 *R/RcppExports.R 64c02b71984ef37d9891320f687dd088 *R/affinity.R 1badef676d44497787d884a6d9213418 *R/init.R d78fc4c3fbac93e40b542623f10825d4 *R/neighbors.R d6ceca0b9b353a9097c7c957472ec6cb *R/progress.R ba63b10e43d5868f4d0fb2d85e43e02f *R/supervised.R 939b1811af3f640586019f00e540fddd *R/transform.R 4f71ade05255618d2f0ce4f292732547 *R/util.R 2cfb54a1c558cd83e100fdbca46df325 *R/uwot.R 79a209ee93b72456ce6a3f1c4f6f22a3 *man/load_uwot.Rd d5be49ac64d737a8bbcd9cd84e9f22ab *man/lvish.Rd 8811b5f5f43824d9c24dd879af7b285b *man/save_uwot.Rd e5d2086d16212d968fe0c0dbdc96adf6 *man/tumap.Rd 866e19b8698ba2bcd6acbf2dfdd9486e *man/umap.Rd bbcecadf9d1cc7f16bbf271feb9f30b6 *man/umap_transform.Rd 926872a7d6913edfc42c9409a4a42074 *src/Makevars 3481ee78c6d8b71aa644661617e8ca92 *src/Makevars.win 688c8dc689162b42ba90b3fbff99f3c8 *src/RcppExports.cpp 070cdc25e0445a88a484fc96c9a324f8 *src/connected_components.cpp 1c121f4f1b0c2127b7787b2376d7ba35 *src/gradient.cpp 79d4fb128a721e08a9de8578ca36451a *src/gradient.h c8c6f28e5e1253ad25c6e5f817e326f7 *src/nn_parallel.cpp 0e4289c389d20c7316a1902565ce9eed *src/optimize.cpp 548dfa0591aa1bcea69b08baa85840ac *src/perplexity.cpp ddf50ff6bff49c60f5e6560bc8dc31f0 *src/sampler.cpp e56d26d48cc016ea87609bd9e0acaeb4 *src/sampler.h 651bfb7adb073b58832ac163ae6e746f *src/smooth_knn.cpp 2557cf0ebcf7cbba68b21c4bd87c3dae *src/supervised.cpp beb02dc6cff2621a6d6f1d055253224c *src/tauprng.h cdf67b1135a697b19f1556f7044fe4d1 *src/transform.cpp d5b47dc43e1e70a8e4159fc6f8e12ae0 *tests/testthat.R e79f859e59680320ec1c8ac07025508d *tests/testthat/helper_data.R 9276dc0a7cbdbec66b0bcd486faf37a1 *tests/testthat/helper_fuzzy_sets.R 7e61ccac826e1f333e860c93929711de *tests/testthat/test_curve.R 4ecbd44ecf8cca9b6ff86200140d9384 
*tests/testthat/test_epochs.R 4c522997a46b0b28a497fec2345b9cd2 *tests/testthat/test_errors.R bbdeaa84c99c221f1975bad3026608c3 *tests/testthat/test_fuzzy_simplicial_set.R e79a2456737672ced0c8581350873b0a *tests/testthat/test_knn_aff.R 887e3f7cb9d5f4b331ad641887536649 *tests/testthat/test_mixed_distances.R 2c287a2cb7df7a7d870cad61976d2249 *tests/testthat/test_neighbors.R 356cdc108c30172aa3560f98bb27034a *tests/testthat/test_output.R f12c7619938065c2a366cf6bcd109c47 *tests/testthat/test_pca.R 1b8175d8b62d749ffb764ac03dbb7c3f *tests/testthat/test_perplexity.R d8b53bf4154f29dfbd0523601950e099 *tests/testthat/test_rand_init.R b8707aa7755da10090ebaf1636749e87 *tests/testthat/test_saveload.R dd0b5c4a8d80d9388301126dd83c0d0c *tests/testthat/test_scale.R b1d673889815d906cd006744dbe30603 *tests/testthat/test_smooth_knn_dists.R be81ce243a43606a3b19b6972f5e549e *tests/testthat/test_spectral.R 89f36d15d5006ca51dd374d19d4b42d6 *tests/testthat/test_supervised.R aa8df99e31988c9738af52a3174cc62a *tests/testthat/test_transform.R