RcppML/0000755000176200001440000000000014122425662011413 5ustar liggesusersRcppML/NAMESPACE0000644000176200001440000000042614111717072012631 0ustar liggesusersuseDynLib(RcppML) import(Rcpp) import(Matrix) importFrom("methods", "as", "is", "canCoerce", "new") importFrom("stats", "runif") export(nnls) export(project) export(mse) export(nmf) export(bipartition) export(dclust) export(setRcppMLthreads) export(getRcppMLthreads)RcppML/man/0000755000176200001440000000000014122412744012163 5ustar liggesusersRcppML/man/bipartition.Rd0000644000176200001440000000651714122422775015014 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/bipartition.R \name{bipartition} \alias{bipartition} \title{Bipartition a sample set} \usage{ bipartition( A, tol = 1e-05, maxit = 100, nonneg = TRUE, samples = 1:ncol(A), seed = NULL, verbose = FALSE, calc_dist = FALSE, diag = TRUE ) } \arguments{ \item{A}{matrix of features-by-samples in dense or sparse format (preferred classes are "matrix" or "Matrix::dgCMatrix", respectively). Prefer sparse storage when more than half of all values are zero.} \item{tol}{stopping criteria, the correlation distance between \eqn{w} across consecutive iterations, \eqn{1 - cor(w_i, w_{i-1})}} \item{maxit}{stopping criteria, maximum number of alternating updates of \eqn{w} and \eqn{h}} \item{nonneg}{enforce non-negativity} \item{samples}{samples to include in bipartition, numbered from 1 to \code{ncol(A)}. Default is \code{1:ncol(A)}, all samples.} \item{seed}{random seed for model initialization} \item{verbose}{print model tolerances between iterations} \item{calc_dist}{calculate the relative cosine distance of samples within a cluster to either cluster centroid. If \code{TRUE}, centers for clusters will also be calculated.} \item{diag}{scale factors in \eqn{w} and \eqn{h} to sum to 1 by introducing a diagonal, \eqn{d}. This should generally never be set to \code{FALSE}. Diagonalization enables symmetry of models in factorization of symmetric matrices, convex L1 regularization, and consistent factor scalings.} } \value{ A list giving the bipartition and useful statistics: \itemize{ \item v : vector giving difference between sample loadings between factors in a rank-2 factorization \item dist : relative cosine distance of samples within a cluster to centroids of assigned vs. not-assigned cluster \item size1 : number of samples in first cluster (positive loadings in 'v') \item size2 : number of samples in second cluster (negative loadings in 'v') \item samples1: indices of samples in first cluster \item samples2: indices of samples in second cluster \item center1 : mean feature loadings across samples in first cluster \item center2 : mean feature loadings across samples in second cluster } } \description{ Spectral bipartitioning by rank-2 matrix factorization } \details{ Spectral bipartitioning is a popular subroutine in divisive clustering. The sign of the difference between sample loadings in the two factors of a rank-2 matrix factorization gives a bipartition that is nearly identical to that obtained from an SVD. Rank-2 matrix factorization by alternating least squares is faster than rank-2 truncated SVD (i.e. \emph{irlba}). This function is a specialization of rank-2 \code{\link{nmf}} with support for factorization of only a subset of samples, and with additional calculations on the factorization model relevant to bipartitioning. See \code{\link{nmf}} for details regarding rank-2 factorization.
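For intuition, the partition corresponds to the sign of the difference between the two rows of \eqn{h} in a rank-2 \code{\link{nmf}} model. A minimal sketch of that relationship (assuming \code{A} is features-by-samples and a fixed \code{seed}; \code{bipartition} computes these statistics internally and more efficiently):
\preformatted{
m <- nmf(A, k = 2, seed = 1, verbose = FALSE)
v <- m$h[1, ] - m$h[2, ]     # difference between sample loadings
samples1 <- which(v > 0)     # first cluster (positive loadings in "v")
samples2 <- which(v <= 0)    # second cluster (negative loadings in "v")
}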
} \examples{ \dontrun{ library(Matrix) data(iris) A <- as(as.matrix(iris[,1:4]), "dgCMatrix") bipartition(A, calc_dist = TRUE) } } \references{ Kuang, D, Park, H. (2013). "Fast rank-2 nonnegative matrix factorization for hierarchical document clustering." Proc. 19th ACM SIGKDD intl. conf. on Knowledge discovery and data mining. } \seealso{ \code{\link{nmf}}, \code{\link{dclust}} } \author{ Zach DeBruine } RcppML/man/getRcppMLthreads.Rd0000644000176200001440000000156314122404674015672 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/threads.R \name{getRcppMLthreads} \alias{getRcppMLthreads} \title{Get the number of threads RcppML should use} \usage{ getRcppMLthreads() } \value{ integer giving the number of threads to be used by RcppML functions. \code{0} corresponds to all available threads, as determined by OpenMP. } \description{ Get the number of threads that will be used by RcppML functions supporting parallelization with OpenMP. Use \code{\link{setRcppMLthreads}} to set the number of threads to be used. } \examples{ \dontrun{ # set serial configuration setRcppMLthreads(1) getRcppMLthreads() # restore default parallel configuration, # letting OpenMP decide how many threads to use setRcppMLthreads(0) getRcppMLthreads() } } \seealso{ \code{\link{setRcppMLthreads}} } \author{ Zach DeBruine } RcppML/man/RcppML.Rd0000644000176200001440000000061414122412744013610 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/RcppML.R \docType{package} \name{RcppML} \alias{RcppML} \alias{RcppML-package} \title{RcppML: Rcpp Machine Learning Library} \description{ High-performance non-negative matrix factorization and linear model projection for sparse matrices, and fast non-negative least squares implementations } \author{ Zach DeBruine } RcppML/man/project.Rd0000644000176200001440000001057014122422775014130 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/project.R \name{project} \alias{project} \title{Project a linear factor model} \usage{ project(A, w = NULL, h = NULL, nonneg = TRUE, L1 = 0, mask_zeros = FALSE) } \arguments{ \item{A}{matrix of features-by-samples in dense or sparse format (preferred classes are "matrix" or "Matrix::dgCMatrix", respectively). Prefer sparse storage when more than half of all values are zero.} \item{w}{dense matrix of factors x features giving the linear model to be projected (if \code{h = NULL})} \item{h}{dense matrix of factors x samples giving the linear model to be projected (if \code{w = NULL})} \item{nonneg}{enforce non-negativity} \item{L1}{L1/LASSO penalty to be applied. No scaling is performed. See details.} \item{mask_zeros}{handle zeros as missing values, available only when \code{A} is sparse} } \value{ matrix \eqn{h} or \eqn{w} } \description{ Solves the equation \eqn{A = wh} for either \eqn{h} or \eqn{w} given either \eqn{w} or \eqn{h} and \eqn{A} } \details{ For the classical alternating least squares matrix factorization update problem \eqn{A = wh}, the update (or projection) of \eqn{h} is given by the equation: \deqn{w^Twh = w^TA_j} which is in the form \eqn{ax = b}, where \eqn{a = w^Tw}, \eqn{x = h}, and \eqn{b = w^TA_j} for all columns \eqn{j} in \eqn{A}. Given \eqn{A}, \code{project} can solve for either \eqn{w} or \eqn{h} given the other: \itemize{ \item When given \eqn{A} and \eqn{w}, \eqn{h} is found using a highly efficient parallelization scheme.
\item When given \eqn{A} and \eqn{h}, \eqn{w} is found without transposition of \eqn{A} (as would be the case in traditional block-pivoting matrix factorization) by accumulating the right-hand sides of linear systems in-place in \eqn{A}, then solving the systems. Note that \eqn{w} may also be found by inputting the transpose of \eqn{A} and \eqn{h} in place of \eqn{w} (i.e. \code{A = t(A), w = h, h = NULL}). However, for most applications, the cost of a single in-place projection is less than that of transposing \eqn{A}. For matrix factorization, on the other hand, it is desirable to transpose \eqn{A} if possible because many projections are needed. } \strong{Parallelization.} Least squares projections in factorizations of rank-3 and greater are parallelized using the number of threads set by \code{\link{setRcppMLthreads}}. By default, all available threads are used, see \code{\link{getRcppMLthreads}}. The overhead of parallelization is too great for rank-1 and rank-2 factorization. \strong{L1 Regularization.} Any L1 penalty is subtracted from \eqn{b} and should generally be scaled to \code{max(b)}, where \eqn{b = w^TA_j} for all columns \eqn{j} in \eqn{A}. An easy way to properly scale an L1 penalty is to normalize all columns in \eqn{w} to sum to 1. No scaling is applied in this function. Such scaling guarantees that \code{L1 = 1} gives a completely sparse solution. \strong{Specializations.} There are specializations for symmetric input matrices, and for rank-1 and rank-2 projections. See documentation for \code{\link{nmf}} for theoretical details and guidance. \strong{Publication reference.} For theoretical and practical considerations, please see our manuscript: "DeBruine ZJ, Melcher K, Triche TJ (2021) High-performance non-negative matrix factorization for large single cell data." on BioRXiv. } \examples{ \dontrun{ library(Matrix) w <- matrix(runif(1000 * 10), 1000, 10) h_true <- matrix(runif(10 * 100), 10, 100) # A is the product of "w" and "h_true" with 10\% signal dropout A <- (w \%*\% h_true) * (rsparsematrix(1000, 100, 0.9) > 0) h <- project(A, w) cor(as.vector(h_true), as.vector(h)) # alternating projections refine solution (like NMF) mse_bad <- mse(A, w, rep(1, ncol(w)), h) # mse before alternating updates h <- project(A, w = w) w <- project(A, h = h) h <- project(A, w) w <- project(A, h = h) h <- project(A, w) w <- t(project(A, h = h)) mse_better <- mse(A, w, rep(1, ncol(w)), h) # mse after alternating updates mse_better < mse_bad # two ways to solve for "w" that give the same solution w <- project(A, h = h) w2 <- project(t(A), w = t(h)) all.equal(w, w2) } } \references{ DeBruine, ZJ, Melcher, K, and Triche, TJ. (2021). "High-performance non-negative matrix factorization for large single-cell data." BioRXiv. } \seealso{ \code{\link{nnls}}, \code{\link{nmf}} } \author{ Zach DeBruine } RcppML/man/nmf.Rd0000644000176200001440000002306414122422775013244 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/nmf.R \name{nmf} \alias{nmf} \title{Non-negative matrix factorization} \usage{ nmf( A, k, tol = 1e-04, maxit = 100, verbose = TRUE, L1 = c(0, 0), seed = NULL, mask_zeros = FALSE, diag = TRUE, nonneg = TRUE ) } \arguments{ \item{A}{matrix of features-by-samples in dense or sparse format (preferred classes are "matrix" or "Matrix::dgCMatrix", respectively).
Prefer sparse storage when more than half of all values are zero.} \item{k}{rank} \item{tol}{stopping criteria, the correlation distance between \eqn{w} across consecutive iterations, \eqn{1 - cor(w_i, w_{i-1})}} \item{maxit}{stopping criteria, maximum number of alternating updates of \eqn{w} and \eqn{h}} \item{verbose}{print model tolerances between iterations} \item{L1}{L1/LASSO penalties between 0 and 1, array of length two for \code{c(w, h)}} \item{seed}{random seed for model initialization} \item{mask_zeros}{handle zeros as missing values, available only when \code{A} is sparse} \item{diag}{scale factors in \eqn{w} and \eqn{h} to sum to 1 by introducing a diagonal, \eqn{d}. This should generally never be set to \code{FALSE}. Diagonalization enables symmetry of models in factorization of symmetric matrices, convex L1 regularization, and consistent factor scalings.} \item{nonneg}{enforce non-negativity} } \value{ A list giving the factorization model: \itemize{ \item w : feature factor matrix \item d : scaling diagonal vector \item h : sample factor matrix \item tol : tolerance between models at final update \item iter : number of alternating updates run } } \description{ Sparse matrix factorization of the form \eqn{A = wdh} by alternating least squares with optional non-negativity constraints. } \details{ This fast non-negative matrix factorization (NMF) implementation decomposes a matrix \eqn{A} into lower-rank non-negative matrices \eqn{w} and \eqn{h}, with factors scaled to sum to 1 via multiplication by a diagonal, \eqn{d}: \deqn{A = wdh} The scaling diagonal enables symmetric factorization, convex L1 regularization, and consistent factor scalings regardless of random initialization. The factorization model is randomly initialized, and \eqn{w} and \eqn{h} are updated alternately using least squares. Given \eqn{A} and \eqn{w}, \eqn{h} is updated according to the equation: \deqn{w^Twh = w^TA_j} This equation is in the form \eqn{ax = b} where \eqn{a = w^Tw}, \eqn{x = h}, and \eqn{b = w^TA_j} for all columns \eqn{j} in \eqn{A}. The corresponding update for \eqn{w} is \deqn{hh^Tw^T = hA^T_j} \strong{Stopping criteria.} Alternating least squares projections (see the \code{\link{project}} subroutine) are repeated until a stopping criterion is satisfied, which is either a maximum number of iterations or a tolerance based on the correlation distance between models (\eqn{1 - cor(w_i, w_{i-1})}) across consecutive iterations. Use the \code{tol} parameter to control the stopping criteria for alternating updates: \itemize{ \item \code{tol = 1e-2} is appropriate for approximate mean squared error determination and coarse cross-validation, useful for rank determination. \item \code{tol = 1e-3} to \code{1e-4} are suitable for rapid experimentation, cross-validation, and preliminary analysis. \item \code{tol = 1e-5} and smaller for publication-quality runs \item \code{tol = 1e-10} and smaller for robust factorizations at or near machine-precision } \strong{Parallelization.} Least squares projections in factorizations of rank-3 and greater are parallelized using the number of threads set by \code{\link{setRcppMLthreads}}. By default, all available threads are used, see \code{\link{getRcppMLthreads}}. The overhead of parallelization is too great to benefit rank-1 and rank-2 factorization. \strong{Specializations.} There are specializations for symmetric matrices, and for rank-1 and rank-2 factorization. \strong{L1 regularization}.
L1 penalization increases the sparsity of factors, but does not change the information content of the model or the relative contributions of the leading coefficients in each factor to the model. L1 regularization only slightly increases the error of a model. L1 penalization should be used to aid interpretability. Penalty values should range from 0 to 1, where 1 gives complete sparsity. In this implementation of NMF, a scaling diagonal ensures that the L1 penalty is equally applied across all factors regardless of random initialization and the distribution of the model. Many other implementations of matrix factorization claim to apply L1, but the magnitude of the penalty is at the mercy of the random distribution and more significantly affects factors with lower overall contribution to the model. L1 regularization of rank-1 and rank-2 factorizations has no effect. \strong{Rank-2 factorization.} When \eqn{k = 2}, a very fast optimized algorithm is used. Two-variable least squares solutions to the problem \eqn{ax = b} are found by direct substitution: \deqn{x_1 = \frac{a_{22}b_1 - a_{12}b_2}{a_{11}a_{22} - a_{12}^2}} \deqn{x_2 = \frac{a_{11}b_2 - a_{12}b_1}{a_{11}a_{22} - a_{12}^2}} In the above equations, the denominator is constant and thus needs to be calculated only once. Additionally, when non-negativity constraints are imposed, if \eqn{x_1 < 0} then \eqn{x_1 = 0} and \eqn{x_2 = \frac{b_2}{a_{22}}}. Similarly, if \eqn{x_2 < 0} then \eqn{x_2 = 0} and \eqn{x_1 = \frac{b_1}{a_{11}}}. Rank-2 NMF is useful for bipartitioning, and is a subroutine in \code{\link{bipartition}}, where the sign of the difference between sample loadings in both factors gives the partitioning. \strong{Rank-1 factorization.} Rank-1 factorization by alternating least squares gives vectors equivalent to the first singular vectors in an SVD. It is a normalization of the data to a middle point, and may be useful for ordering samples based on the most significant axis of variation (i.e. pseudotime trajectories). Diagonal scaling guarantees consistent linear scaling of the factor across random restarts. \strong{Random seed and reproducibility.} Results of rank-1 and rank-2 factorizations should be reproducible regardless of random seed. For higher-rank models, results across random restarts should, in theory, be comparable at very strict tolerances (i.e. machine precision for \emph{double}, corresponding to about \code{tol = 1e-10}). However, to guarantee reproducibility without such strict tolerances, set the \code{seed} argument. Note that \code{set.seed()} will not work. Only random initialization is supported, as other methods incur unnecessary overhead and sometimes trap updates into local minima. \strong{Rank determination.} This function does not attempt to provide a method for rank determination. Like any clustering algorithm or dimensional reduction, finding the optimal rank can be subjective. An easy way to estimate rank uses the "elbow method", where the inflection point on a plot of Mean Squared Error loss (MSE) vs. rank gives a good idea of the rank at which most of the signal has been captured in the model. Unfortunately, this inflection point is often not as obvious for NMF as it is for SVD or PCA. k-fold cross-validation is a better method. Missing value imputation has previously been proposed, but is arguably no less subjective than test-training splits and requires computationally slower factorization updates using missing values, which are not supported here.
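As a rough illustration of the MSE-vs.-rank "elbow" approach described above (a sketch only; the candidate ranks are arbitrary, and cross-validation remains preferable):
\preformatted{
ranks <- c(1, 2, 3, 5, 10, 15, 20)            # illustrative candidate ranks
errs <- sapply(ranks, function(k) {
  m <- nmf(A, k, tol = 1e-3, verbose = FALSE)
  mse(A, m$w, m$d, m$h)
})
plot(ranks, errs, type = "b")                 # look for an inflection ("elbow") point
}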
\strong{Symmetric factorization.} Special optimization for symmetric matrices is automatically applied. Specifically, alternating updates of \code{w} and \code{h} require transposition of \code{A}, but \code{A == t(A)} when \code{A} is symmetric, thus no up-front transposition is performed. \strong{Zero-masking}. When zeros in a data structure can be regarded as "missing", \code{mask_zeros = TRUE} may be set. However, this requires a slower algorithm, and tolerances will fluctuate more dramatically. \strong{Publication reference.} For theoretical and practical considerations, please see our manuscript: "DeBruine ZJ, Melcher K, Triche TJ (2021) High-performance non-negative matrix factorization for large single cell data." on BioRXiv. } \examples{ \dontrun{ library(Matrix) # basic NMF model <- nmf(rsparsematrix(1000, 100, 0.1), k = 10) # compare rank-2 NMF to second left vector in an SVD data(iris) A <- as(as.matrix(iris[,1:4]), "dgCMatrix") nmf_model <- nmf(A, 2, tol = 1e-5) bipartitioning_vector <- apply(nmf_model$w, 1, diff) second_left_svd_vector <- base::svd(A, 2)$u[,2] abs(cor(bipartitioning_vector, second_left_svd_vector)) # compare rank-1 NMF with first singular vector in an SVD abs(cor(nmf(A, 1)$w[,1], base::svd(A, 2)$u[,1])) # symmetric NMF A <- crossprod(rsparsematrix(100, 100, 0.02)) model <- nmf(A, 10, tol = 1e-5, maxit = 1000) plot(model$w, t(model$h)) # see package vignette for more examples } } \references{ DeBruine, ZJ, Melcher, K, and Triche, TJ. (2021). "High-performance non-negative matrix factorization for large single-cell data." BioRXiv. Lin, X, and Boutros, PC (2020). "Optimization and expansion of non-negative matrix factorization." BMC Bioinformatics. Lee, D, and Seung, HS (1999). "Learning the parts of objects by non-negative matrix factorization." Nature. Franc, VC, Hlavac, VC, Navara, M. (2005). "Sequential Coordinate-Wise Algorithm for the Non-negative Least Squares Problem". Proc. Int'l Conf. Computer Analysis of Images and Patterns. Lecture Notes in Computer Science. } \seealso{ \code{\link{nnls}}, \code{\link{project}}, \code{\link{mse}} } \author{ Zach DeBruine } RcppML/man/dclust.Rd0000644000176200001440000001102714122422775013756 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/dclust.R \name{dclust} \alias{dclust} \title{Divisive clustering} \usage{ dclust( A, min_samples, min_dist = 0, verbose = TRUE, tol = 1e-05, maxit = 100, nonneg = TRUE, seed = NULL ) } \arguments{ \item{A}{matrix of features-by-samples in sparse format (preferred class is "Matrix::dgCMatrix")} \item{min_samples}{stopping criteria giving the minimum number of samples permitted in a cluster} \item{min_dist}{stopping criteria giving the minimum cosine distance of samples within a cluster to the center of their assigned vs. unassigned cluster. 
If \code{0}, neither this distance nor cluster centroids will be calculated.} \item{verbose}{print number of divisions in each generation} \item{tol}{in rank-2 NMF, the correlation distance (\eqn{1 - R^2}) between \eqn{w} across consecutive iterations at which to stop factorization} \item{maxit}{stopping criteria, maximum number of alternating updates of \eqn{w} and \eqn{h}} \item{nonneg}{in rank-2 NMF, enforce non-negativity} \item{seed}{random seed for rank-2 NMF model initialization} } \value{ A list of lists corresponding to individual clusters: \itemize{ \item id : character sequence of "0" and "1" giving position of clusters along splitting hierarchy \item samples : indices of samples in the cluster \item center : mean feature expression of all samples in the cluster \item dist : if applicable, relative cosine distance of samples in cluster to assigned/unassigned cluster center. \item leaf : is cluster a leaf node } } \description{ Recursive bipartitioning by rank-2 matrix factorization with an efficient modularity-approximate stopping criteria } \details{ Divisive clustering is a sensitive and fast method for sample classification. Samples are recursively partitioned into two groups until a stopping criteria is satisfied and prevents successful partitioning. See \code{\link{nmf}} and \code{\link{bipartition}} for technical considerations and optimizations relevant to bipartitioning. \strong{Stopping criteria}. Two stopping criteria are used to prevent indefinite division of clusters and tune the clustering resolution to a desirable range: \itemize{ \item \code{min_samples}: Minimum number of samples permitted in a cluster \item \code{min_dist}: Minimum cosine distance of samples to their cluster center relative to their unassigned cluster center (an approximation of Newman-Girvan modularity) } Newman-Girvan modularity (\eqn{Q}) is an interpretable and widely used measure of modularity for a bipartition. However, it requires the calculation of distance between all within-cluster and between-cluster sample pairs. This is computationally intensive, especially for large sample sets. \code{dclust} uses a measure which linearly approximates Newman-Girvan modularity, and simply requires the calculation of distance between all samples in a cluster and both cluster centers (the assigned and unassigned center), which is orders of magnitude faster to compute. Cosine distance is used instead of Euclidean distance since it handles outliers and sparsity well. A bipartition is rejected if either of the two clusters contains fewer than \code{min_samples} or if the mean relative cosine distance of the bipartition is less than \code{min_dist}. A bipartition will only be attempted if there are more than \code{2 * min_samples} samples in the cluster, meaning that \code{dist} may not be calculated for some clusters. \strong{Reproducibility.} Because rank-2 NMF is approximate and requires random initialization, results may vary slightly across restarts. Therefore, specify a \code{seed} to guarantee absolute reproducibility. Other than setting the seed, reproducibility may be improved by setting \code{tol} to a smaller number to increase the exactness of each bipartition. } \examples{ \dontrun{ library(Matrix) data(USArrests) A <- as(as.matrix(t(USArrests)), "dgCMatrix") clusters <- dclust(A, min_samples = 2, min_dist = 0.001) str(clusters) } } \references{ Schwartz, G. et al. "TooManyCells identifies and visualizes relationships of single-cell clades". Nature Methods (2020). Newman, MEJ. 
"Modularity and community structure in networks". PNAS (2006) Kuang, D, Park, H. (2013). "Fast rank-2 nonnegative matrix factorization for hierarchical document clustering." Proc. 19th ACM SIGKDD intl. conf. on Knowledge discovery and data mining. } \seealso{ \code{\link{bipartition}}, \code{\link{nmf}} } \author{ Zach DeBruine } RcppML/man/setRcppMLthreads.Rd0000644000176200001440000000242114122404674015700 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/threads.R \name{setRcppMLthreads} \alias{setRcppMLthreads} \title{Set the number of threads RcppML should use} \usage{ setRcppMLthreads(threads) } \arguments{ \item{threads}{number of threads to be used in RcppML functions that are parallelized with OpenMP.} } \description{ The number of threads is 0 by default (corresponding to all available threads), but can be set manually using this function. If you clear environment variables or affect the "RcppMLthreads" environment variable specifically, you will need to set your number of preferred threads again. } \details{ The number of threads set affects OpenMP parallelization only for functions in the RcppML package. It does not affect other R packages that use OpenMP. Parallelization is used for projection of linear factor models with rank > 2, calculation of mean squared error for linear factor models, and for divisive clustering. } \examples{ \dontrun{ # set serial configuration setRcppMLthreads(1) getRcppMLthreads() # restore default parallel configuration, # letting OpenMP decide how many threads to use setRcppMLthreads(0) getRcppMLthreads() } } \seealso{ \code{\link{getRcppMLthreads}} } \author{ Zach DeBruine } RcppML/man/mse.Rd0000644000176200001440000000373614122422775013254 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mse.R \name{mse} \alias{mse} \title{Mean Squared Error loss of a factor model} \usage{ mse(A, w, d = NULL, h, mask_zeros = FALSE) } \arguments{ \item{A}{matrix of features-by-samples in dense or sparse format (preferred classes are "matrix" or "Matrix::dgCMatrix", respectively). Prefer sparse storage when more than half of all values are zero.} \item{w}{dense matrix of class \code{matrix} with factors (columns) by features (rows)} \item{d}{diagonal scaling vector of rank length} \item{h}{dense matrix of class \code{matrix} with samples (columns) by factors (rows)} \item{mask_zeros}{handle zeros as missing values, available only when \code{A} is sparse} } \value{ mean squared error of the factorization model } \description{ MSE of factor models \code{w} and \code{h} given sparse matrix \eqn{A} } \details{ Mean squared error of a matrix factorization of the form \eqn{A = wdh} is given by \deqn{\frac{\sum_{i,j}{(A - wdh)^2}}{ij}} where \eqn{i} and \eqn{j} are the number of rows and columns in \eqn{A}. Thus, this function simply calculates the cross-product of \eqn{wh} or \eqn{wdh} (if \eqn{d} is specified), subtracts that from \eqn{A}, squares the result, and calculates the mean of all values. If no diagonal scaling vector is present in the model, input \code{d = rep(1, k)} where \code{k} is the rank of the model. \strong{Parallelization.} Calculation of mean squared error is performed in parallel across columns in \code{A} using the number of threads set by \code{\link{setRcppMLthreads}}. By default, all available threads are used, see \code{\link{getRcppMLthreads}}. 
} \examples{ \dontrun{ library(Matrix) A <- Matrix::rsparsematrix(1000, 1000, 0.1) model <- nmf(A, k = 10, tol = 0.01) c_mse <- mse(A, model$w, model$d, model$h) R_mse <- mean((A - model$w \%*\% Diagonal(x = model$d) \%*\% model$h)^2) all.equal(c_mse, R_mse) } } \author{ Zach DeBruine } RcppML/man/nnls.Rd0000644000176200001440000001120614111717072013424 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/RcppExports.R \name{nnls} \alias{nnls} \title{Non-negative least squares} \usage{ nnls(a, b, cd_maxit = 100L, cd_tol = 1e-08, fast_nnls = FALSE, L1 = 0) } \arguments{ \item{a}{symmetric positive definite matrix giving coefficients of the linear system} \item{b}{matrix giving the right-hand side(s) of the linear system} \item{cd_maxit}{maximum number of coordinate descent iterations} \item{cd_tol}{stopping criteria, difference in \eqn{x} across consecutive solutions over the sum of \eqn{x}} \item{fast_nnls}{initialize coordinate descent with a FAST NNLS approximation} \item{L1}{L1/LASSO penalty to be subtracted from \code{b}} } \value{ vector or matrix giving solution for \code{x} } \description{ Solves the equation \code{a \%*\% x = b} for \code{x} subject to \eqn{x > 0}. } \details{ This is a very fast implementation of non-negative least squares (NNLS), suitable for very small or very large systems. \strong{Algorithm}. Sequential coordinate descent (CD) is at the core of this implementation, and requires an initialization of \eqn{x}. There are two supported methods for initialization of \eqn{x}: \enumerate{ \item \strong{Zero-filled initialization} when \code{fast_nnls = FALSE} and \code{cd_maxit > 0}. This is generally very efficient for well-conditioned and small systems. \item \strong{Approximation with FAST} when \code{fast_nnls = TRUE}. Forward active set tuning (FAST), described below, finds an approximate active set using unconstrained least squares solutions found by Cholesky decomposition and substitution. To use only FAST approximation, set \code{cd_maxit = 0}. } \code{a} must be symmetric positive definite if FAST NNLS is used, but this is not checked. See our BioRXiv manuscript (references) for benchmarking against Lawson-Hanson NNLS and for a more technical introduction to these methods. \strong{Coordinate Descent NNLS}. Least squares by \strong{sequential coordinate descent} is used to ensure the solution returned is exact. This algorithm was introduced by Franc et al. (2005), and our implementation is a vectorized and optimized rendition of that found in the NNLM R package by Xihui Lin (2020). \strong{FAST NNLS.} Forward active set tuning (FAST) is an exact or near-exact NNLS approximation initialized by an unconstrained least squares solution. Negative values in this unconstrained solution are set to zero (the "active set"), and all other values are added to a "feasible set". An unconstrained least squares solution is then solved for the "feasible set", any negative values in the resulting solution are set to zero, and the process is repeated until the feasible set solution is strictly positive. The FAST algorithm has a definite convergence guarantee because the feasible set will either converge or become smaller with each iteration. The result is generally exact or nearly exact for small well-conditioned systems (< 50 variables) within 2 iterations and thus sets up coordinate descent for very rapid convergence. 
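For illustration, the sequential coordinate descent refinement amounts to the following (a rough R sketch of the algorithm, not the package's optimized C++ implementation; \code{cd_nnls} is a hypothetical helper name):
\preformatted{
cd_nnls <- function(a, b, maxit = 100, tol = 1e-8) {
  b <- drop(b)                              # treat "b" as the running negative gradient
  x <- numeric(length(b))                   # zero-filled initialization
  for (iter in seq_len(maxit)) {
    change <- 0
    for (i in seq_along(x)) {
      xi <- max(0, x[i] + b[i] / a[i, i])   # coordinate-wise non-negative update
      diff <- xi - x[i]
      if (diff != 0) {
        b <- b - a[, i] * diff              # keep the gradient current
        change <- change + abs(diff)
        x[i] <- xi
      }
    }
    if (change / (sum(x) + 1e-16) < tol) break
  }
  x
}
}
For a well-conditioned system with \code{a = crossprod(X)} and \code{b = crossprod(X, y)}, the result of this sketch should agree closely with \code{nnls(a, b)}.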
The FAST method is similar to the first phase of the so-called "TNT-NN" algorithm (Myre et al., 2017), but the latter half of that method relies heavily on heuristics to refine the approximate active set, which we avoid by using coordinate descent instead. } \examples{ \dontrun{ # compare solution to base::solve for a random system X <- matrix(runif(100), 10, 10) a <- crossprod(X) b <- crossprod(X, runif(10)) unconstrained_soln <- solve(a, b) nonneg_soln <- nnls(a, b) unconstrained_err <- mean((a \%*\% unconstrained_soln - b)^2) nonnegative_err <- mean((a \%*\% nonneg_soln - b)^2) unconstrained_err nonnegative_err all.equal(solve(a, b), nnls(a, b)) # example adapted from multiway::fnnls example 1 X <- matrix(1:100,50,2) y <- matrix(101:150,50,1) beta <- solve(crossprod(X)) \%*\% crossprod(X, y) beta beta <- nnls(crossprod(X), crossprod(X, y)) beta } } \references{ DeBruine, ZJ, Melcher, K, and Triche, TJ. (2021). "High-performance non-negative matrix factorization for large single-cell data." BioRXiv. Franc, VC, Hlavac, VC, and Navara, M. (2005). "Sequential Coordinate-Wise Algorithm for the Non-negative Least Squares Problem. Proc. Int'l Conf. Computer Analysis of Images and Patterns." Lin, X, and Boutros, PC (2020). "Optimization and expansion of non-negative matrix factorization." BMC Bioinformatics. Myre, JM, Frahm, E, Lilja DJ, and Saar, MO. (2017) "TNT-NN: A Fast Active Set Method for Solving Large Non-Negative Least Squares Problems". Proc. Computer Science. } \seealso{ \code{\link{nmf}}, \code{\link{project}} } \author{ Zach DeBruine } RcppML/DESCRIPTION0000644000176200001440000000167614122425662013133 0ustar liggesusersPackage: RcppML Type: Package Title: Rcpp Machine Learning Library Version: 0.3.7 Date: 2021-09-21 Authors@R: person("Zachary", "DeBruine", email = "zacharydebruine@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-2234-4827")) Description: Fast machine learning algorithms including matrix factorization and divisive clustering for large sparse and dense matrices. 
License: GPL (>= 2) Imports: Rcpp, Matrix, methods, stats LinkingTo: Rcpp, RcppEigen VignetteBuilder: knitr RoxygenNote: 7.1.1 Suggests: knitr, rmarkdown, testthat (>= 3.0.0) Config/testthat/edition: 3 URL: https://github.com/zdebruine/RcppML BugReports: https://github.com/zdebruine/RcppML/issues NeedsCompilation: yes Packaged: 2021-09-21 18:39:44 UTC; Owner Author: Zachary DeBruine [aut, cre] () Maintainer: Zachary DeBruine Repository: CRAN Date/Publication: 2021-09-21 19:00:02 UTC RcppML/build/0000755000176200001440000000000014122423360012503 5ustar liggesusersRcppML/build/vignette.rds0000644000176200001440000000033314122423360015041 0ustar liggesusersmP 0, FSCWJDҿ<;E^vvr !&%&0* U֛BU2E*SDJ]E |蕨= 0, TRUE) expect_equal(min(model$h) >= 0, TRUE) # test that non-negativity constraints are not applied when nonneg = FALSE (SPARSE) model <- nmf(A, 5, nonneg = FALSE, v = F) expect_equal(min(model$w) < 0, TRUE) expect_equal(min(model$h) < 0, TRUE) # test that diagonalization enforces symmetry (SPARSE) sim <- as(crossprod(A), "dgCMatrix") model <- nmf(sim, 5, diag = F, seed = 123, tol = 1e-6, v = F) model2 <- nmf(sim, 5, diag = T, seed = 123, tol = 1e-6, v = F) cor_model <- cor(as.vector(model$w), as.vector(t(model$h))) cor_model2 <- cor(as.vector(model2$w), as.vector(t(model2$h))) expect_lt(cor_model, cor_model2) expect_equal(mean(model$d), 1) # test that matrix factorization converges over time (SPARSE SYMMETRIC) A <- as(A, "dgCMatrix") model_bad <- nmf(A, 5, maxit = 2, tol = 1e-10, v = F) model_good <- nmf(A, 5, maxit = 20, tol = 1e-10, v = F) model_bad_mse <- mse(A, model_bad$w, model_bad$d, model_bad$h) model_good_mse <- mse(A, model_good$w, model_good$d, model_good$h) expect_lt(model_good_mse, model_bad_mse) # test that L1 regularization promotes factor sparsity (SPARSE) model_with_no_L1 <- nmf(A, 5, maxit = 5, L1 = c(0, 0), seed = 123, v = F) model_with_L1 <- nmf(A, 5, maxit = 5, L1 = c(0.1, 0.1), seed = 123, v = F) sparsity_with_no_L1 <- sum(model_with_no_L1$w == 0) sparsity_with_L1 <- sum(model_with_L1$w == 0) expect_lt(sparsity_with_no_L1, sparsity_with_L1) # test that setting the random seed gives identical models (SPARSE) A <- as(A, "dgCMatrix") model1 <- nmf(A, 5, maxit = 5, seed = 123, v = F) model1_repeat <- nmf(A, 5, maxit = 5, seed = 123, v = F) model2 <- nmf(A, 5, maxit = 5, seed = 234, v = F) expect_equal(all(model1$w == model1_repeat$w), TRUE) expect_equal(all(model1$w == model2$w), FALSE) # ###################################### # test rank-2 specialization # test that matrix factorization converges over time (SPARSE) A <- abs(Matrix::rsparsematrix(100, 100, 0.1)) model_bad <- nmf(A, 2, maxit = 1, tol = 1e-10, seed = 123, v = F) model_good <- nmf(A, 2, maxit = 20, tol = 1e-10, seed = 123, v = F) model_bad_mse <- mse(A, model_bad$w, model_bad$d, model_bad$h) model_good_mse <- mse(A, model_good$w, model_good$d, model_good$h) expect_lt(model_good_mse, model_bad_mse) # test that non-negativity constraints are applied (SPARSE) model <- model_bad expect_equal(min(model$w) >= 0, TRUE) expect_equal(min(model$h) >= 0, TRUE) # test that non-negativity constraints are not applied when nonneg = FALSE (SPARSE) model <- nmf(A, 2, nonneg = FALSE, v = F) expect_equal(min(model$w) < 0, TRUE) expect_equal(min(model$h) < 0, TRUE) # test that diagonalization enforces symmetry (SPARSE) sim <- as(crossprod(A), "dgCMatrix") model <- nmf(sim, 2, diag = F, seed = 123, tol = 1e-6, v = F) model2 <- nmf(sim, 2, diag = T, seed = 123, tol = 1e-6, v = F) cor_model <- cor(as.vector(model$w), 
as.vector(t(model$h))) cor_model2 <- cor(as.vector(model2$w), as.vector(t(model2$h))) expect_lt(cor_model, cor_model2) expect_equal(mean(model$d), 1) # test that setting the random seed gives identical models (SPARSE) A <- as(A, "dgCMatrix") model1 <- nmf(A, 2, maxit = 5, seed = 123, v = F) model1_repeat <- nmf(A, 2, maxit = 5, seed = 123, v = F) model2 <- nmf(A, 2, maxit = 5, seed = 234, v = F) expect_equal(all(model1$w == model1_repeat$w), TRUE) expect_equal(all(model1$w == model2$w), FALSE) })RcppML/tests/testthat/test_bipartition.R0000644000176200001440000000361714111717072020127 0ustar liggesuserstest_that("Testing RcppML::nmf", { setRcppMLthreads(1) A <- rbind(c(1.7009878, 0.00000000, 0.9542116, 0.68738015, 1.1690738012, 1.4190147, 0.8697167, 0.72520486, 0.00000000, 0.0000000), c(1.5901502, 0.61081441, 0.6841591, 0.00000000, 0.9110474099, 0.5809928, 0.0000000, 0.99028990, 0.01800186, 0.3915861), c(1.1629335, 2.13608402, 2.3780564, 2.71694620, 1.3074841130, 1.7009005, 0.0000000, 0.01130724, 1.59745476, 0.0000000), c(0.0000000, 0.00000000, 0.4145758, 0.01870036, 0.0000000000, 0.0000000, 0.0000000, 0.00000000, 0.00000000, 0.0000000), c(0.0000000, 0.90947564, 0.6896592, 0.00000000, 1.5718641979, 0.2359384, 0.9782686, 3.03143394, 0.00000000, 0.6474887), c(1.3965336, 0.00000000, 0.0000000, 0.00000000, 1.0002093844, 0.7561340, 0.0000000, 0.00000000, 0.89331994, 1.1115118), c(0.6743350, 0.08068094, 0.0000000, 1.40130937, 1.4421851828, 1.1075104, 0.1342477, 3.17786538, 2.56604624, 2.8281075), c(0.4790339, 0.89483168, 0.0000000, 0.33670458, 0.7380427447, 0.0000000, 2.9913736, 1.87699437, 2.86377032, 1.9204602), c(0.0000000, 0.00000000, 0.0000000, 1.30777912, 0.0000000000, 0.5205217, 2.2561151, 3.71541181, 1.19395162, 3.0003078), c(0.0000000, 0.00000000, 0.0000000, 0.00000000, 0.0001228284, 0.0000000, 3.3094624, 2.97985598, 2.85893710, 2.6126768)) samples1 <- 0:5 samples2 <- 6:9 size1 <- 4 size2 <- 6 center1 <- rowMeans(A[,samples1 + 1]) center2 <- rowMeans(A[,samples2 + 1]) # test that the bipartition is as expected (SPARSE) A <- as(A, "dgCMatrix") model <- bipartition(A, calc_dist = TRUE) expect_equal(model$dist, 0.3197266, tolerance = 1e-4) expect_equal(model$size1 == 4 || model$size1 == 6, TRUE) if(model$size1 == 6){ expect_equal(model$size2 == 4, TRUE) } else { expect_equal(model$size2 == 6, TRUE) } })RcppML/tests/testthat.R0000644000176200001440000000011514111717072014532 0ustar liggesuserslibrary(testthat) library(RcppML) library(Matrix) test_check("RcppML") RcppML/src/0000755000176200001440000000000014122423360012173 5ustar liggesusersRcppML/src/Makevars0000644000176200001440000000033014111717072013667 0ustar liggesusersPKG_CPPFLAGS = -I../inst/include/ PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -DEIGEN_INITIALIZE_MATRICES_BY_ZERO -DEIGEN_NO_DEBUG CXX_STD = CXX11 RcppML/src/Makevars.win0000644000176200001440000000033014111717072014463 0ustar liggesusersPKG_CPPFLAGS = -I../inst/include/ PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -DEIGEN_INITIALIZE_MATRICES_BY_ZERO -DEIGEN_NO_DEBUG CXX_STD = CXX11 RcppML/src/RcppFunctions.cpp0000644000176200001440000003351114122422740015500 0ustar liggesusers// This file is part of RcppML, a Rcpp Machine Learning library // // Copyright (C) 2021 Zach DeBruine // // This source code is subject to the terms of the GNU // Public License v. 2.0. 
#include // PROJECT LINEAR FACTOR MODELS //[[Rcpp::export]] Eigen::MatrixXd Rcpp_projectW_sparse(const Rcpp::S4& A, const Eigen::MatrixXd w, const bool nonneg, const double L1, const unsigned int threads, const bool mask_zeros) { RcppML::SparseMatrix A_(A); Eigen::MatrixXd h(w.rows(), A_.cols()); project(A_, w, h, nonneg, L1, threads, mask_zeros); return h; } //[[Rcpp::export]] Eigen::MatrixXd Rcpp_projectH_sparse(const Rcpp::S4& A, const Eigen::MatrixXd h, const bool nonneg, const double L1, const unsigned int threads, const bool mask_zeros) { RcppML::SparseMatrix A_(A); Eigen::MatrixXd w(h.rows(), A_.rows()); projectInPlace(A_, h, w, nonneg, L1, threads, mask_zeros); return w; } //[[Rcpp::export]] Eigen::MatrixXd Rcpp_projectW_dense(const Eigen::MatrixXd& A, const Eigen::MatrixXd w, const bool nonneg, const double L1, const unsigned int threads, const bool mask_zeros) { Eigen::MatrixXd h(w.rows(), A.cols()); project(A, w, h, nonneg, L1, threads, mask_zeros); return h; } //[[Rcpp::export]] Eigen::MatrixXd Rcpp_projectH_dense(Eigen::MatrixXd& A, const Eigen::MatrixXd h, const bool nonneg, const double L1, const unsigned int threads, const bool mask_zeros) { Eigen::MatrixXd w(h.rows(), A.rows()); projectInPlace(A, h, w, nonneg, L1, threads, mask_zeros); return w; } // MEAN SQUARED ERROR LOSS OF FACTORIZATION //[[Rcpp::export]] double Rcpp_mse_sparse(const Rcpp::S4& A, Eigen::MatrixXd& w, Eigen::VectorXd& d, Eigen::MatrixXd& h, const bool mask_zeros, const unsigned int threads) { RcppML::SparseMatrix A_(A); RcppML::MatrixFactorization m(w, d, h); m.threads = threads; m.mask_zeros = mask_zeros; return m.mse(A_); } //[[Rcpp::export]] double Rcpp_mse_dense(Eigen::MatrixXd& A, Eigen::MatrixXd& w, Eigen::VectorXd& d, Eigen::MatrixXd& h, const bool mask_zeros, const unsigned int threads) { RcppML::MatrixFactorization m(w, d, h); m.threads = threads; m.mask_zeros = mask_zeros; return m.mse(A); } // NON_NEGATIVE MATRIX FACTORIZATION //[[Rcpp::export]] Rcpp::List Rcpp_nmf_sparse(const Rcpp::S4& A, const unsigned int k, const double tol, const unsigned int maxit, const bool verbose, const bool nonneg, const Rcpp::NumericVector L1, const unsigned int seed, const bool diag, const bool mask_zeros, const unsigned int threads) { RcppML::SparseMatrix A_(A); RcppML::MatrixFactorization m(k, A_.rows(), A_.cols(), seed); // set model parameters m.tol = tol; m.updateInPlace = false; m.nonneg = nonneg; m.L1_w = L1(0); m.L1_h = L1(1); m.maxit = maxit; m.diag = diag; m.verbose = verbose; m.mask_zeros = mask_zeros; m.threads = threads; // fit the model by alternating least squares m.fit(A_); return Rcpp::List::create( Rcpp::Named("w") = m.matrixW().transpose(), Rcpp::Named("d") = m.vectorD(), Rcpp::Named("h") = m.matrixH(), Rcpp::Named("tol") = m.fit_tol(), Rcpp::Named("iter") = m.fit_iter()); } //[[Rcpp::export]] Rcpp::List Rcpp_nmf_dense(Eigen::MatrixXd& A, const unsigned int k, const double tol, const unsigned int maxit, const bool verbose, const bool nonneg, const Rcpp::NumericVector L1, const unsigned int seed, const bool diag, const bool mask_zeros, const unsigned int threads) { RcppML::MatrixFactorization m(k, A.rows(), A.cols(), seed); // set model parameters m.tol = tol; m.updateInPlace = false; m.nonneg = nonneg; m.L1_w = L1(0); m.L1_h = L1(1); m.maxit = maxit; m.diag = diag; m.verbose = verbose; m.mask_zeros = mask_zeros; m.threads = threads; // fit the model by alternating least squares m.fit(A); return Rcpp::List::create( Rcpp::Named("w") = m.matrixW().transpose(), Rcpp::Named("d") = m.vectorD(), 
Rcpp::Named("h") = m.matrixH(), Rcpp::Named("tol") = m.fit_tol(), Rcpp::Named("iter") = m.fit_iter()); } // BIPARTITION A SAMPLE SET BY RANK-2 NMF //[[Rcpp::export]] Rcpp::List Rcpp_bipartition_sparse(const Rcpp::S4& A, const double tol, const unsigned int maxit, const bool nonneg, Rcpp::IntegerVector samples, const unsigned int seed, const bool verbose = false, const bool calc_dist = false, const bool diag = true) { RcppML::SparseMatrix A_(A); Eigen::MatrixXd w = randomMatrix(2, A_.rows(), seed); const std::vector samples_ = Rcpp::as>(samples); bipartitionModel m = c_bipartition_sparse(A_, w, samples_, tol, nonneg, calc_dist, maxit, verbose); return Rcpp::List::create(Rcpp::Named("v") = m.v, Rcpp::Named("dist") = m.dist, Rcpp::Named("size1") = m.size1, Rcpp::Named("size2") = m.size2, Rcpp::Named("samples1") = m.samples1, Rcpp::Named("samples2") = m.samples2, Rcpp::Named("center1") = m.center1, Rcpp::Named("center2") = m.center2); } //[[Rcpp::export]] Rcpp::List Rcpp_bipartition_dense(const Eigen::MatrixXd& A, const double tol, const unsigned int maxit, const bool nonneg, Rcpp::IntegerVector samples, const unsigned int seed, const bool verbose = false, const bool calc_dist = false, const bool diag = true) { Eigen::MatrixXd w = randomMatrix(2, A.rows(), seed); const std::vector samples_ = Rcpp::as>(samples); bipartitionModel m = c_bipartition_dense(A, w, samples_, tol, nonneg, calc_dist, maxit, verbose); return Rcpp::List::create(Rcpp::Named("v") = m.v, Rcpp::Named("dist") = m.dist, Rcpp::Named("size1") = m.size1, Rcpp::Named("size2") = m.size2, Rcpp::Named("samples1") = m.samples1, Rcpp::Named("samples2") = m.samples2, Rcpp::Named("center1") = m.center1, Rcpp::Named("center2") = m.center2); } // DIVISIVE CLUSTERING BY RECURSIVE BIPARTITIONING //[[Rcpp::export]] Rcpp::List Rcpp_dclust_sparse(const Rcpp::S4& A, const unsigned int min_samples, const double min_dist, const bool verbose, const double tol, const unsigned int maxit, const bool nonneg, const unsigned int seed, const unsigned int threads) { RcppML::SparseMatrix A_(A); RcppML::clusterModel m = RcppML::clusterModel(A_, min_samples, min_dist); m.nonneg = nonneg; m.verbose = verbose; m.tol = tol; m.min_dist = min_dist; m.seed = seed; m.maxit = maxit; m.threads = threads; m.min_samples = min_samples; m.dclust(); std::vector clusters = m.getClusters(); Rcpp::List result(clusters.size()); for (unsigned int i = 0; i < clusters.size(); ++i) { result[i] = Rcpp::List::create(Rcpp::Named("id") = clusters[i].id, Rcpp::Named("samples") = clusters[i].samples, Rcpp::Named("center") = clusters[i].center, Rcpp::Named("dist") = clusters[i].dist, Rcpp::Named("leaf") = clusters[i].leaf); } return result; } //' @title Non-negative least squares //' //' @description Solves the equation \code{a %*% x = b} for \code{x} subject to \eqn{x > 0}. //' //' @details //' This is a very fast implementation of non-negative least squares (NNLS), suitable for very small or very large systems. //' //' **Algorithm**. Sequential coordinate descent (CD) is at the core of this implementation, and requires an initialization of \eqn{x}. There are two supported methods for initialization of \eqn{x}: //' 1. **Zero-filled initialization** when \code{fast_nnls = FALSE} and \code{cd_maxit > 0}. This is generally very efficient for well-conditioned and small systems. //' 2. **Approximation with FAST** when \code{fast_nnls = TRUE}. 
Forward active set tuning (FAST), described below, finds an approximate active set using unconstrained least squares solutions found by Cholesky decomposition and substitution. To use only FAST approximation, set \code{cd_maxit = 0}. //' //' \code{a} must be symmetric positive definite if FAST NNLS is used, but this is not checked. //' //' See our BioRXiv manuscript (references) for benchmarking against Lawson-Hanson NNLS and for a more technical introduction to these methods. //' //' **Coordinate Descent NNLS**. Least squares by **sequential coordinate descent** is used to ensure the solution returned is exact. This algorithm was //' introduced by Franc et al. (2005), and our implementation is a vectorized and optimized rendition of that found in the NNLM R package by Xihui Lin (2020). //' //' **FAST NNLS.** Forward active set tuning (FAST) is an exact or near-exact NNLS approximation initialized by an unconstrained //' least squares solution. Negative values in this unconstrained solution are set to zero (the "active set"), and all //' other values are added to a "feasible set". An unconstrained least squares solution is then solved for the //' "feasible set", any negative values in the resulting solution are set to zero, and the process is repeated until //' the feasible set solution is strictly positive. //' //' The FAST algorithm has a definite convergence guarantee because the //' feasible set will either converge or become smaller with each iteration. The result is generally exact or nearly //' exact for small well-conditioned systems (< 50 variables) within 2 iterations and thus sets up coordinate //' descent for very rapid convergence. The FAST method is similar to the first phase of the so-called "TNT-NN" algorithm (Myre et al., 2017), //' but the latter half of that method relies heavily on heuristics to refine the approximate active set, which we avoid by using //' coordinate descent instead. //' //' @param a symmetric positive definite matrix giving coefficients of the linear system //' @param b matrix giving the right-hand side(s) of the linear system //' @param L1 L1/LASSO penalty to be subtracted from \code{b} //' @param fast_nnls initialize coordinate descent with a FAST NNLS approximation //' @param cd_maxit maximum number of coordinate descent iterations //' @param cd_tol stopping criteria, difference in \eqn{x} across consecutive solutions over the sum of \eqn{x} //' @return vector or matrix giving solution for \code{x} //' @export //' @author Zach DeBruine //' @seealso \code{\link{nmf}}, \code{\link{project}} //' @md //' //' @references //' //' DeBruine, ZJ, Melcher, K, and Triche, TJ. (2021). "High-performance non-negative matrix factorization for large single-cell data." BioRXiv. //' //' Franc, VC, Hlavac, VC, and Navara, M. (2005). "Sequential Coordinate-Wise Algorithm for the Non-negative Least Squares Problem. Proc. Int'l Conf. Computer Analysis of Images and Patterns." //' //' Lin, X, and Boutros, PC (2020). "Optimization and expansion of non-negative matrix factorization." BMC Bioinformatics. //' //' Myre, JM, Frahm, E, Lilja DJ, and Saar, MO. (2017) "TNT-NN: A Fast Active Set Method for Solving Large Non-Negative Least Squares Problems". Proc. Computer Science. 
//' //' @examples //' \dontrun{ //' # compare solution to base::solve for a random system //' X <- matrix(runif(100), 10, 10) //' a <- crossprod(X) //' b <- crossprod(X, runif(10)) //' unconstrained_soln <- solve(a, b) //' nonneg_soln <- nnls(a, b) //' unconstrained_err <- mean((a %*% unconstrained_soln - b)^2) //' nonnegative_err <- mean((a %*% nonneg_soln - b)^2) //' unconstrained_err //' nonnegative_err //' all.equal(solve(a, b), nnls(a, b)) //' //' # example adapted from multiway::fnnls example 1 //' X <- matrix(1:100,50,2) //' y <- matrix(101:150,50,1) //' beta <- solve(crossprod(X)) %*% crossprod(X, y) //' beta //' beta <- nnls(crossprod(X), crossprod(X, y)) //' beta //' } //[[Rcpp::export]] Eigen::MatrixXd nnls(const Eigen::MatrixXd& a, Eigen::MatrixXd b, unsigned int cd_maxit = 100, const double cd_tol = 1e-8, const bool fast_nnls = false, const double L1 = 0) { if (a.rows() != a.cols()) Rcpp::stop("'a' is not symmetric"); if (a.rows() != b.rows()) Rcpp::stop("dimensions of 'b' and 'a' are not compatible!"); if (L1 != 0) b.array() -= L1; Eigen::LLT a_llt; Eigen::MatrixXd x(b.rows(), b.cols()); if (fast_nnls) a_llt = a.llt(); for (unsigned int col = 0; col < b.cols(); ++col) { if (fast_nnls) { // initialize with unconstrained least squares solution x.col(col) = a_llt.solve(b.col(col)); // iterative feasible set reduction while unconstrained least squares solutions at feasible indices contain negative values while ((x.col(col).array() < 0).any()) { Eigen::VectorXi gtz_ind = find_gtz(x, col); // get indices in "x" greater than zero (the "feasible set") Eigen::VectorXd bsub = subvec(b, gtz_ind, col); // subset "a" and "b" to those indices in the feasible set Eigen::MatrixXd asub = submat(a, gtz_ind, gtz_ind); Eigen::VectorXd xsub = asub.llt().solve(bsub); // solve for those indices in "x" x.setZero(); for (unsigned int i = 0; i < gtz_ind.size(); ++i) x(gtz_ind(i), col) = xsub(i); } b.col(col) -= a * x.col(col); // adjust gradient for current solution } // refine FAST solution by coordinate descent, or find solution from zero-initialized "x" matrix if (cd_maxit > 0) { double tol = 1; for (unsigned int it = 0; it < cd_maxit && (tol / b.rows()) > cd_tol; ++it) { tol = 0; for (unsigned int i = 0; i < b.rows(); ++i) { double diff = b(i, col) / a(i, i); if (-diff > x(i, col)) { if (x(i, col) != 0) { b.col(col) -= a.col(i) * -x(i, col); tol = 1; x(i, col) = 0; } } else if (diff != 0) { x(i, col) += diff; b.col(col) -= a.col(i) * diff; tol += std::abs(diff / (x(i, col) + TINY_NUM)); } } } } } return x; } RcppML/src/RcppExports.cpp0000644000176200001440000003652314122422743015205 0ustar liggesusers// Generated by using Rcpp::compileAttributes() -> do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #include #include using namespace Rcpp; #ifdef RCPP_USE_GLOBAL_ROSTREAM Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // Rcpp_projectW_sparse Eigen::MatrixXd Rcpp_projectW_sparse(const Rcpp::S4& A, const Eigen::MatrixXd w, const bool nonneg, const double L1, const unsigned int threads, const bool mask_zeros); RcppExport SEXP _RcppML_Rcpp_projectW_sparse(SEXP ASEXP, SEXP wSEXP, SEXP nonnegSEXP, SEXP L1SEXP, SEXP threadsSEXP, SEXP mask_zerosSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Rcpp::S4& >::type A(ASEXP); Rcpp::traits::input_parameter< const Eigen::MatrixXd >::type w(wSEXP); Rcpp::traits::input_parameter< const bool >::type 
nonneg(nonnegSEXP); Rcpp::traits::input_parameter< const double >::type L1(L1SEXP); Rcpp::traits::input_parameter< const unsigned int >::type threads(threadsSEXP); Rcpp::traits::input_parameter< const bool >::type mask_zeros(mask_zerosSEXP); rcpp_result_gen = Rcpp::wrap(Rcpp_projectW_sparse(A, w, nonneg, L1, threads, mask_zeros)); return rcpp_result_gen; END_RCPP } // Rcpp_projectH_sparse Eigen::MatrixXd Rcpp_projectH_sparse(const Rcpp::S4& A, const Eigen::MatrixXd h, const bool nonneg, const double L1, const unsigned int threads, const bool mask_zeros); RcppExport SEXP _RcppML_Rcpp_projectH_sparse(SEXP ASEXP, SEXP hSEXP, SEXP nonnegSEXP, SEXP L1SEXP, SEXP threadsSEXP, SEXP mask_zerosSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Rcpp::S4& >::type A(ASEXP); Rcpp::traits::input_parameter< const Eigen::MatrixXd >::type h(hSEXP); Rcpp::traits::input_parameter< const bool >::type nonneg(nonnegSEXP); Rcpp::traits::input_parameter< const double >::type L1(L1SEXP); Rcpp::traits::input_parameter< const unsigned int >::type threads(threadsSEXP); Rcpp::traits::input_parameter< const bool >::type mask_zeros(mask_zerosSEXP); rcpp_result_gen = Rcpp::wrap(Rcpp_projectH_sparse(A, h, nonneg, L1, threads, mask_zeros)); return rcpp_result_gen; END_RCPP } // Rcpp_projectW_dense Eigen::MatrixXd Rcpp_projectW_dense(const Eigen::MatrixXd& A, const Eigen::MatrixXd w, const bool nonneg, const double L1, const unsigned int threads, const bool mask_zeros); RcppExport SEXP _RcppML_Rcpp_projectW_dense(SEXP ASEXP, SEXP wSEXP, SEXP nonnegSEXP, SEXP L1SEXP, SEXP threadsSEXP, SEXP mask_zerosSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Eigen::MatrixXd& >::type A(ASEXP); Rcpp::traits::input_parameter< const Eigen::MatrixXd >::type w(wSEXP); Rcpp::traits::input_parameter< const bool >::type nonneg(nonnegSEXP); Rcpp::traits::input_parameter< const double >::type L1(L1SEXP); Rcpp::traits::input_parameter< const unsigned int >::type threads(threadsSEXP); Rcpp::traits::input_parameter< const bool >::type mask_zeros(mask_zerosSEXP); rcpp_result_gen = Rcpp::wrap(Rcpp_projectW_dense(A, w, nonneg, L1, threads, mask_zeros)); return rcpp_result_gen; END_RCPP } // Rcpp_projectH_dense Eigen::MatrixXd Rcpp_projectH_dense(Eigen::MatrixXd& A, const Eigen::MatrixXd h, const bool nonneg, const double L1, const unsigned int threads, const bool mask_zeros); RcppExport SEXP _RcppML_Rcpp_projectH_dense(SEXP ASEXP, SEXP hSEXP, SEXP nonnegSEXP, SEXP L1SEXP, SEXP threadsSEXP, SEXP mask_zerosSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< Eigen::MatrixXd& >::type A(ASEXP); Rcpp::traits::input_parameter< const Eigen::MatrixXd >::type h(hSEXP); Rcpp::traits::input_parameter< const bool >::type nonneg(nonnegSEXP); Rcpp::traits::input_parameter< const double >::type L1(L1SEXP); Rcpp::traits::input_parameter< const unsigned int >::type threads(threadsSEXP); Rcpp::traits::input_parameter< const bool >::type mask_zeros(mask_zerosSEXP); rcpp_result_gen = Rcpp::wrap(Rcpp_projectH_dense(A, h, nonneg, L1, threads, mask_zeros)); return rcpp_result_gen; END_RCPP } // Rcpp_mse_sparse double Rcpp_mse_sparse(const Rcpp::S4& A, Eigen::MatrixXd& w, Eigen::VectorXd& d, Eigen::MatrixXd& h, const bool mask_zeros, const unsigned int threads); RcppExport SEXP _RcppML_Rcpp_mse_sparse(SEXP ASEXP, SEXP wSEXP, SEXP dSEXP, SEXP hSEXP, SEXP mask_zerosSEXP, SEXP 
threadsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Rcpp::S4& >::type A(ASEXP); Rcpp::traits::input_parameter< Eigen::MatrixXd& >::type w(wSEXP); Rcpp::traits::input_parameter< Eigen::VectorXd& >::type d(dSEXP); Rcpp::traits::input_parameter< Eigen::MatrixXd& >::type h(hSEXP); Rcpp::traits::input_parameter< const bool >::type mask_zeros(mask_zerosSEXP); Rcpp::traits::input_parameter< const unsigned int >::type threads(threadsSEXP); rcpp_result_gen = Rcpp::wrap(Rcpp_mse_sparse(A, w, d, h, mask_zeros, threads)); return rcpp_result_gen; END_RCPP } // Rcpp_mse_dense double Rcpp_mse_dense(Eigen::MatrixXd& A, Eigen::MatrixXd& w, Eigen::VectorXd& d, Eigen::MatrixXd& h, const bool mask_zeros, const unsigned int threads); RcppExport SEXP _RcppML_Rcpp_mse_dense(SEXP ASEXP, SEXP wSEXP, SEXP dSEXP, SEXP hSEXP, SEXP mask_zerosSEXP, SEXP threadsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< Eigen::MatrixXd& >::type A(ASEXP); Rcpp::traits::input_parameter< Eigen::MatrixXd& >::type w(wSEXP); Rcpp::traits::input_parameter< Eigen::VectorXd& >::type d(dSEXP); Rcpp::traits::input_parameter< Eigen::MatrixXd& >::type h(hSEXP); Rcpp::traits::input_parameter< const bool >::type mask_zeros(mask_zerosSEXP); Rcpp::traits::input_parameter< const unsigned int >::type threads(threadsSEXP); rcpp_result_gen = Rcpp::wrap(Rcpp_mse_dense(A, w, d, h, mask_zeros, threads)); return rcpp_result_gen; END_RCPP } // Rcpp_nmf_sparse Rcpp::List Rcpp_nmf_sparse(const Rcpp::S4& A, const unsigned int k, const double tol, const unsigned int maxit, const bool verbose, const bool nonneg, const Rcpp::NumericVector L1, const unsigned int seed, const bool diag, const bool mask_zeros, const unsigned int threads); RcppExport SEXP _RcppML_Rcpp_nmf_sparse(SEXP ASEXP, SEXP kSEXP, SEXP tolSEXP, SEXP maxitSEXP, SEXP verboseSEXP, SEXP nonnegSEXP, SEXP L1SEXP, SEXP seedSEXP, SEXP diagSEXP, SEXP mask_zerosSEXP, SEXP threadsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Rcpp::S4& >::type A(ASEXP); Rcpp::traits::input_parameter< const unsigned int >::type k(kSEXP); Rcpp::traits::input_parameter< const double >::type tol(tolSEXP); Rcpp::traits::input_parameter< const unsigned int >::type maxit(maxitSEXP); Rcpp::traits::input_parameter< const bool >::type verbose(verboseSEXP); Rcpp::traits::input_parameter< const bool >::type nonneg(nonnegSEXP); Rcpp::traits::input_parameter< const Rcpp::NumericVector >::type L1(L1SEXP); Rcpp::traits::input_parameter< const unsigned int >::type seed(seedSEXP); Rcpp::traits::input_parameter< const bool >::type diag(diagSEXP); Rcpp::traits::input_parameter< const bool >::type mask_zeros(mask_zerosSEXP); Rcpp::traits::input_parameter< const unsigned int >::type threads(threadsSEXP); rcpp_result_gen = Rcpp::wrap(Rcpp_nmf_sparse(A, k, tol, maxit, verbose, nonneg, L1, seed, diag, mask_zeros, threads)); return rcpp_result_gen; END_RCPP } // Rcpp_nmf_dense Rcpp::List Rcpp_nmf_dense(Eigen::MatrixXd& A, const unsigned int k, const double tol, const unsigned int maxit, const bool verbose, const bool nonneg, const Rcpp::NumericVector L1, const unsigned int seed, const bool diag, const bool mask_zeros, const unsigned int threads); RcppExport SEXP _RcppML_Rcpp_nmf_dense(SEXP ASEXP, SEXP kSEXP, SEXP tolSEXP, SEXP maxitSEXP, SEXP verboseSEXP, SEXP nonnegSEXP, SEXP L1SEXP, SEXP seedSEXP, SEXP diagSEXP, SEXP mask_zerosSEXP, 
SEXP threadsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< Eigen::MatrixXd& >::type A(ASEXP); Rcpp::traits::input_parameter< const unsigned int >::type k(kSEXP); Rcpp::traits::input_parameter< const double >::type tol(tolSEXP); Rcpp::traits::input_parameter< const unsigned int >::type maxit(maxitSEXP); Rcpp::traits::input_parameter< const bool >::type verbose(verboseSEXP); Rcpp::traits::input_parameter< const bool >::type nonneg(nonnegSEXP); Rcpp::traits::input_parameter< const Rcpp::NumericVector >::type L1(L1SEXP); Rcpp::traits::input_parameter< const unsigned int >::type seed(seedSEXP); Rcpp::traits::input_parameter< const bool >::type diag(diagSEXP); Rcpp::traits::input_parameter< const bool >::type mask_zeros(mask_zerosSEXP); Rcpp::traits::input_parameter< const unsigned int >::type threads(threadsSEXP); rcpp_result_gen = Rcpp::wrap(Rcpp_nmf_dense(A, k, tol, maxit, verbose, nonneg, L1, seed, diag, mask_zeros, threads)); return rcpp_result_gen; END_RCPP } // Rcpp_bipartition_sparse Rcpp::List Rcpp_bipartition_sparse(const Rcpp::S4& A, const double tol, const unsigned int maxit, const bool nonneg, Rcpp::IntegerVector samples, const unsigned int seed, const bool verbose, const bool calc_dist, const bool diag); RcppExport SEXP _RcppML_Rcpp_bipartition_sparse(SEXP ASEXP, SEXP tolSEXP, SEXP maxitSEXP, SEXP nonnegSEXP, SEXP samplesSEXP, SEXP seedSEXP, SEXP verboseSEXP, SEXP calc_distSEXP, SEXP diagSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Rcpp::S4& >::type A(ASEXP); Rcpp::traits::input_parameter< const double >::type tol(tolSEXP); Rcpp::traits::input_parameter< const unsigned int >::type maxit(maxitSEXP); Rcpp::traits::input_parameter< const bool >::type nonneg(nonnegSEXP); Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type samples(samplesSEXP); Rcpp::traits::input_parameter< const unsigned int >::type seed(seedSEXP); Rcpp::traits::input_parameter< const bool >::type verbose(verboseSEXP); Rcpp::traits::input_parameter< const bool >::type calc_dist(calc_distSEXP); Rcpp::traits::input_parameter< const bool >::type diag(diagSEXP); rcpp_result_gen = Rcpp::wrap(Rcpp_bipartition_sparse(A, tol, maxit, nonneg, samples, seed, verbose, calc_dist, diag)); return rcpp_result_gen; END_RCPP } // Rcpp_bipartition_dense Rcpp::List Rcpp_bipartition_dense(const Eigen::MatrixXd& A, const double tol, const unsigned int maxit, const bool nonneg, Rcpp::IntegerVector samples, const unsigned int seed, const bool verbose, const bool calc_dist, const bool diag); RcppExport SEXP _RcppML_Rcpp_bipartition_dense(SEXP ASEXP, SEXP tolSEXP, SEXP maxitSEXP, SEXP nonnegSEXP, SEXP samplesSEXP, SEXP seedSEXP, SEXP verboseSEXP, SEXP calc_distSEXP, SEXP diagSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Eigen::MatrixXd& >::type A(ASEXP); Rcpp::traits::input_parameter< const double >::type tol(tolSEXP); Rcpp::traits::input_parameter< const unsigned int >::type maxit(maxitSEXP); Rcpp::traits::input_parameter< const bool >::type nonneg(nonnegSEXP); Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type samples(samplesSEXP); Rcpp::traits::input_parameter< const unsigned int >::type seed(seedSEXP); Rcpp::traits::input_parameter< const bool >::type verbose(verboseSEXP); Rcpp::traits::input_parameter< const bool >::type calc_dist(calc_distSEXP); Rcpp::traits::input_parameter< const bool >::type 
diag(diagSEXP); rcpp_result_gen = Rcpp::wrap(Rcpp_bipartition_dense(A, tol, maxit, nonneg, samples, seed, verbose, calc_dist, diag)); return rcpp_result_gen; END_RCPP } // Rcpp_dclust_sparse Rcpp::List Rcpp_dclust_sparse(const Rcpp::S4& A, const unsigned int min_samples, const double min_dist, const bool verbose, const double tol, const unsigned int maxit, const bool nonneg, const unsigned int seed, const unsigned int threads); RcppExport SEXP _RcppML_Rcpp_dclust_sparse(SEXP ASEXP, SEXP min_samplesSEXP, SEXP min_distSEXP, SEXP verboseSEXP, SEXP tolSEXP, SEXP maxitSEXP, SEXP nonnegSEXP, SEXP seedSEXP, SEXP threadsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Rcpp::S4& >::type A(ASEXP); Rcpp::traits::input_parameter< const unsigned int >::type min_samples(min_samplesSEXP); Rcpp::traits::input_parameter< const double >::type min_dist(min_distSEXP); Rcpp::traits::input_parameter< const bool >::type verbose(verboseSEXP); Rcpp::traits::input_parameter< const double >::type tol(tolSEXP); Rcpp::traits::input_parameter< const unsigned int >::type maxit(maxitSEXP); Rcpp::traits::input_parameter< const bool >::type nonneg(nonnegSEXP); Rcpp::traits::input_parameter< const unsigned int >::type seed(seedSEXP); Rcpp::traits::input_parameter< const unsigned int >::type threads(threadsSEXP); rcpp_result_gen = Rcpp::wrap(Rcpp_dclust_sparse(A, min_samples, min_dist, verbose, tol, maxit, nonneg, seed, threads)); return rcpp_result_gen; END_RCPP } // nnls Eigen::MatrixXd nnls(const Eigen::MatrixXd& a, Eigen::MatrixXd b, unsigned int cd_maxit, const double cd_tol, const bool fast_nnls, const double L1); RcppExport SEXP _RcppML_nnls(SEXP aSEXP, SEXP bSEXP, SEXP cd_maxitSEXP, SEXP cd_tolSEXP, SEXP fast_nnlsSEXP, SEXP L1SEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Eigen::MatrixXd& >::type a(aSEXP); Rcpp::traits::input_parameter< Eigen::MatrixXd >::type b(bSEXP); Rcpp::traits::input_parameter< unsigned int >::type cd_maxit(cd_maxitSEXP); Rcpp::traits::input_parameter< const double >::type cd_tol(cd_tolSEXP); Rcpp::traits::input_parameter< const bool >::type fast_nnls(fast_nnlsSEXP); Rcpp::traits::input_parameter< const double >::type L1(L1SEXP); rcpp_result_gen = Rcpp::wrap(nnls(a, b, cd_maxit, cd_tol, fast_nnls, L1)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { {"_RcppML_Rcpp_projectW_sparse", (DL_FUNC) &_RcppML_Rcpp_projectW_sparse, 6}, {"_RcppML_Rcpp_projectH_sparse", (DL_FUNC) &_RcppML_Rcpp_projectH_sparse, 6}, {"_RcppML_Rcpp_projectW_dense", (DL_FUNC) &_RcppML_Rcpp_projectW_dense, 6}, {"_RcppML_Rcpp_projectH_dense", (DL_FUNC) &_RcppML_Rcpp_projectH_dense, 6}, {"_RcppML_Rcpp_mse_sparse", (DL_FUNC) &_RcppML_Rcpp_mse_sparse, 6}, {"_RcppML_Rcpp_mse_dense", (DL_FUNC) &_RcppML_Rcpp_mse_dense, 6}, {"_RcppML_Rcpp_nmf_sparse", (DL_FUNC) &_RcppML_Rcpp_nmf_sparse, 11}, {"_RcppML_Rcpp_nmf_dense", (DL_FUNC) &_RcppML_Rcpp_nmf_dense, 11}, {"_RcppML_Rcpp_bipartition_sparse", (DL_FUNC) &_RcppML_Rcpp_bipartition_sparse, 9}, {"_RcppML_Rcpp_bipartition_dense", (DL_FUNC) &_RcppML_Rcpp_bipartition_dense, 9}, {"_RcppML_Rcpp_dclust_sparse", (DL_FUNC) &_RcppML_Rcpp_dclust_sparse, 9}, {"_RcppML_nnls", (DL_FUNC) &_RcppML_nnls, 6}, {NULL, NULL, 0} }; RcppExport void R_init_RcppML(DllInfo *dll) { R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); } RcppML/vignettes/0000755000176200001440000000000014122423360013414 
5ustar liggesusersRcppML/vignettes/RcppML.Rmd0000644000176200001440000001060414111717072015222 0ustar liggesusers--- title: "Introduction to the RcppML package" author: "Zach DeBruine" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Introduction to the RcppML package} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` The 'RcppML' package provides high-performance machine learning algorithms using Rcpp with a focus on matrix factorization. ## Installation Install the latest development version of RcppML from github: ```{R, eval = FALSE} library(devtools) install_github("zdebruine/RcppML") ``` ```{R} library(RcppML) library(Matrix) ``` ## Non-Negative Least Squares RcppML contains extremely fast NNLS solvers. Use the `nnls` function to solve systems of equations subject to non-negativity constraints. The `RcppML::nnls` function solves the equation \eqn{ax = b} for \eqn{x} where \eqn{a} is a symmetric positive definite matrix of dimensions \eqn{m x m} and \eqn{b} is a vector of length \eqn{m} or a matrix of dimensions \eqn{m x n}. ```{R} # construct a system of equations X <- matrix(rnorm(2000),100,20) btrue <- runif(20) y <- X %*% btrue + rnorm(100) a <- crossprod(X) b <- crossprod(X, y) # solve the system of equations x <- RcppML::nnls(a, b) # use only coordinate descent x <- RcppML::nnls(a, b, fast_nnls = FALSE, cd_maxit = 1000, cd_tol = 1e-8) ``` `RcppML::nnls` implements a new and fastest-in-class algorithm for non-negative least squares: 1. *initialization* is done by solving for the unconstrained least squares solution. 2. *forward active set tuning* (FAST) provides a near-exact solution (often exact for well-conditioned systems) by setting all negative values in the unconstrained solution to zero, re-solving the system for only positive values, and repeating the process until the solution for values not constrained to zero is strictly positive. Set `cd_maxit = 0` to use only the FAST solver. 3. *Coordinate descent* refines the FAST solution and finds the best solution discoverable by gradient descent. The coordinate descent solution is only used if it gives a better error than the FAST solution. Generally, coordinate descent re-introduces variables constrained to zero by FAST back into the feasible set, but does not dramatically change the solution. ## Projecting Linear Models Project dense linear factor models onto real-valued sparse matrices (or any matrix coercible to `Matrix::dgCMatrix`) using `RcppML::project`. `RcppML::project` solves the equation \eqn{A = WH} for \eqn{H}. ```{R} # simulate a sparse matrix A <- rsparsematrix(1000, 100, 0.1) # simulate a linear factor model w <- matrix(runif(1000 * 10), 1000, 10) # project the model h <- RcppML::project(A, w) ``` ## Non-negative Matrix Factorization `RcppML::nmf` finds a non-negative matrix factorization by alternating least squares (alternating projections of linear models \eqn{w} and \eqn{h}).
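To make the idea of alternating projections concrete, the sketch below runs a few manual update cycles with `RcppML::project` (an illustration only; `RcppML::nmf` performs these updates in optimized C++ and additionally rescales factors by a diagonal):

```{R}
# a small random example for illustration
A <- rsparsematrix(100, 80, 0.1)
w <- matrix(runif(100 * 5), 100, 5)   # random initialization of "w"
for (iter in 1:3) {
  h <- RcppML::project(A, w = w)      # update "h" given "w"
  w <- t(RcppML::project(A, h = h))   # update "w" given "h"
}
```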
There are several ways in which the NMF algorithm differs from other currently available methods: * Diagonalized scaling of factors to sum to 1, permitting convex L1 regularization along the entire solution path * Fast stopping criteria, based on correlation between models across consecutive iterations * Extremely fast algorithms using the Eigen C++ library, optimized for matrices that are >90% sparse * Support for NMF or unconstrained matrix factorization * Parallelized using OpenMP multithreading The following example runs rank-10 NMF on a random 100 x 100 matrix that is 90% sparse: ```{R} A <- rsparsematrix(100, 100, 0.1) model <- RcppML::nmf(A, 10, verbose = F) w <- model$w d <- model$d h <- model$h model_tolerance <- tail(model$tol, 1) ``` Tolerance is simply a measure of the average correlation between \eqn{w_{i-1}} and \eqn{w_i} and between \eqn{h_{i-1}} and \eqn{h_i} for a given iteration \eqn{i}. For symmetric factorizations (when \eqn{A} is symmetric, which is detected automatically), tolerance becomes a measure of the correlation between \eqn{w_i} and \eqn{h_i}, and diagonalization is automatically performed to enforce symmetry: ```{R} A_sym <- as(crossprod(A), "dgCMatrix") model <- RcppML::nmf(A_sym, 10, verbose = F) ``` Mean squared error of a factorization can be calculated for a given model using the `RcppML::mse` function: ```{R} RcppML::mse(A_sym, model$w, model$d, model$h) ``` RcppML/R/0000755000176200001440000000000014122423332011604 5ustar liggesusersRcppML/R/RcppML.R0000644000176200001440000000073414122412737013077 0ustar liggesusers#' RcppML: Rcpp Machine Learning Library #' #' @description #' High-performance non-negative matrix factorization and linear model projection for sparse matrices, and fast non-negative least squares implementations #' #' @import knitr Matrix RcppEigen #' @importFrom Rcpp evalCpp #' @importFrom methods as canCoerce #' @useDynLib RcppML, .registration = TRUE #' @docType package #' @name RcppML #' @author Zach DeBruine #' @aliases RcppML-package #' @md #' NULLRcppML/R/nmf.R0000644000176200001440000002455514122422734012527 0ustar liggesusers#' @title Non-negative matrix factorization #' #' @description Sparse matrix factorization of the form \eqn{A = wdh} by alternating least squares with optional non-negativity constraints. #' #' @details #' This fast non-negative matrix factorization (NMF) implementation decomposes a matrix \eqn{A} into lower-rank #' non-negative matrices \eqn{w} and \eqn{h}, with factors scaled to sum to 1 via multiplication by a diagonal, \eqn{d}: \deqn{A = wdh} #' #' The scaling diagonal enables symmetric factorization, convex L1 regularization, and consistent factor scalings regardless of random initialization. #' #' The factorization model is randomly initialized, and \eqn{w} and \eqn{h} are updated alternately using least squares. #' Given \eqn{A} and \eqn{w}, \eqn{h} is updated according to the equation: \deqn{w^Twh = wA_j} #' #' This equation is in the form \eqn{ax = b} where \eqn{a = w^Tw}, \eqn{x = h}, and \eqn{b = wA_j} for all columns \eqn{j} in \eqn{A}. #' #' The corresponding update for \eqn{w} is \deqn{hh^Tw = hA^T_j} #' #' **Stopping criteria.** Alternating least squares projections (see \code{\link{project}} subroutine) are repeated until a stopping criterion is satisfied, which is either a maximum number of #' iterations or a tolerance based on the correlation distance between models (\eqn{1 - cor(w_i, w_{i-1})}) across consecutive iterations.
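#'
#' For intuition, the tolerance between two consecutive models can be sketched in R as follows (an illustrative approximation of the correlation distance; the exported implementation computes this in C++):
#' ```
#' model_tolerance <- function(w_previous, w_current) {
#'   1 - cor(as.vector(w_previous), as.vector(w_current))
#' }
#' ```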
Use the \code{tol} parameter to control the stopping criteria for alternating updates: #' * \code{tol = 1e-2} is appropriate for approximate mean squared error determination and coarse cross-validation, useful for rank determination. #' * \code{tol = 1e-3} to \code{1e-4} are suitable for rapid experimentation, cross-validation, and preliminary analysis. #' * \code{tol = 1e-5} and smaller for publication-quality runs. #' * \code{tol = 1e-10} and smaller for robust factorizations at or near machine-precision. #' #' **Parallelization.** Least squares projections in factorizations of rank-3 and greater are parallelized using the number of threads set by \code{\link{setRcppMLthreads}}. #' By default, all available threads are used, see \code{\link{getRcppMLthreads}}. #' The overhead of parallelization is too great to benefit rank-1 and rank-2 factorization. #' #' **Specializations.** There are specializations for symmetric matrices, and for rank-1 and rank-2 factorization. #' #' **L1 regularization**. L1 penalization increases the sparsity of factors, but does not change the information content of the model #' or the relative contributions of the leading coefficients in each factor to the model. L1 regularization only slightly increases the error of a model. #' L1 penalization should be used to aid interpretability. Penalty values should range from 0 to 1, where 1 gives complete sparsity. In this implementation of NMF, #' a scaling diagonal ensures that the L1 penalty is equally applied across all factors regardless of random initialization and the distribution of the model. #' Many other implementations of matrix factorization claim to apply L1, but the magnitude of the penalty is at the mercy of the random distribution and #' more significantly affects factors with lower overall contribution to the model. L1 regularization of rank-1 and rank-2 factorizations has no effect. #' #' **Rank-2 factorization.** When \eqn{k = 2}, a very fast optimized algorithm is used. Two-variable least squares solutions to the problem \eqn{ax = b} are found by direct substitution: #' \deqn{x_1 = \frac{a_{22}b_1 - a_{12}b_2}{a_{11}a_{22} - a_{12}^2}} #' \deqn{x_2 = \frac{a_{11}b_2 - a_{12}b_1}{a_{11}a_{22} - a_{12}^2}} #' #' In the above equations, the denominator is constant and thus needs to be calculated only once. Additionally, if non-negativity constraints are to be imposed, #' if \eqn{x_1 < 0} then \eqn{x_1 = 0} and \eqn{x_2 = \frac{b_2}{a_{22}}}. #' Similarly, if \eqn{x_2 < 0} then \eqn{x_2 = 0} and \eqn{x_1 = \frac{b_1}{a_{11}}}. #' #' Rank-2 NMF is useful for bipartitioning, and is a subroutine in \code{\link{bipartition}}, where the sign of the difference between sample loadings #' in both factors gives the partitioning. #' #' **Rank-1 factorization.** Rank-1 factorization by alternating least squares gives vectors equivalent to the first singular vectors in an SVD. It is a normalization of the data to a middle point, #' and may be useful for ordering samples based on the most significant axis of variation (i.e. pseudotime trajectories). Diagonal scaling guarantees consistent linear #' scaling of the factor across random restarts. #' #' **Random seed and reproducibility.** Results of rank-1 and rank-2 factorizations should be reproducible regardless of random seed. For higher-rank models, #' results across random restarts should, in theory, be comparable at very high tolerances (i.e. machine precision for _double_, corresponding to about \code{tol = 1e-10}).
However, to guarantee #' reproducibility without such low tolerances, set the \code{seed} argument. Note that \code{set.seed()} will not work. Only random initialization is supported, as other methods #' incur unnecessary overhead and sometimes trap updates into local minima. #' #' **Rank determination.** This function does not attempt to provide a method for rank determination. Like any clustering algorithm or dimensional reduction, #' finding the optimal rank can be subjective. An easy way to #' estimate rank uses the "elbow method", where the inflection point on a plot of Mean Squared Error loss (MSE) vs. rank #' gives a good idea of the rank at which most of the signal has been captured in the model. Unfortunately, this inflection point #' is not often as obvious for NMF as it is for SVD or PCA. #' #' k-fold cross-validation is a better method. Missing value of imputation has previously been proposed, but is arguably no less subjective #' than test-training splits and requires computationally slower factorization updates using missing values, which are not supported here. #' #' **Symmetric factorization.** Special optimization for symmetric matrices is automatically applied. Specifically, alternating updates of \code{w} and \code{h} #' require transposition of \code{A}, but \code{A == t(A)} when \code{A} is symmetric, thus no up-front transposition is performed. #' #' **Zero-masking**. When zeros in a data structure can be regarded as "missing", \code{mask_zeros = TRUE} may be set. However, this requires a slower #' algorithm, and tolerances will fluctuate more dramatically. #' #' **Publication reference.** For theoretical and practical considerations, please see our manuscript: "DeBruine ZJ, Melcher K, Triche TJ (2021) #' High-performance non-negative matrix factorization for large single cell data." on BioRXiv. #' #' @param A matrix of features-by-samples in dense or sparse format (preferred classes are "matrix" or "Matrix::dgCMatrix", respectively). Prefer sparse storage when more than half of all values are zero. #' @param nonneg enforce non-negativity #' @param k rank #' @param diag scale factors in \eqn{w} and \eqn{h} to sum to 1 by introducing a diagonal, \eqn{d}. This should generally never be set to \code{FALSE}. Diagonalization enables symmetry of models in factorization of symmetric matrices, convex L1 regularization, and consistent factor scalings. #' @param tol stopping criteria, the correlation distance between \eqn{w} across consecutive iterations, \eqn{1 - cor(w_i, w_{i-1})} #' @param maxit stopping criteria, maximum number of alternating updates of \eqn{w} and \eqn{h} #' @param L1 L1/LASSO penalties between 0 and 1, array of length two for \code{c(w, h)} #' @param seed random seed for model initialization #' @param verbose print model tolerances between iterations #' @param mask_zeros handle zeros as missing values, available only when \code{A} is sparse #' @return #' A list giving the factorization model: #' \itemize{ #' \item w : feature factor matrix #' \item d : scaling diagonal vector #' \item h : sample factor matrix #' \item tol : tolerance between models at final update #' \item iter : number of alternating updates run #' } #' @references #' #' DeBruine, ZJ, Melcher, K, and Triche, TJ. (2021). "High-performance non-negative matrix factorization for large single-cell data." BioRXiv. #' #' Lin, X, and Boutros, PC (2020). "Optimization and expansion of non-negative matrix factorization." BMC Bioinformatics. #' #' Lee, D, and Seung, HS (1999). 
"Learning the parts of objects by non-negative matrix factorization." Nature. #' #' Franc, VC, Hlavac, VC, Navara, M. (2005). "Sequential Coordinate-Wise Algorithm for the Non-negative Least Squares Problem". Proc. Int'l Conf. Computer Analysis of Images and Patterns. Lecture Notes in Computer Science. #' #' @author Zach DeBruine #' #' @export #' @seealso \code{\link{nnls}}, \code{\link{project}}, \code{\link{mse}} #' @md #' @examples #' \dontrun{ #' library(Matrix) #' # basic NMF #' model <- nmf(rsparsematrix(1000, 100, 0.1), k = 10) #' #' # compare rank-2 NMF to second left vector in an SVD #' data(iris) #' A <- as(as.matrix(iris[,1:4]), "dgCMatrix") #' nmf_model <- nmf(A, 2, tol = 1e-5) #' bipartitioning_vector <- apply(nmf_model$w, 1, diff) #' second_left_svd_vector <- base::svd(A, 2)$u[,2] #' abs(cor(bipartitioning_vector, second_left_svd_vector)) #' #' # compare rank-1 NMF with first singular vector in an SVD #' abs(cor(nmf(A, 1)$w[,1], base::svd(A, 2)$u[,1])) #' #' # symmetric NMF #' A <- crossprod(rsparsematrix(100, 100, 0.02)) #' model <- nmf(A, 10, tol = 1e-5, maxit = 1000) #' plot(model$w, t(model$h)) #' # see package vignette for more examples #' } nmf <- function(A, k, tol = 1e-4, maxit = 100, verbose = TRUE, L1 = c(0, 0), seed = NULL, mask_zeros = FALSE, diag = TRUE, nonneg = TRUE) { if (!is.numeric(seed)) seed <- 0 if (length(L1) == 1) L1 <- rep(L1, 2) if (max(L1) >= 1 || min(L1) < 0) stop("L1 penalties must be strictly in the range [0,1)") threads <- getRcppMLthreads() if (is(A, "sparseMatrix")) { A <- as(A, "dgCMatrix") Rcpp_nmf_sparse(A, k, tol, maxit, verbose, nonneg, L1, seed, diag, mask_zeros, threads) } else { if (canCoerce(A, "matrix")) { A <- as.matrix(A) if(mask_zeros) stop("mask_zeros = TRUE not supported when 'A' is in dense format") Rcpp_nmf_dense(A, k, tol, maxit, verbose, nonneg, L1, seed, diag, mask_zeros, threads) } else { stop("'A' was not coercible to a matrix") } } }RcppML/R/RcppExports.R0000644000176200001440000001612314122423332014223 0ustar liggesusers# Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 Rcpp_projectW_sparse <- function(A, w, nonneg, L1, threads, mask_zeros) { .Call('_RcppML_Rcpp_projectW_sparse', PACKAGE = 'RcppML', A, w, nonneg, L1, threads, mask_zeros) } Rcpp_projectH_sparse <- function(A, h, nonneg, L1, threads, mask_zeros) { .Call('_RcppML_Rcpp_projectH_sparse', PACKAGE = 'RcppML', A, h, nonneg, L1, threads, mask_zeros) } Rcpp_projectW_dense <- function(A, w, nonneg, L1, threads, mask_zeros) { .Call('_RcppML_Rcpp_projectW_dense', PACKAGE = 'RcppML', A, w, nonneg, L1, threads, mask_zeros) } Rcpp_projectH_dense <- function(A, h, nonneg, L1, threads, mask_zeros) { .Call('_RcppML_Rcpp_projectH_dense', PACKAGE = 'RcppML', A, h, nonneg, L1, threads, mask_zeros) } Rcpp_mse_sparse <- function(A, w, d, h, mask_zeros, threads) { .Call('_RcppML_Rcpp_mse_sparse', PACKAGE = 'RcppML', A, w, d, h, mask_zeros, threads) } Rcpp_mse_dense <- function(A, w, d, h, mask_zeros, threads) { .Call('_RcppML_Rcpp_mse_dense', PACKAGE = 'RcppML', A, w, d, h, mask_zeros, threads) } Rcpp_nmf_sparse <- function(A, k, tol, maxit, verbose, nonneg, L1, seed, diag, mask_zeros, threads) { .Call('_RcppML_Rcpp_nmf_sparse', PACKAGE = 'RcppML', A, k, tol, maxit, verbose, nonneg, L1, seed, diag, mask_zeros, threads) } Rcpp_nmf_dense <- function(A, k, tol, maxit, verbose, nonneg, L1, seed, diag, mask_zeros, threads) { .Call('_RcppML_Rcpp_nmf_dense', PACKAGE = 'RcppML', A, k, tol, maxit, verbose, nonneg, 
L1, seed, diag, mask_zeros, threads) } Rcpp_bipartition_sparse <- function(A, tol, maxit, nonneg, samples, seed, verbose = FALSE, calc_dist = FALSE, diag = TRUE) { .Call('_RcppML_Rcpp_bipartition_sparse', PACKAGE = 'RcppML', A, tol, maxit, nonneg, samples, seed, verbose, calc_dist, diag) } Rcpp_bipartition_dense <- function(A, tol, maxit, nonneg, samples, seed, verbose = FALSE, calc_dist = FALSE, diag = TRUE) { .Call('_RcppML_Rcpp_bipartition_dense', PACKAGE = 'RcppML', A, tol, maxit, nonneg, samples, seed, verbose, calc_dist, diag) } Rcpp_dclust_sparse <- function(A, min_samples, min_dist, verbose, tol, maxit, nonneg, seed, threads) { .Call('_RcppML_Rcpp_dclust_sparse', PACKAGE = 'RcppML', A, min_samples, min_dist, verbose, tol, maxit, nonneg, seed, threads) } #' @title Non-negative least squares #' #' @description Solves the equation \code{a %*% x = b} for \code{x} subject to \eqn{x > 0}. #' #' @details #' This is a very fast implementation of non-negative least squares (NNLS), suitable for very small or very large systems. #' #' **Algorithm**. Sequential coordinate descent (CD) is at the core of this implementation, and requires an initialization of \eqn{x}. There are two supported methods for initialization of \eqn{x}: #' 1. **Zero-filled initialization** when \code{fast_nnls = FALSE} and \code{cd_maxit > 0}. This is generally very efficient for well-conditioned and small systems. #' 2. **Approximation with FAST** when \code{fast_nnls = TRUE}. Forward active set tuning (FAST), described below, finds an approximate active set using unconstrained least squares solutions found by Cholesky decomposition and substitution. To use only FAST approximation, set \code{cd_maxit = 0}. #' #' \code{a} must be symmetric positive definite if FAST NNLS is used, but this is not checked. #' #' See our BioRXiv manuscript (references) for benchmarking against Lawson-Hanson NNLS and for a more technical introduction to these methods. #' #' **Coordinate Descent NNLS**. Least squares by **sequential coordinate descent** is used to ensure the solution returned is exact. This algorithm was #' introduced by Franc et al. (2005), and our implementation is a vectorized and optimized rendition of that found in the NNLM R package by Xihui Lin (2020). #' #' **FAST NNLS.** Forward active set tuning (FAST) is an exact or near-exact NNLS approximation initialized by an unconstrained #' least squares solution. Negative values in this unconstrained solution are set to zero (the "active set"), and all #' other values are added to a "feasible set". An unconstrained least squares solution is then solved for the #' "feasible set", any negative values in the resulting solution are set to zero, and the process is repeated until #' the feasible set solution is strictly positive. #' #' The FAST algorithm has a definite convergence guarantee because the #' feasible set will either converge or become smaller with each iteration. The result is generally exact or nearly #' exact for small well-conditioned systems (< 50 variables) within 2 iterations and thus sets up coordinate #' descent for very rapid convergence. The FAST method is similar to the first phase of the so-called "TNT-NN" algorithm (Myre et al., 2017), #' but the latter half of that method relies heavily on heuristics to refine the approximate active set, which we avoid by using #' coordinate descent instead. 
#' #' @param a symmetric positive definite matrix giving coefficients of the linear system #' @param b matrix giving the right-hand side(s) of the linear system #' @param L1 L1/LASSO penalty to be subtracted from \code{b} #' @param fast_nnls initialize coordinate descent with a FAST NNLS approximation #' @param cd_maxit maximum number of coordinate descent iterations #' @param cd_tol stopping criteria, difference in \eqn{x} across consecutive solutions over the sum of \eqn{x} #' @return vector or matrix giving solution for \code{x} #' @export #' @author Zach DeBruine #' @seealso \code{\link{nmf}}, \code{\link{project}} #' @md #' #' @references #' #' DeBruine, ZJ, Melcher, K, and Triche, TJ. (2021). "High-performance non-negative matrix factorization for large single-cell data." BioRXiv. #' #' Franc, VC, Hlavac, VC, and Navara, M. (2005). "Sequential Coordinate-Wise Algorithm for the Non-negative Least Squares Problem. Proc. Int'l Conf. Computer Analysis of Images and Patterns." #' #' Lin, X, and Boutros, PC (2020). "Optimization and expansion of non-negative matrix factorization." BMC Bioinformatics. #' #' Myre, JM, Frahm, E, Lilja DJ, and Saar, MO. (2017) "TNT-NN: A Fast Active Set Method for Solving Large Non-Negative Least Squares Problems". Proc. Computer Science. #' #' @examples #' \dontrun{ #' # compare solution to base::solve for a random system #' X <- matrix(runif(100), 10, 10) #' a <- crossprod(X) #' b <- crossprod(X, runif(10)) #' unconstrained_soln <- solve(a, b) #' nonneg_soln <- nnls(a, b) #' unconstrained_err <- mean((a %*% unconstrained_soln - b)^2) #' nonnegative_err <- mean((a %*% nonneg_soln - b)^2) #' unconstrained_err #' nonnegative_err #' all.equal(solve(a, b), nnls(a, b)) #' #' # example adapted from multiway::fnnls example 1 #' X <- matrix(1:100,50,2) #' y <- matrix(101:150,50,1) #' beta <- solve(crossprod(X)) %*% crossprod(X, y) #' beta #' beta <- nnls(crossprod(X), crossprod(X, y)) #' beta #' } nnls <- function(a, b, cd_maxit = 100L, cd_tol = 1e-8, fast_nnls = FALSE, L1 = 0) { .Call('_RcppML_nnls', PACKAGE = 'RcppML', a, b, cd_maxit, cd_tol, fast_nnls, L1) } RcppML/R/threads.R0000644000176200001440000000407714122415071013372 0ustar liggesusers#' Get the number of threads RcppML should use #' #' @description Get the number of threads that will be used by RcppML functions supporting parallelization with OpenMP. Use \code{\link{setRcppMLthreads}} to set the number of threads to be used. #' #' @returns integer giving number of threads to be used by RcppML functions. \code{0} corresponds to all available threads, as determined by OpenMP. #' @export #' @seealso \code{\link{setRcppMLthreads}} #' @author Zach DeBruine #' @examples #' \dontrun{ #' # set serial configuration #' setRcppMLthreads(1) #' getRcppMLthreads() #' #' # restore default parallel configuration, #' # letting OpenMP decide how many threads to use #' setRcppMLthreads(0) #' getRcppMLthreads() #' } getRcppMLthreads <- function() { threads <- Sys.getenv("RcppMLthreads") ifelse(threads == "", 0L, as.integer(threads)) } #' Set the number of threads RcppML should use #' #' @description The number of threads is 0 by default (corresponding to all available threads), but can be set manually using this function. If you clear environment variables or affect the "RcppMLthreads" environment variable specifically, you will need to set your number of preferred threads again. #' #' @details #' The number of threads set affects OpenMP parallelization only for functions in the RcppML package. 
It does not affect other R packages that use OpenMP. Parallelization is used for projection of linear factor models with rank > 2, calculation of mean squared error for linear factor models, and for divisive clustering. #' #' @param threads number of threads to be used in RcppML functions that are parallelized with OpenMP. #' @export #' @seealso \code{\link{getRcppMLthreads}} #' @author Zach DeBruine #' @examples #' \dontrun{ #' # set serial configuration #' setRcppMLthreads(1) #' getRcppMLthreads() #' #' # restore default parallel configuration, #' # letting OpenMP decide how many threads to use #' setRcppMLthreads(0) #' getRcppMLthreads() #' } setRcppMLthreads <- function(threads) { Sys.setenv("RcppMLthreads" = threads) } RcppML/R/project.R0000644000176200001440000001262514122413042013400 0ustar liggesusers#' @title Project a linear factor model #' #' @description Solves the equation \eqn{A = wh} for either \eqn{h} or \eqn{w} given either \eqn{w} or \eqn{h} and \eqn{A} #' #' @details #' For the classical alternating least squares matrix factorization update problem \eqn{A = wh}, the update #' (or projection) of \eqn{h} is given by the equation: \deqn{w^Twh = wA_j} #' which is in the form \eqn{ax = b} where \eqn{a = w^Tw}, \eqn{x = h}, and \eqn{b = wA_j} for all columns \eqn{j} in \eqn{A}. #' #' Given \eqn{A}, \code{project} can solve for either \eqn{w} or \eqn{h} given the other: #' * When given \eqn{A} and \eqn{w}, \eqn{h} is found using a highly efficient parallelization scheme. #' * When given \eqn{A} and \eqn{h}, \eqn{w} is found without transposition of \eqn{A} (as would be the case in traditional block-pivoting matrix factorization) by accumulating the right-hand sides of linear systems in-place in \eqn{A}, then solving the systems. Note that \eqn{w} may also be found by inputting the transpose of \eqn{A} and \eqn{h} in place of \eqn{w} (i.e. \code{A = t(A), w = h, h = NULL}). However, for most applications, the cost of a single projection in-place is less than transposition of \eqn{A}. For matrix factorization, however, it is desirable to transpose \eqn{A} when possible because many projections are needed. #' #' **Parallelization.** Least squares projections in factorizations of rank-3 and greater are parallelized using the number of threads set by \code{\link{setRcppMLthreads}}. #' By default, all available threads are used, see \code{\link{getRcppMLthreads}}. The overhead of parallelization is too great for rank-1 and -2 factorization. #' #' **L1 Regularization.** Any L1 penalty is subtracted from \eqn{b} and should generally be scaled to \code{max(b)}, where \eqn{b = wA_j} for all columns \eqn{j} in \eqn{A}. An easy way to properly scale an L1 penalty is to normalize all columns in \eqn{w} to sum to 1. No scaling is applied in this function. Such scaling guarantees that \code{L1 = 1} gives a completely sparse solution. #' #' **Specializations.** There are specializations for symmetric input matrices, and for rank-1 and rank-2 projections. See documentation for \code{\link{nmf}} for theoretical details and guidance. #' #' **Publication reference.** For theoretical and practical considerations, please see our manuscript: "DeBruine ZJ, Melcher K, Triche TJ (2021) #' High-performance non-negative matrix factorization for large single cell data." on BioRXiv.
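#'
#' Conceptually (ignoring masking, sparse-specific optimizations, and parallelization), projecting \eqn{h} amounts to one non-negative least squares solve of the normal equations per column of \eqn{A}. A dense sketch using \code{\link{nnls}}:
#' ```
#' # roughly what project(A, w = w) computes when "A" and "w" are dense
#' a <- crossprod(w)      # w'w, a k x k matrix
#' b <- crossprod(w, A)   # w'A, a k x n matrix of right-hand sides
#' h <- nnls(a, b)        # solves w'w h_j = w'A_j for every column j
#' ```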
#' #' @inheritParams nmf #' @param w dense matrix of factors x features giving the linear model to be projected (if \code{h = NULL}) #' @param h dense matrix of factors x samples giving the linear model to be projected (if \code{w = NULL}) #' @param L1 L1/LASSO penalty to be applied. No scaling is performed. See details. #' @returns matrix \eqn{h} or \eqn{w} #' @references #' DeBruine, ZJ, Melcher, K, and Triche, TJ. (2021). "High-performance non-negative matrix factorization for large single-cell data." BioRXiv. #' @author Zach DeBruine #' @export #' @seealso \code{\link{nnls}}, \code{\link{nmf}} #' @md #' @examples #' \dontrun{ #' library(Matrix) #' w <- matrix(runif(1000 * 10), 1000, 10) #' h_true <- matrix(runif(10 * 100), 10, 100) #' # A is the crossproduct of "w" and "h" with 10% signal dropout #' A <- (w %*% h_true) * (rsparsematrix(1000, 100, 0.9) > 0) #' h <- project(A, w) #' cor(as.vector(h_true), as.vector(h)) #' #' # alternating projections refine solution (like NMF) #' mse_bad <- mse(A, w, rep(1, ncol(w)), h) # mse before alternating updates #' h <- project(A, w = w) #' w <- project(A, h = h) #' h <- project(A, w) #' w <- project(A, h = h) #' h <- project(A, w) #' w <- t(project(A, h = h)) #' mse_better <- mse(A, w, rep(1, ncol(w)), h) # mse after alternating updates #' mse_better < mse_bad #' #' # two ways to solve for "w" that give the same solution #' w <- project(A, h = h) #' w2 <- project(t(A), w = t(h)) #' all.equal(w, w2) #' } project <- function(A, w = NULL, h = NULL, nonneg = TRUE, L1 = 0, mask_zeros = FALSE) { threads <- getRcppMLthreads() if ((is.null(w) && is.null(h)) || (!is.null(w) && !is.null(h))) stop("specify one of 'w' or 'h', leaving the other 'NULL'") if (!is.null(w) && mask_zeros) stop("'mask_zeros = TRUE' is not supported for projections of 'h'. Use 'w' <- project(t(A), w = h)' instead.") # get 'A' in either sparse or dense matrix format if (is(A, "sparseMatrix")) { A <- as(A, "dgCMatrix") } else if (canCoerce(A, "matrix")) { A <- as.matrix(A) } else stop("'A' was not coercible to a matrix") # check that dimensions of w or h are compatible with A if (!is.null(w)) { if (nrow(A) == nrow(w) && nrow(A) != ncol(A)) w <- t(w) if (nrow(A) != ncol(w)) stop("dimensions of 'A' and 'w' are incompatible!") } else { if (ncol(A) == nrow(h) && nrow(A) != ncol(A)) h <- t(h) if (ncol(A) != ncol(h)) stop("dimensions of 'A' and 'h' are incompatible!") } # select backend based on whether 'A' is dense or sparse if (class(A) == "dgCMatrix") { if (!is.null(w)) { Rcpp_projectW_sparse(A, w, nonneg, L1, threads, mask_zeros) } else { Rcpp_projectH_sparse(A, h, nonneg, L1, threads, mask_zeros) } } else { if (!is.null(w)) { Rcpp_projectW_dense(A, w, nonneg, L1, threads, mask_zeros) } else { Rcpp_projectH_dense(A, h, nonneg, L1, threads, mask_zeros) } } }RcppML/R/mse.R0000644000176200001440000000471514122413456012530 0ustar liggesusers#' Mean Squared Error loss of a factor model #' #' @description MSE of factor models \code{w} and \code{h} given sparse matrix \eqn{A} #' #' @details Mean squared error of a matrix factorization of the form \eqn{A = wdh} is given by \deqn{\frac{\sum_{i,j}{(A - wdh)^2}}{ij}} where \eqn{i} and \eqn{j} are the number of rows and columns in \eqn{A}. #' #' Thus, this function simply calculates the cross-product of \eqn{wh} or \eqn{wdh} (if \eqn{d} is specified), #' subtracts that from \eqn{A}, squares the result, and calculates the mean of all values. 
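#'
#' In plain R, for a dense model, this is conceptually (illustration only; the exported function evaluates this in C++):
#' ```
#' mean((A - w %*% diag(d) %*% h)^2)
#' ```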
#' #' If no diagonal scaling vector is present in the model, input \code{d = rep(1, k)} where \code{k} is the rank of the model. #' #' **Parallelization.** Calculation of mean squared error is performed in parallel across columns in \code{A} using the number of threads set by \code{\link{setRcppMLthreads}}. #' By default, all available threads are used, see \code{\link{getRcppMLthreads}}. #' #' @inheritParams nmf #' @param w dense matrix of class \code{matrix} with factors (columns) by features (rows) #' @param d diagonal scaling vector of rank length #' @param h dense matrix of class \code{matrix} with samples (columns) by factors (rows) #' @return mean squared error of the factorization model #' @export #' @md #' @author Zach DeBruine #' @examples #' \dontrun{ #' library(Matrix) #' A <- Matrix::rsparsematrix(1000, 1000, 0.1) #' model <- nmf(A, k = 10, tol = 0.01) #' c_mse <- mse(A, model$w, model$d, model$h) #' R_mse <- mean((A - model$w %*% Diagonal(x = model$d) %*% model$h)^2) #' all.equal(c_mse, R_mse) #' } mse <- function(A, w, d = NULL, h, mask_zeros = FALSE) { threads <- getRcppMLthreads() if (is(A, "sparseMatrix")) { A <- as(A, "dgCMatrix") } else if (canCoerce(A, "matrix")) { A <- as.matrix(A) } else stop("'A' was not coercible to a matrix") if (nrow(w) == nrow(A)) w <- t(w) if (nrow(h) == ncol(A)) h <- t(h) if (nrow(w) != nrow(h)) stop("'w' and 'h' are not of equal rank") if (ncol(w) != nrow(A)) stop("dimensions of 'w' and 'A' are incompatible") if (ncol(h) != ncol(A)) stop("dimensions of 'h' and 'A' are incompatible") if (is.null(d)) { d <- rep(1, ncol(w)) } else { if (length(d) != nrow(w)) stop("length of 'd' and rank of 'w' and 'h' are not equivalent") } if (class(A) == "dgCMatrix") { Rcpp_mse_sparse(A, w, d, h, mask_zeros, threads) } else { Rcpp_mse_dense(A, w, d, h, mask_zeros, threads) } }RcppML/R/dclust.R0000644000176200001440000001152014122422714013227 0ustar liggesusers#' @title Divisive clustering #' #' @description Recursive bipartitioning by rank-2 matrix factorization with an efficient modularity-approximate stopping criteria #' #' @details #' Divisive clustering is a sensitive and fast method for sample classification. Samples are recursively partitioned into two groups until a stopping criteria is satisfied and prevents successful partitioning. #' #' See \code{\link{nmf}} and \code{\link{bipartition}} for technical considerations and optimizations relevant to bipartitioning. #' #' **Stopping criteria**. Two stopping criteria are used to prevent indefinite division of clusters and tune the clustering resolution to a desirable range: #' * \code{min_samples}: Minimum number of samples permitted in a cluster #' * \code{min_dist}: Minimum cosine distance of samples to their cluster center relative to their unassigned cluster center (an approximation of Newman-Girvan modularity) #' #' Newman-Girvan modularity (\eqn{Q}) is an interpretable and widely used measure of modularity for a bipartition. However, it requires the calculation of distance between all within-cluster and between-cluster sample pairs. This is computationally intensive, especially for large sample sets. #' #' \code{dclust} uses a measure which linearly approximates Newman-Girvan modularity, and simply requires the calculation of distance between all samples in a cluster and both cluster centers (the assigned and unassigned center), which is orders of magnitude faster to compute. Cosine distance is used instead of Euclidean distance since it handles outliers and sparsity well. 
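#'
#' For reference, the cosine distance between a sample and a cluster center takes the usual form, sketched here only to fix notation (the exported implementation is in C++):
#' ```
#' cosine_dist <- function(x, center) {
#'   1 - sum(x * center) / sqrt(sum(x^2) * sum(center^2))
#' }
#' ```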
#' #' A bipartition is rejected if either of the two clusters contains fewer than \code{min_samples} or if the mean relative cosine distance of the bipartition is less than \code{min_dist}. #' #' A bipartition will only be attempted if there are more than \code{2 * min_samples} samples in the cluster, meaning that \code{dist} may not be calculated for some clusters. #' #' **Reproducibility.** Because rank-2 NMF is approximate and requires random initialization, results may vary slightly across restarts. Therefore, specify a \code{seed} to guarantee absolute reproducibility. #' #' Other than setting the seed, reproducibility may be improved by setting \code{tol} to a smaller number to increase the exactness of each bipartition. #' #' @inheritParams nmf #' @param A matrix of features-by-samples in sparse format (preferred class is "Matrix::dgCMatrix") #' @param min_dist stopping criteria giving the minimum cosine distance of samples within a cluster to the center of their assigned vs. unassigned cluster. If \code{0}, neither this distance nor cluster centroids will be calculated. #' @param min_samples stopping criteria giving the minimum number of samples permitted in a cluster #' @param verbose print number of divisions in each generation #' @param tol in rank-2 NMF, the correlation distance (\eqn{1 - R^2}) between \eqn{w} across consecutive iterations at which to stop factorization #' @param nonneg in rank-2 NMF, enforce non-negativity #' @param seed random seed for rank-2 NMF model initialization #' @return #' A list of lists corresponding to individual clusters: #' \itemize{ #' \item id : character sequence of "0" and "1" giving position of clusters along splitting hierarchy #' \item samples : indices of samples in the cluster #' \item center : mean feature expression of all samples in the cluster #' \item dist : if applicable, relative cosine distance of samples in cluster to assigned/unassigned cluster center. #' \item leaf : is cluster a leaf node #' } #' #' @author Zach DeBruine #' #' @references #' #' Schwartz, G. et al. "TooManyCells identifies and visualizes relationships of single-cell clades". Nature Methods (2020). #' #' Newman, MEJ. "Modularity and community structure in networks". PNAS (2006) #' #' Kuang, D, Park, H. (2013). "Fast rank-2 nonnegative matrix factorization for hierarchical document clustering." Proc. 19th ACM SIGKDD intl. conf. on Knowledge discovery and data mining. #' #' @export #' @seealso \code{\link{bipartition}}, \code{\link{nmf}} #' @md #' @examples #' \dontrun{ #' library(Matrix) #' data(USArrests) #' A <- as(as.matrix(t(USArrests)), "dgCMatrix") #' clusters <- dclust(A, min_samples = 2, min_dist = 0.001) #' str(clusters) #' } dclust <- function(A, min_samples, min_dist = 0, verbose = TRUE, tol = 1e-5, maxit = 100, nonneg = TRUE, seed = NULL){ threads <- getRcppMLthreads() if (!is.numeric(seed)) seed <- 0 if(canCoerce(A, "dgCMatrix")){ A <- as(A, "dgCMatrix") } else if(canCoerce(A, "matrix")){ A <- as.matrix(A) A <- as(A, "dgCMatrix") } else stop("'A' could not be coerced to a dgCMatrix") Rcpp_dclust_sparse(A, min_samples, min_dist, verbose, tol, maxit, nonneg, seed, threads) } RcppML/R/bipartition.R0000644000176200001440000000615714122413110014255 0ustar liggesusers#' @title Bipartition a sample set #' #' @description Spectral biparitioning by rank-2 matrix factorization #' #' @details #' Spectral bipartitioning is a popular subroutine in divisive clustering. 
The sign of the difference between sample loadings in factors of a rank-2 matrix factorization #' gives a bipartition that is nearly identical to an SVD. #' #' Rank-2 matrix factorization by alternating least squares is faster than rank-2-truncated SVD (i.e. _irlba_). #' #' This function is a specialization of rank-2 \code{\link{nmf}} with support for factorization of only a subset of samples, and with additional calculations on the factorization model relevant to bipartitioning. See \code{\link{nmf}} for details regarding rank-2 factorization. #' #' @inheritParams nmf #' @param samples samples to include in bipartition, numbered from 1 to \code{ncol(A)}. Default is \code{NULL} for all samples. #' @param calc_dist calculate the relative cosine distance of samples within a cluster to either cluster centroid. If \code{TRUE}, centers for clusters will also be calculated. #' @return #' A list giving the bipartition and useful statistics: #' \itemize{ #' \item v : vector giving difference between sample loadings between factors in a rank-2 factorization #' \item dist : relative cosine distance of samples within a cluster to centroids of assigned vs. not-assigned cluster #' \item size1 : number of samples in first cluster (positive loadings in 'v') #' \item size2 : number of samples in second cluster (negative loadings in 'v') #' \item samples1: indices of samples in first cluster #' \item samples2: indices of samples in second cluster #' \item center1 : mean feature loadings across samples in first cluster #' \item center2 : mean feature loadings across samples in second cluster #' } #' #' @references #' #' Kuang, D, Park, H. (2013). "Fast rank-2 nonnegative matrix factorization for hierarchical document clustering." Proc. 19th ACM SIGKDD intl. conf. on Knowledge discovery and data mining. 
#' #' @author Zach DeBruine #' #' @export #' @seealso \code{\link{nmf}}, \code{\link{dclust}} #' @md #' @examples #' \dontrun{ #' library(Matrix) #' data(iris) #' A <- as(as.matrix(iris[,1:4]), "dgCMatrix") #' bipartition(A, calc_dist = TRUE) #' } bipartition <- function(A, tol = 1e-5, maxit = 100, nonneg = TRUE, samples = 1:ncol(A), seed = NULL, verbose = FALSE, calc_dist = FALSE, diag = TRUE){ if (is(A, "sparseMatrix")) { A <- as(A, "dgCMatrix") } else if (canCoerce(A, "matrix")) { A <- as.matrix(A) } else stop("'A' was not coercible to a matrix") if(!is.numeric(seed)) seed <- 0 if(min(samples) == 0) stop("sample indices must be strictly positive") if(max(samples) > ncol(A)) stop("sample indices must be strictly less than the number of columns in 'A'") samples <- samples - 1 if(class(A) == "dgCMatrix"){ Rcpp_bipartition_sparse(A, tol, maxit, nonneg, samples, seed, verbose, calc_dist, diag) } else { Rcpp_bipartition_dense(A, tol, maxit, nonneg, samples, seed, verbose, calc_dist, diag) } }RcppML/MD50000644000176200001440000000443214122425662011726 0ustar liggesusersc5b396120c19d467d7f4dc8d06cafdca *DESCRIPTION 3377d5684e74f96f3c2c78f6d7f166fa *NAMESPACE 1fedb39976b1fcdcd0a064ad47ab631c *R/RcppExports.R 9181ceda27553909d50dedfcadb3c985 *R/RcppML.R 62638e98da4e2fb671d3228c0987d6c6 *R/bipartition.R e7e5c6bb66600add88c70494d2d96920 *R/dclust.R d2a7e00df8cf5f7727faaa37628ee17f *R/mse.R 0da3f016442fbaa1c793722152cf77e1 *R/nmf.R ce6ca375a07fe70c45bcfa2de2160aa3 *R/project.R 9712183ad270b6a912e33eda10eee90a *R/threads.R be58b4dc94d8ea288647b7ab29b54fa8 *build/vignette.rds 6a7eba04c33f0af82e9bc5c0a10723fa *inst/doc/RcppML.R 00220b1c42c7736eefe3a01528096f1a *inst/doc/RcppML.Rmd c2c943798bc107224191c6d2583d01a8 *inst/doc/RcppML.html 9a45c2574773503fd3479d93eb41837a *inst/include/RcppML.hpp 1269cf76d78146bd4ab66e8e4f9f69dd *inst/include/RcppML/SparseMatrix.hpp f461d4e6447f0f191b0b966109fe25b4 *inst/include/RcppML/bipartition.hpp 03e60b561e3d4c9b496bb39aefe05a56 *inst/include/RcppML/bits.hpp af16b201b7ed311ff457552c1fb98559 *inst/include/RcppML/cluster.hpp e7655a367ed1a96ae6f609c6bde314b1 *inst/include/RcppML/nmf.hpp b022cdc9ffc948d674dc1da6e6138932 *inst/include/RcppML/project.hpp 3256a6f3e6b66d24f882b2e8da906758 *inst/include/RcppMLCommon.hpp f9f6354a654f809293fa49dde977c5df *man/RcppML.Rd 9d08f92a0694337726875f2cc63feb59 *man/bipartition.Rd 8a40fd15369f4e14b9aed00aaaec3e04 *man/dclust.Rd 39102f0e4594c6a346afb3f77737586a *man/getRcppMLthreads.Rd fae5048377c91c4dc8005d4557dd00b8 *man/mse.Rd 029cd0bcd61808da6de9fddda01b1fed *man/nmf.Rd 2e046d807bdc0f7d7fc29c8e4dc2d7ab *man/nnls.Rd 863469d94bef888befa4238ed0bbf504 *man/project.Rd e22fae8013f7fcf6e5a7bf791831e5dd *man/setRcppMLthreads.Rd 4ce1199386023d946899a147df80fe8e *src/Makevars 4ce1199386023d946899a147df80fe8e *src/Makevars.win bf245f198bdc25054b6960435dd7f54a *src/RcppExports.cpp 5bc93e20f6c3156f0623a170b178eb1b *src/RcppFunctions.cpp fec3faba2aad06bd6c3bc0509d0360ed *tests/testthat.R ac2d5e156b90fa5b3664ec7eb7a00ce7 *tests/testthat/test_bipartition.R 732732aef64806ec13f6d9a0778271ae *tests/testthat/test_dclust.R 52f7a7af058963ce0e0e248cc54ac40f *tests/testthat/test_mse.R fe2cf978640682327099bce7cbd12e7f *tests/testthat/test_nmf.R f69bb289c2cbe0939e1c4cd17c8da2db *tests/testthat/test_nnls.R 98d52249bd808f25890c12915217706d *tests/testthat/test_project.R 00220b1c42c7736eefe3a01528096f1a *vignettes/RcppML.Rmd RcppML/inst/0000755000176200001440000000000014122423360012361 5ustar 
liggesusersRcppML/inst/doc/0000755000176200001440000000000014122423360013126 5ustar liggesusersRcppML/inst/doc/RcppML.R0000644000176200001440000000316014122423357014414 0ustar liggesusers## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ---- eval = FALSE------------------------------------------------------------ # library(devtools) # install_github("zdebruine/RcppML") ## ----------------------------------------------------------------------------- library(RcppML) library(Matrix) ## ----------------------------------------------------------------------------- # construct a system of equations X <- matrix(rnorm(2000),100,20) btrue <- runif(20) y <- X %*% btrue + rnorm(100) a <- crossprod(X) b <- crossprod(X, y) # solve the system of equations x <- RcppML::nnls(a, b) # use only coordinate descent x <- RcppML::nnls(a, b, fast_nnls = FALSE, cd_maxit = 1000, cd_tol = 1e-8) ## ----------------------------------------------------------------------------- # simulate a sparse matrix A <- rsparsematrix(1000, 100, 0.1) # simulate a linear factor model w <- matrix(runif(1000 * 10), 1000, 10) # project the model h <- RcppML::project(A, w) ## ----------------------------------------------------------------------------- A <- rsparsematrix(100, 100, 0.1) model <- RcppML::nmf(A, 10, verbose = F) w <- model$w d <- model$d h <- model$h model_tolerance <- tail(model$tol, 1) ## ----------------------------------------------------------------------------- A_sym <- as(crossprod(A), "dgCMatrix") model <- RcppML::nmf(A_sym, 10, verbose = F) ## ----------------------------------------------------------------------------- RcppML::mse(A_sym, model$w, model$d, model$h) RcppML/inst/doc/RcppML.Rmd0000644000176200001440000001060414111717072014734 0ustar liggesusers--- title: "Introduction to the RcppML package" author: "Zach DeBruine" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Introduction to the RcppML package} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` The 'RcppML' package provides high-performance machine learning algorithms using Rcpp with a focus on matrix factorization. ## Installation Install the latest development version of RcppML from github: ```{R, eval = FALSE} library(devtools) install_github("zdebruine/RcppML") ``` ```{R} library(RcppML) library(Matrix) ``` ## Non-Negative Least Squares RcppML contains extremely fast NNLS solvers. Use the `nnls` function to solve systems of equations subject to non-negativity constraints. The `RcppML::solve` function solves the equation \eqn{ax = b} for \eqn{x} where \eqn{a} is symmetric positive definite matrix of dimensions \eqn{m x m} and \eqn{b} is a vector of length \eqn{m} or a matrix of dimensions \eqn{m x n}. ```{R} # construct a system of equations X <- matrix(rnorm(2000),100,20) btrue <- runif(20) y <- X %*% btrue + rnorm(100) a <- crossprod(X) b <- crossprod(X, y) # solve the system of equations x <- RcppML::nnls(a, b) # use only coordinate descent x <- RcppML::nnls(a, b, fast_nnls = FALSE, cd_maxit = 1000, cd_tol = 1e-8) ``` `RcppML::solve` implements a new and fastest-in-class algorithm for non-negative least squares: 1. *initialization* is done by solving for the unconstrained least squares solution. 2. 
*forward active set tuning* (FAST) provides a near-exact solution (often exact for well-conditioned systems) by setting all negative values in the unconstrained solution to zero, re-solving the system for only positive values, and repeating the process until the solution for values not constrained to zero is strictly positive. Set `cd_maxit = 0` to use only the FAST solver. 3. *Coordinate descent* refines the FAST solution and finds the best solution discoverable by gradient descent. The coordinate descent solution is only used if it gives a better error than the FAST solution. Generally, coordinate descent re-introduces variables constrained to zero by FAST back into the feasible set, but does not dramatically change the solution. ## Projecting Linear Models Project dense linear factor models onto real-valued sparse matrices (or any matrix coercible to `Matrix::dgCMatrix`) using `RcppML::project`. `RcppML::project` solves the equation \eqn{A = WH} for \eqn{H}. ```{R} # simulate a sparse matrix A <- rsparsematrix(1000, 100, 0.1) # simulate a linear factor model w <- matrix(runif(1000 * 10), 1000, 10) # project the model h <- RcppML::project(A, w) ``` ## Non-negative Matrix Factorization `RcppML::nmf` finds a non-negative matrix factorization by alternating least squares (alternating projections of linear models \eqn{w} and \eqn{h}). There are several ways in which the NMF algorithm differs from other currently available methods: * Diagonalized scaling of factors to sum to 1, permitting convex L1 regularization along the entire solution path * Fast stopping criteria, based on correlation between models across consecutive iterations * Extremely fast algorithms using the Eigen C++ library, optimized for matrices that are >90% sparse * Support for NMF or unconstrained matrix factorization * Parallelized using OpenMP multithreading The following example runs rank-10 NMF on a random 1000 x 1000 matrix that is 90% sparse: ```{R} A <- rsparsematrix(100, 100, 0.1) model <- RcppML::nmf(A, 10, verbose = F) w <- model$w d <- model$d h <- model$h model_tolerance <- tail(model$tol, 1) ``` Tolerance is simply a measure of the average correlation between \eqn{w_{i-1} and \eqn{w_i} and \eqn{h_{i-1}} and \eqn{h_i} for a given iteration \eqn{i}. For symmetric factorizations (when \code{symmetric = TRUE}), tolerance becomes a measure of the correlation between \eqn{w_i} and \eqn{h_i}, and diagonalization is automatically performed to enforce symmetry: ```{R} A_sym <- as(crossprod(A), "dgCMatrix") model <- RcppML::nmf(A_sym, 10, verbose = F) ``` Mean squared error of a factorization can be calculated for a given model using the `RcppML::mse` function: ```{R} RcppML::mse(A_sym, model$w, model$d, model$h) ``` RcppML/inst/doc/RcppML.html0000644000176200001440000004423514122423360015161 0ustar liggesusers Introduction to the RcppML package

Introduction to the RcppML package

Zach DeBruine

2021-09-21

The ‘RcppML’ package provides high-performance machine learning algorithms using Rcpp with a focus on matrix factorization.

Installation

Install the latest development version of RcppML from github:

library(devtools)
install_github("zdebruine/RcppML")
library(RcppML)
library(Matrix)
#> Warning: package 'Matrix' was built under R version 4.0.5

Non-Negative Least Squares

RcppML contains extremely fast NNLS solvers. Use the nnls function to solve systems of equations subject to non-negativity constraints.

The RcppML::nnls function solves the equation ax = b for x, where a is a symmetric positive definite matrix of dimensions m x m and b is a vector of length m or a matrix of dimensions m x n.

# construct a system of equations
X <- matrix(rnorm(2000),100,20)
btrue <- runif(20)
y <- X %*% btrue + rnorm(100)
a <- crossprod(X)
b <- crossprod(X, y)

# solve the system of equations
x <- RcppML::nnls(a, b)

# use only coordinate descent
x <- RcppML::nnls(a, b, fast_nnls = FALSE, cd_maxit = 1000, cd_tol = 1e-8)

RcppML::nnls implements a new and fastest-in-class algorithm for non-negative least squares (a small base-R sketch of the first two steps follows the list below):

  1. initialization is done by solving for the unconstrained least squares solution.
  2. forward active set tuning (FAST) provides a near-exact solution (often exact for well-conditioned systems) by setting all negative values in the unconstrained solution to zero, re-solving the system for only positive values, and repeating the process until the solution for values not constrained to zero is strictly positive. Set cd_maxit = 0 to use only the FAST solver.
  3. Coordinate descent refines the FAST solution and finds the best solution discoverable by gradient descent. The coordinate descent solution is only used if it gives a better error than the FAST solution. Generally, coordinate descent re-introduces variables constrained to zero by FAST back into the feasible set, but does not dramatically change the solution.
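A minimal base-R sketch of steps 1 and 2 above (illustrative only: fast_nnls_sketch is a hypothetical helper, not the package's C++ solver, and it assumes a is positive definite):

fast_nnls_sketch <- function(a, b, maxit = 100) {
  x <- solve(a, b)                                 # step 1: unconstrained least squares
  for (it in seq_len(maxit)) {
    feas <- which(x > 0)                           # variables not constrained to zero
    if (length(feas) == 0) break
    x[] <- 0
    x[feas] <- solve(a[feas, feas, drop = FALSE], b[feas])   # re-solve on the feasible set
    if (all(x[feas] > 0)) break                    # step 2: stop when the feasible-set solution is strictly positive
  }
  pmax(x, 0)                                       # step 3 (coordinate descent refinement) is omitted here
}

x_fast <- fast_nnls_sketch(a, b)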

Projecting Linear Models

Project dense linear factor models onto real-valued sparse matrices (or any matrix coercible to Matrix::dgCMatrix) using RcppML::project.

RcppML::project solves the equation A = WH for H.

# simulate a sparse matrix
A <- rsparsematrix(1000, 100, 0.1)

# simulate a linear factor model
w <- matrix(runif(1000 * 10), 1000, 10)

# project the model
h <- RcppML::project(A, w)
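
When w is given, the projection of h is the least squares solution of the normal equations crossprod(w) %*% h = crossprod(w, A), subject to non-negativity by default. A quick unconstrained cross-check against base R (a sketch; nonneg = FALSE disables the constraint, and the two results should agree up to the solver's tiny diagonal regularization):

h_unconstrained <- RcppML::project(A, w, nonneg = FALSE)
h_check <- solve(crossprod(w), as.matrix(crossprod(w, A)))   # dense normal-equations solution
max(abs(h_unconstrained - h_check))                          # expected to be near zero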

Non-negative Matrix Factorization

RcppML::nmf finds a non-negative matrix factorization by alternating least squares (alternating projections of linear models w and h).

There are several ways in which the NMF algorithm differs from other currently available methods:

  • Diagonalized scaling of factors to sum to 1, permitting convex L1 regularization along the entire solution path
  • Fast stopping criteria, based on correlation between models across consecutive iterations
  • Extremely fast algorithms using the Eigen C++ library, optimized for matrices that are >90% sparse
  • Support for NMF or unconstrained matrix factorization
  • Parallelized using OpenMP multithreading

The following example runs rank-10 NMF on a random 100 x 100 matrix that is 90% sparse:

A <- rsparsematrix(100, 100, 0.1)
model <- RcppML::nmf(A, 10, verbose = F)

w <- model$w
d <- model$d
h <- model$h
model_tolerance <- tail(model$tol, 1)

Tolerance is simply a measure of the average correlation between w_{i-1} and w_i, and between h_{i-1} and h_i, for a given iteration i.
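
In R terms, this is roughly the following quantity, where w_prev and w_curr stand for hypothetical copies of w from consecutive iterations:

tol_i <- 1 - cor(as.vector(w_prev), as.vector(w_curr))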

For symmetric factorizations (when symmetric = TRUE), tolerance becomes a measure of the correlation between w_i and h_i, and diagonalization is automatically performed to enforce symmetry:

A_sym <- as(crossprod(A), "dgCMatrix")

model <- RcppML::nmf(A_sym, 10, verbose = F)

Mean squared error of a factorization can be calculated for a given model using the RcppML::mse function:

RcppML::mse(A_sym, model$w, model$d, model$h)
#> [1] 1.319477
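
For small matrices the same quantity can be checked by hand with a dense reconstruction (this should match the value above up to numerical precision):

mean((as.matrix(A_sym) - model$w %*% diag(model$d) %*% model$h)^2)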
RcppML/inst/include/0000755000176200001440000000000014111717072014010 5ustar liggesusersRcppML/inst/include/RcppMLCommon.hpp0000644000176200001440000000157114122376467017046 0ustar liggesusers// This file is part of RcppML, a Rcpp Machine Learning library // // Copyright (C) 2021 Zach DeBruine // // This source code is subject to the terms of the GNU // Public License v. 2.0. #ifndef RcppML_common #define RcppML_common //[[Rcpp::plugins(openmp)]] #ifdef _OPENMP #include #endif #ifndef EIGEN_NO_DEBUG #define EIGEN_NO_DEBUG #endif #ifndef TINY_NUM #define TINY_NUM 1e-15 // epsilon for numerical stability #endif // parameters for coordinate descent #ifndef CD_PARAMS #define CD_TOL 1e-8 #define CD_MAXIT 100 #endif #ifndef EIGEN_INITIALIZE_MATRICES_BY_ZERO #define EIGEN_INITIALIZE_MATRICES_BY_ZERO #endif //[[Rcpp::depends(RcppEigen)]] #ifndef RcppEigen__RcppEigen__h #include #endif #ifndef RcppML_sparsematrix #include #endif #ifndef RcppML_bits #include #endif #endif RcppML/inst/include/RcppML.hpp0000644000176200001440000000066714111717072015667 0ustar liggesusers// This file is part of RcppML, a Rcpp Machine Learning library // // Copyright (C) 2021 Zach DeBruine // // This source code is subject to the terms of the GNU // Public License v. 2.0. #ifndef RcppML_hpp #define RcppML_hpp #include "RcppMLCommon.hpp" #include #include #include #include #include #endif RcppML/inst/include/RcppML/0000755000176200001440000000000014122425662015150 5ustar liggesusersRcppML/inst/include/RcppML/cluster.hpp0000644000176200001440000000672314111717072017347 0ustar liggesusers// This file is part of RcppML, a Rcpp Machine Learning library // // Copyright (C) 2021 Zach DeBruine // // This source code is subject to the terms of the GNU // Public License v. 2.0. #ifndef RcppML_dclust #define RcppML_dclust #ifndef RcppML_common #include #endif #ifndef RcppML_bipartition #include #endif struct cluster { std::string id; std::vector samples; std::vector center; double dist; bool leaf; bool agg; }; std::vector indices_that_are_not_leaves(std::vector& clusters) { std::vector ind; for (unsigned int i = 0; i < clusters.size(); ++i) { if (!clusters[i].leaf) ind.push_back(i); } return ind; } namespace RcppML { class clusterModel { public: RcppML::SparseMatrix A; unsigned int min_samples; double min_dist, tol; bool nonneg, verbose; unsigned int seed, maxit, threads; // constructor requiring min_samples and min_dist. All other parameters must be set individually. 
clusterModel(RcppML::SparseMatrix& A, const unsigned int min_samples, const double min_dist) : A(A), min_samples(min_samples), min_dist(min_dist) { nonneg = true; verbose = true; tol = 1e-4; seed = 0; maxit = 100; threads = 0; w = randomMatrix(2, A.rows(), seed); calc_dist = (min_dist > 0); } std::vector getClusters() { return clusters; } void dclust() { if (verbose) Rprintf("\n# of divisions: "); std::vector samples = std::vector(A.cols()); std::iota(samples.begin(), samples.end(), (int)0); cluster parent_cluster{ "0", samples, centroid(A, samples), 0, samples.size() < min_samples * 2, false }; clusters.push_back(parent_cluster); // master cluster unsigned int n_splits; do { // attempt to bipartition all clusters that have not yet been determined to be leaves Rcpp::checkUserInterrupt(); n_splits = 0; std::vector to_split = indices_that_are_not_leaves(clusters); std::vector new_clusters(to_split.size()); #ifdef _OPENMP #pragma omp parallel for num_threads(threads) schedule(dynamic) #endif for (unsigned int i = 0; i < to_split.size(); ++i) { bipartitionModel p = c_bipartition_sparse(A, w, clusters[to_split[i]].samples, tol, nonneg, calc_dist, maxit, false); bool successful_split = (p.size1 > min_samples && p.size2 > min_samples); if (calc_dist && successful_split && p.dist < min_dist) successful_split = false; if (successful_split) { // bipartition was successful new_clusters[i] = cluster{ clusters[to_split[i]].id + "1", p.samples2, p.center2, 0, p.size2 < min_samples * 2, false }; clusters[to_split[i]] = cluster{ clusters[to_split[i]].id + "0", p.samples1, p.center1, 0, p.size1 < min_samples * 2, false }; ++n_splits; } else { // bipartition was unsuccessful clusters[to_split[i]].dist = p.dist; clusters[to_split[i]].leaf = true; } } for (unsigned int i = 0; i < new_clusters.size(); ++i) if (new_clusters[i].id.size() > 0) clusters.push_back(new_clusters[i]); if (verbose) Rprintf(", %u", n_splits); } while (n_splits > 0); if (verbose) Rprintf("\n"); } private: std::vector clusters; Eigen::MatrixXd w; bool calc_dist; }; } #endif RcppML/inst/include/RcppML/SparseMatrix.hpp0000644000176200001440000000501714111717072020303 0ustar liggesusers// zero-copy sparse matrix class for access by reference to R objects already in memory // note that Eigen::SparseMatrix requires a deep copy of R objects for use in C++ #ifndef RcppML_sparsematrix #define RcppML_sparsematrix #ifndef Rcpp_hpp #include #endif namespace RcppML { class SparseMatrix { public: Rcpp::IntegerVector i, p, Dim; Rcpp::NumericVector x; // Constructor from Rcpp::S4 object SparseMatrix(Rcpp::S4 m) : i(m.slot("i")), p(m.slot("p")), Dim(m.slot("Dim")), x(m.slot("x")) {} SparseMatrix() {} unsigned int rows() { return Dim[0]; } unsigned int cols() { return Dim[1]; } // const column iterator class InnerIterator { public: InnerIterator(SparseMatrix& ptr, int col) : ptr(ptr) { index = ptr.p[col]; max_index = ptr.p[col + 1]; } operator bool() const { return (index < max_index); } InnerIterator& operator++() { ++index; return *this; } const double& value() const { return ptr.x[index]; } int row() const { return ptr.i[index]; } private: SparseMatrix& ptr; int index, max_index; }; // const row iterator class InnerRowIterator { public: InnerRowIterator(SparseMatrix& ptr, int j) : ptr(ptr) { index = 0, max_index = 0; for (; index < ptr.Dim[1]; ++index) { if (ptr.i[index] == j) break; } for (int r = 0; r < ptr.i.size(); ++r) { if (ptr.i[r] == j) max_index = r; } } operator bool() const { return index <= max_index; }; InnerRowIterator& operator++() { 
++index; for (; index <= max_index; ++index) { if (ptr.i[index] == row) break; } return *this; }; int col() { int j = 0; for (; j < ptr.p.size(); ++j) { if (ptr.p[j] > index) break; } return j; }; double& value() const { return ptr.x[index]; }; private: SparseMatrix& ptr; int row = 0, index, max_index; }; // return indices of rows with nonzero values for a given column Rcpp::IntegerVector nonzeroRows(int col) { return i[Rcpp::Range(p[col], p[col + 1] - 1)]; } // transpose SparseMatrix t() { Rcpp::S4 s(std::string("dgCMatrix")); s.slot("i") = i; s.slot("p") = p; s.slot("x") = x; s.slot("Dim") = Dim; Rcpp::Environment base("package:Matrix"); Rcpp::Function t_r = base["t"]; Rcpp::S4 At = t_r(Rcpp::_["x"] = s); return SparseMatrix(At); } }; } #endif RcppML/inst/include/RcppML/project.hpp0000644000176200001440000002545114122421421017323 0ustar liggesusers// This file is part of RcppML, a Rcpp Machine Learning library // // Copyright (C) 2021 Zach DeBruine // // This source code is subject to the terms of the GNU // Public License v. 2.0. #ifndef RcppML_project #define RcppML_project #ifndef RcppML_common #include #endif // solve ax = b given "a", "b", and h.col(sample) giving "x". Coordinate descent. inline void c_nnls(Eigen::MatrixXd& a, Eigen::VectorXd& b, Eigen::MatrixXd& h, const unsigned int sample) { double tol = 1; for (unsigned int it = 0; it < CD_MAXIT && (tol / b.size()) > CD_TOL; ++it) { tol = 0; for (unsigned int i = 0; i < h.rows(); ++i) { double diff = b(i) / a(i, i); if (-diff > h(i, sample)) { if (h(i, sample) != 0) { b -= a.col(i) * -h(i, sample); tol = 1; h(i, sample) = 0; } } else if (diff != 0) { h(i, sample) += diff; b -= a.col(i) * diff; tol += std::abs(diff / (h(i, sample) + TINY_NUM)); } } } } // solves ax_i = b for a two-variable system of equations, where the "i"th column in "x" contains the solution. // "denom" is a pre-conditioned denominator for solving by direct substition. inline void nnls2(const Eigen::Matrix2d& a, const double b0, const double b1, const double denom, Eigen::MatrixXd& x, const unsigned int i, const bool nonneg) { // solve least squares if (nonneg) { const double a01b1 = a(0, 1) * b1; const double a11b0 = a(1, 1) * b0; if (a11b0 < a01b1) { x(0, i) = 0; x(1, i) = b1 / a(1, 1); } else { const double a01b0 = a(0, 1) * b0; const double a00b1 = a(0, 0) * b1; if (a00b1 < a01b0) { x(0, i) = b0 / a(0, 0); x(1, i) = 0; } else { x(0, i) = (a11b0 - a01b1) / denom; x(1, i) = (a00b1 - a01b0) / denom; } } } else { x(0, i) = (a(1, 1) * b0 - a(0, 1) * b1) / denom; x(1, i) = (a(0, 0) * b1 - a(0, 1) * b0) / denom; } } // solves ax = b for a two-variable system of equations, where "b" is given in the "i"th column of "w", and replaced with the solution "x". // "denom" is a pre-conditioned denominator for solving by direct substitution. 
inline void nnls2InPlace(const Eigen::Matrix2d& a, const double denom, Eigen::MatrixXd& w, const bool nonneg) { for (unsigned int i = 0; i < w.cols(); ++i) { if (nonneg) { const double a01b1 = a(0, 1) * w(1, i); const double a11b0 = a(1, 1) * w(0, i); if (a11b0 < a01b1) { w(0, i) = 0; w(1, i) /= a(1, 1); } else { const double a01b0 = a(0, 1) * w(0, i); const double a00b1 = a(0, 0) * w(1, i); if (a00b1 < a01b0) { w(0, i) /= a(0, 0); w(1, i) = 0; } else { w(0, i) = (a11b0 - a01b1) / denom; w(1, i) = (a00b1 - a01b0) / denom; } } } else { double b0 = w(0, i); w(0, i) = (a(1, 1) * b0 - a(0, 1) * w(1, i)) / denom; w(1, i) = (a(0, 0) * w(1, i) - a(0, 1) * b0) / denom; } } } void project(RcppML::SparseMatrix& A, const Eigen::MatrixXd& w, Eigen::MatrixXd& h, const bool nonneg, const double L1, const unsigned int threads, const bool mask_zeros) { if (w.rows() == 1) { h.setZero(); double a = 0; for (unsigned int i = 0; i < w.cols(); ++i) a += w(0, i) * w(0, i); for (unsigned int i = 0; i < h.cols(); ++i) { for (RcppML::SparseMatrix::InnerIterator it(A, i); it; ++it) h(0, i) += it.value() * w(0, it.row()); h(0, i) /= a; } } else if (w.rows() == 2) { Eigen::Matrix2d a = w * w.transpose(); a.diagonal().array() += TINY_NUM; const double denom = a(0, 0) * a(1, 1) - a(0, 1) * a(0, 1); for (unsigned int i = 0; i < h.cols(); ++i) { double b0 = 0, b1 = 0; for (RcppML::SparseMatrix::InnerIterator it(A, i); it; ++it) { const double val = it.value(); const unsigned int r = it.row(); b0 += val * w(0, r); b1 += val * w(1, r); } nnls2(a, b0, b1, denom, h, i, nonneg); } } else { if (!mask_zeros) { Eigen::MatrixXd a = w * w.transpose(); a.diagonal().array() += TINY_NUM; Eigen::LLT a_llt = a.llt(); #ifdef _OPENMP #pragma omp parallel for num_threads(threads) schedule(dynamic) #endif for (unsigned int i = 0; i < h.cols(); ++i) { Eigen::VectorXd b = Eigen::VectorXd::Zero(h.rows()); for (RcppML::SparseMatrix::InnerIterator it(A, i); it; ++it) b += it.value() * w.col(it.row()); if (L1 != 0) b.array() -= L1; h.col(i) = a_llt.solve(b); if (nonneg && (h.col(i).array() < 0).any()) { // if unconstrained solution contains negative values b -= a * h.col(i); // update h.col(sample) with NNLS solution by coordinate descent c_nnls(a, b, h, i); } } } else { h.setZero(); #ifdef _OPENMP #pragma omp parallel for num_threads(threads) schedule(dynamic) #endif for (unsigned int i = 0; i < h.cols(); ++i) { Eigen::VectorXi nnz = Rcpp::as(A.nonzeroRows(i)); Eigen::MatrixXd w_ = submat(w, nnz); Eigen::MatrixXd a = w_ * w_.transpose(); a.diagonal().array() += TINY_NUM; Eigen::VectorXd b = Eigen::VectorXd::Zero(h.rows()); for (RcppML::SparseMatrix::InnerIterator it(A, i); it; ++it) b += it.value() * w.col(it.row()); if (L1 != 0) b.array() -= L1; if (!nonneg) h.col(i) = a.llt().solve(b); else c_nnls(a, b, h, i); } } } } void project(const Eigen::MatrixXd& A, const Eigen::MatrixXd& w, Eigen::MatrixXd& h, const bool nonneg, const double L1, const unsigned int threads, const bool mask_zeros) { if (w.rows() == 1) { h.setZero(); double a = 0; for (unsigned int i = 0; i < w.cols(); ++i) a += w(0, i) * w(0, i); for (unsigned int i = 0; i < h.cols(); ++i) { for (int j = 0; j < A.rows(); ++j) h(0, i) += A(j, i) * w(0, j); h(0, i) /= a; } } else if (w.rows() == 2) { Eigen::Matrix2d a = w * w.transpose(); a.diagonal().array() += TINY_NUM; const double denom = a(0, 0) * a(1, 1) - a(0, 1) * a(0, 1); for (unsigned int i = 0; i < h.cols(); ++i) { double b0 = 0, b1 = 0; for (int j = 0; j < A.rows(); ++j) { const double val = A(j, i); b0 += val * w(0, j); b1 += 
val * w(1, j); } nnls2(a, b0, b1, denom, h, i, nonneg); } } else { Eigen::MatrixXd a = w * w.transpose(); a.diagonal().array() += TINY_NUM; Eigen::LLT a_llt = a.llt(); #ifdef _OPENMP #pragma omp parallel for num_threads(threads) schedule(dynamic) #endif for (unsigned int i = 0; i < h.cols(); ++i) { Eigen::VectorXd b = Eigen::VectorXd::Zero(a.rows()); b += w * A.col(i); if (L1 != 0) b.array() -= L1; h.col(i) = a_llt.solve(b); if (nonneg && (h.col(i).array() < 0).any()) { // if unconstrained solution contains negative values b -= a * h.col(i); // update h.col(sample) with NNLS solution by coordinate descent c_nnls(a, b, h, i); } } } } void projectInPlace(RcppML::SparseMatrix& A, const Eigen::MatrixXd& h, Eigen::MatrixXd& w, const bool nonneg, const double L1, const unsigned int threads, const bool mask_zeros) { const unsigned int k = w.rows(); if (k == 1) { w.setZero(); double a = 0; for (unsigned int i = 0; i < h.cols(); ++i) a += h(0, i) * h(0, i); for (unsigned int i = 0; i < h.cols(); ++i) for (RcppML::SparseMatrix::InnerIterator it(A, i); it; ++it) w(0, it.row()) += it.value() * h(0, i); for (unsigned int i = 0; i < w.cols(); ++i) w(0, i) /= a; } else if (k == 2) { Eigen::Matrix2d a = h * h.transpose(); a.diagonal().array() += TINY_NUM; const double denom = a(0, 0) * a(1, 1) - a(0, 1) * a(0, 1); w.setZero(); for (unsigned int i = 0; i < h.cols(); ++i) { for (RcppML::SparseMatrix::InnerIterator it(A, i); it; ++it) for (unsigned int j = 0; j < 2; ++j) w(j, it.row()) += it.value() * h(j, i); } nnls2InPlace(a, denom, w, nonneg); } else { Eigen::MatrixXd a = h * h.transpose(); a.diagonal().array() += TINY_NUM; Eigen::LLT a_llt = a.llt(); #ifdef _OPENMP #pragma omp parallel for num_threads(threads) schedule(dynamic) #endif for (unsigned int i = 0; i < h.cols(); ++i) { for (RcppML::SparseMatrix::InnerIterator it(A, i); it; ++it) for (unsigned int j = 0; j < k; ++j) w(j, it.row()) += it.value() * h(j, i); } if (L1 != 0) w.array() -= L1; #ifdef _OPENMP #pragma omp parallel for num_threads(threads) schedule(dynamic) #endif for (unsigned int i = 0; i < w.cols(); ++i) { Eigen::VectorXd b = w.col(i); w.col(i) = a_llt.solve(b); if (nonneg && (w.col(i).array() < 0).any()) { b -= a * w.col(i); c_nnls(a, b, w, i); } } } } void projectInPlace(const Eigen::MatrixXd& A, const Eigen::MatrixXd& h, Eigen::MatrixXd& w, const bool nonneg, const double L1, const unsigned int threads, const bool mask_zeros) { const unsigned int k = w.rows(); if (k == 1) { w.setZero(); double a = 0; for (unsigned int i = 0; i < h.cols(); ++i) a += h(0, i) * h(0, i); for (unsigned int i = 0; i < h.cols(); ++i) for (int j = 0; j < A.rows(); ++j) w(0, j) += A(j, i) * h(0, i); for (unsigned int i = 0; i < w.cols(); ++i) w(0, i) /= a; } else if (k == 2) { Eigen::Matrix2d a = h * h.transpose(); a.diagonal().array() += TINY_NUM; const double denom = a(0, 0) * a(1, 1) - a(0, 1) * a(0, 1); w.setZero(); for (unsigned int i = 0; i < h.cols(); ++i) { for (int j = 0; j < A.rows(); ++j) for (unsigned int l = 0; l < 2; ++l) w(l, j) += A(j, i) * h(l, i); } nnls2InPlace(a, denom, w, nonneg); } else { Eigen::MatrixXd a = h * h.transpose(); a.diagonal().array() += TINY_NUM; Eigen::LLT a_llt = a.llt(); #ifdef _OPENMP #pragma omp parallel for num_threads(threads) schedule(dynamic) #endif for (unsigned int i = 0; i < h.cols(); ++i) { for (int j = 0; j < A.rows(); ++j) for (unsigned int l = 0; l < k; ++l) w(l, j) += A(j, i) * h(l, i); } if (L1 != 0) w.array() -= L1; #ifdef _OPENMP #pragma omp parallel for num_threads(threads) schedule(dynamic) #endif 
for (unsigned int i = 0; i < w.cols(); ++i) { Eigen::VectorXd b = w.col(i); w.col(i) = a_llt.solve(b); if (nonneg && (w.col(i).array() < 0).any()) { // if unconstrained solution contains negative values b -= a * w.col(i); // update h.col(sample) with NNLS solution by coordinate descent c_nnls(a, b, w, i); } } } } #endif RcppML/inst/include/RcppML/bipartition.hpp0000644000176200001440000002404014122420416020175 0ustar liggesusers// This file is part of RcppML, a Rcpp Machine Learning library // // Copyright (C) 2021 Zach DeBruine // // This source code is subject to the terms of the GNU // Public License v. 2.0. #ifndef RcppML_bipartition #define RcppML_bipartition #ifndef RcppML_common #include #endif #ifndef RcppML_project #include #endif struct bipartitionModel { std::vector v; double dist; unsigned int size1; unsigned int size2; std::vector samples1; std::vector samples2; std::vector center1; std::vector center2; }; // compute cluster centroid given an ipx sparse matrix and samples in the cluster center inline std::vector centroid(RcppML::SparseMatrix& A, const std::vector& samples) { std::vector center(A.rows()); for (unsigned int s = 0; s < samples.size(); ++s) for (RcppML::SparseMatrix::InnerIterator it(A, samples[s]); it; ++it) center[it.row()] += it.value(); for (unsigned int j = 0; j < A.rows(); ++j) center[j] /= samples.size(); return center; } // dense version inline std::vector centroid(const Eigen::MatrixXd& A, const std::vector& samples) { std::vector center(A.rows()); for (unsigned int s = 0; s < samples.size(); ++s) for (int r = 0; r < A.rows(); ++r) center[r] += A(r, samples[s]); for (int j = 0; j < A.rows(); ++j) center[j] /= samples.size(); return center; } // cosine distance of cells in a cluster to assigned cluster center (in_center) vs. 
other cluster center (out_cluster), // divided by the cosine distance to assigned cluster center // // tot_dist is given by sum for all samples of cosine distance to cognate cluster (ci) - distance to non-cognate cluster (cj) // divided by distance to cognate cluster (ci): // cosine dist to c_i, dci = sqrt(x cross c_i) / (sqrt(c_i cross c_i) * sqrt(x cross x)) // cosine dist to c_j, dcj = sqrt(x cross c_j) / (sqrt(c_j cross c_j) * sqrt(x cross x)) // tot_dist = (dci - dcj) / dci // this expression simplifies to 1 - (sqrt(c_j cross x) * sqrt(c_i cross c_i)) / (sqrt(c_i cross x) * sqrt(c_j cross c_j)) inline double rel_cosine(RcppML::SparseMatrix& A, const std::vector& samples1, const std::vector& samples2, const std::vector& center1, const std::vector& center2) { double center1_innerprod = std::sqrt(std::inner_product(center1.begin(), center1.end(), center1.begin(), (double)0)); double center2_innerprod = std::sqrt(std::inner_product(center2.begin(), center2.end(), center2.begin(), (double)0)); double dist1 = 0, dist2 = 0; for (unsigned int s = 0; s < samples1.size(); ++s) { double x1_center1 = 0, x1_center2 = 0; for (RcppML::SparseMatrix::InnerIterator it(A, samples1[s]); it; ++it) { x1_center1 += center1[it.row()] * it.value(); x1_center2 += center2[it.row()] * it.value(); } dist1 += (std::sqrt(x1_center2) * center1_innerprod) / (std::sqrt(x1_center1) * center2_innerprod); } for (unsigned int s = 0; s < samples2.size(); ++s) { double x2_center1 = 0, x2_center2 = 0; for (RcppML::SparseMatrix::InnerIterator it(A, samples2[s]); it; ++it) { x2_center1 += center1[it.row()] * it.value(); x2_center2 += center2[it.row()] * it.value(); } dist2 += (std::sqrt(x2_center1) * center2_innerprod) / (std::sqrt(x2_center2) * center1_innerprod); } return (dist1 + dist2) / (2 * A.rows()); } inline double rel_cosine(const Eigen::MatrixXd& A, const std::vector& samples1, const std::vector& samples2, const std::vector& center1, const std::vector& center2) { double center1_innerprod = std::sqrt(std::inner_product(center1.begin(), center1.end(), center1.begin(), (double)0)); double center2_innerprod = std::sqrt(std::inner_product(center2.begin(), center2.end(), center2.begin(), (double)0)); double dist1 = 0, dist2 = 0; for (unsigned int s = 0; s < samples1.size(); ++s) { double x1_center1 = 0, x1_center2 = 0; for (int r = 0; r < A.rows(); ++r) { x1_center1 += center1[r] * A(r, samples1[s]); x1_center2 += center2[r] * A(r, samples1[s]); } dist1 += (std::sqrt(x1_center2) * center1_innerprod) / (std::sqrt(x1_center1) * center2_innerprod); } for (unsigned int s = 0; s < samples2.size(); ++s) { double x2_center1 = 0, x2_center2 = 0; for (int r = 0; r < A.rows(); ++r) { x2_center1 += center1[r] * A(r, samples2[s]); x2_center2 += center2[r] * A(r, samples2[s]); } dist2 += (std::sqrt(x2_center1) * center2_innerprod) / (std::sqrt(x2_center2) * center1_innerprod); } return (dist1 + dist2) / (2 * A.rows()); } void scale(Eigen::VectorXd& d, Eigen::MatrixXd& w) { d = w.rowwise().sum(); d.array() += TINY_NUM; for (unsigned int i = 0; i < w.rows(); ++i) for (unsigned int j = 0; j < w.cols(); ++j) w(i, j) /= d(i); } inline bipartitionModel c_bipartition_sparse( RcppML::SparseMatrix& A, Eigen::MatrixXd w, const std::vector samples, const double tol, const bool nonneg, const bool calc_dist, const unsigned int maxit, const bool verbose) { // rank-2 nmf Eigen::MatrixXd w_it, h(w.rows(), samples.size()); Eigen::VectorXd d = Eigen::VectorXd::Ones(2); if (verbose) Rprintf("\n%4s | %8s \n---------------\n", "iter", "tol"); double tol_ = 
1; for (unsigned int iter = 0; iter < maxit && tol_ > tol; ++iter) { w_it = w; // update h Eigen::Matrix2d a = w * w.transpose(); double denom = a(0, 0) * a(1, 1) - a(0, 1) * a(0, 1); for (unsigned int i = 0; i < h.cols(); ++i) { double b0 = 0, b1 = 0; for (RcppML::SparseMatrix::InnerIterator it(A, samples[i]); it; ++it) { const double val = it.value(); const unsigned int r = it.row(); b0 += val * w(0, r); b1 += val * w(1, r); } nnls2(a, b0, b1, denom, h, i, nonneg); } scale(d, h); // update w a = h * h.transpose(); denom = a(0, 0) * a(1, 1) - a(0, 1) * a(0, 1); w.setZero(); for (unsigned int i = 0; i < h.cols(); ++i) { for (RcppML::SparseMatrix::InnerIterator it(A, samples[i]); it; ++it) for (unsigned int j = 0; j < 2; ++j) w(j, it.row()) += it.value() * h(j, i); } nnls2InPlace(a, denom, w, nonneg); scale(d, w); tol_ = cor(w, w_it); if (verbose) Rprintf("%4d | %8.2e\n", iter + 1, tol_); } // calculate bipartitioning vector unsigned int size1 = 0, size2 = 0; std::vector v(h.cols()), center1(w.cols()), center2(w.cols()); if (d(0) > d(1)) { for (unsigned int j = 0; j < h.cols(); ++j) { v[j] = h(0, j) - h(1, j); v[j] > 0 ? ++size1 : ++size2; } } else { for (unsigned int j = 0; j < h.cols(); ++j) { v[j] = h(1, j) - h(0, j); v[j] > 0 ? ++size1 : ++size2; } } std::vector samples1(size1), samples2(size2); double dist = -1; // get indices of samples in both clusters unsigned int s1 = 0, s2 = 0; for (unsigned int j = 0; j < h.cols(); ++j) { if (v[j] > 0) { samples1[s1] = samples[j]; ++s1; } else { samples2[s2] = samples[j]; ++s2; } } if (calc_dist) { // calculate the centers of both clusters center1 = centroid(A, samples1); center2 = centroid(A, samples2); // calculate relative cosine similarity of all samples to ((assigned - other) / assigned) cluster dist = rel_cosine(A, samples1, samples2, center1, center2); } return bipartitionModel{ v, dist, size1, size2, samples1, samples2, center1, center2 }; } inline bipartitionModel c_bipartition_dense( const Eigen::MatrixXd& A, Eigen::MatrixXd w, const std::vector samples, const double tol, const bool nonneg, const bool calc_dist, const unsigned int maxit, const bool verbose) { // rank-2 nmf Eigen::MatrixXd w_it, h(w.rows(), samples.size()); Eigen::VectorXd d = Eigen::VectorXd::Ones(2); if (verbose) Rprintf("\n%4s | %8s \n---------------\n", "iter", "tol"); double tol_ = 1; for (unsigned int iter = 0; iter < maxit && tol_ > tol; ++iter) { w_it = w; // update h Eigen::Matrix2d a = w * w.transpose(); double denom = a(0, 0) * a(1, 1) - a(0, 1) * a(0, 1); for (unsigned int i = 0; i < h.cols(); ++i) { double b0 = 0, b1 = 0; for (int j = 0; j < A.rows(); ++j) { const double val = A(j, samples[i]); b0 += val * w(0, j); b1 += val * w(1, j); } nnls2(a, b0, b1, denom, h, i, nonneg); } scale(d, h); // update w a = h * h.transpose(); denom = a(0, 0) * a(1, 1) - a(0, 1) * a(0, 1); w.setZero(); for (unsigned int i = 0; i < h.cols(); ++i) { for (int j = 0; j < A.rows(); ++j) for (unsigned int l = 0; l < 2; ++l) w(l, j) += A(j, samples[i]) * h(l, i); } nnls2InPlace(a, denom, w, nonneg); scale(d, w); tol_ = cor(w, w_it); if (verbose) Rprintf("%4d | %8.2e\n", iter + 1, tol_); } // calculate bipartitioning vector unsigned int size1 = 0, size2 = 0; std::vector v(h.cols()), center1(w.cols()), center2(w.cols()); if (d(0) > d(1)) { for (unsigned int j = 0; j < h.cols(); ++j) { v[j] = h(0, j) - h(1, j); v[j] > 0 ? ++size1 : ++size2; } } else { for (unsigned int j = 0; j < h.cols(); ++j) { v[j] = h(1, j) - h(0, j); v[j] > 0 ? 
++size1 : ++size2; } } std::vector samples1(size1), samples2(size2); double dist = -1; // get indices of samples in both clusters unsigned int s1 = 0, s2 = 0; for (unsigned int j = 0; j < h.cols(); ++j) { if (v[j] > 0) { samples1[s1] = samples[j]; ++s1; } else { samples2[s2] = samples[j]; ++s2; } } if (calc_dist) { // calculate the centers of both clusters center1 = centroid(A, samples1); center2 = centroid(A, samples2); // calculate relative cosine similarity of all samples to ((assigned - other) / assigned) cluster dist = rel_cosine(A, samples1, samples2, center1, center2); } return bipartitionModel{ v, dist, size1, size2, samples1, samples2, center1, center2 }; } #endif RcppML/inst/include/RcppML/nmf.hpp0000644000176200001440000002111314122420475016434 0ustar liggesusers// This file is part of RcppML, a Rcpp Machine Learning library // // Copyright (C) 2021 Zach DeBruine // // This source code is subject to the terms of the GNU // Public License v. 2.0. #ifndef RcppML_nmf #define RcppML_nmf #ifndef RcppML_common #include #endif #ifndef RcppML_project #include #endif inline bool is_appx_symmetric(RcppML::SparseMatrix& A) { if (A.rows() == A.cols()) { RcppML::SparseMatrix::InnerIterator col_it(A, 0); RcppML::SparseMatrix::InnerRowIterator row_it(A, 0); while (++col_it && ++row_it) if (col_it.value() != row_it.value()) return false; return true; } else return false; } inline bool is_appx_symmetric(Eigen::MatrixXd& A) { if (A.rows() == A.cols()) { for (int i = 0; i < A.cols(); ++i) if (A(i, 0) != A(0, i)) return false; return true; } else return false; } namespace RcppML { class MatrixFactorization { public: Eigen::MatrixXd w; Eigen::VectorXd d; Eigen::MatrixXd h; double tol_ = -1; unsigned int iter_ = 0; bool nonneg = true, updateInPlace = false, diag = true, verbose = true, mask_zeros = true; double L1_w = 0, L1_h = 0, tol = 1e-4; unsigned int maxit = 100, threads = 0; MatrixFactorization(const unsigned int k, const unsigned int nrow, const unsigned int ncol, const unsigned int seed = 0) { w = randomMatrix(k, nrow, seed); h = Eigen::MatrixXd(k, ncol); d = Eigen::VectorXd::Ones(k); } MatrixFactorization(Eigen::MatrixXd& w, Eigen::VectorXd& d, Eigen::MatrixXd& h) : w(w), d(d), h(h) { if (w.rows() != h.rows()) Rcpp::stop("number of rows in 'w' and 'h' are not equal!"); if (d.size() != w.rows()) Rcpp::stop("length of 'd' is not equal to number of rows in 'w' and 'h'"); } Eigen::MatrixXd matrixW() { return w; } Eigen::VectorXd vectorD() { return d; } Eigen::MatrixXd matrixH() { return h; } double fit_tol() { return tol_; } unsigned int fit_iter() { return iter_; } // update "h" void projectH(RcppML::SparseMatrix& A) { project(A, w, h, nonneg, L1_h, threads, mask_zeros); } void projectH(Eigen::MatrixXd& A) { project(A, w, h, nonneg, L1_h, threads, mask_zeros); } // update "w" void projectW(RcppML::SparseMatrix& A) { if (is_appx_symmetric(A)) project(A, h, w, nonneg, L1_w, threads, mask_zeros); else projectInPlace(A, h, w, nonneg, L1_w, threads, mask_zeros); } void projectW(Eigen::MatrixXd& A) { if (is_appx_symmetric(A)) project(A, h, w, nonneg, L1_w, threads, mask_zeros); else projectInPlace(A, h, w, nonneg, L1_w, threads, mask_zeros); } double mse(RcppML::SparseMatrix& A) { Eigen::MatrixXd w0 = w.transpose(); // multiply w by diagonal for (unsigned int i = 0; i < w0.cols(); ++i) for (unsigned int j = 0; j < w0.rows(); ++j) w0(j, i) *= d(i); // calculate total loss with parallelization across samples Eigen::VectorXd losses(h.cols()); losses.setZero(); double sq_loss = 0; if (!mask_zeros) { 
#ifdef _OPENMP #pragma omp parallel for num_threads(threads) schedule(dynamic) #endif for (unsigned int i = 0; i < h.cols(); ++i) { Eigen::VectorXd wh_i = w0 * h.col(i); for (RcppML::SparseMatrix::InnerIterator iter(A, i); iter; ++iter) wh_i(iter.row()) -= iter.value(); sq_loss += wh_i.array().square().sum(); } } else { #ifdef _OPENMP #pragma omp parallel for num_threads(threads) schedule(dynamic) #endif for (unsigned int i = 0; i < h.cols(); ++i) { Eigen::VectorXd wh_i = w0 * h.col(i); for (RcppML::SparseMatrix::InnerIterator iter(A, i); iter; ++iter) sq_loss += std::pow(wh_i(iter.row()) - iter.value(), 2); } } return sq_loss / (h.cols() * w.cols()); } double mse(Eigen::MatrixXd& A) { if(mask_zeros) Rcpp::stop("mask_zeros = TRUE is not supported for mse(Eigen::MatrixXd)"); Eigen::MatrixXd w0 = w.transpose(); for (unsigned int i = 0; i < w0.cols(); ++i) for (unsigned int j = 0; j < w0.rows(); ++j) w0(j, i) *= d(i); Eigen::VectorXd losses(h.cols()); losses.setZero(); #ifdef _OPENMP #pragma omp parallel for num_threads(threads) schedule(dynamic) #endif for (unsigned int i = 0; i < h.cols(); ++i) { Eigen::VectorXd wh_i = w0 * h.col(i); for (int j = 0; j < A.rows(); ++j) wh_i(j) -= A(j, i); for (unsigned int j = 0; j < wh_i.size(); ++j) losses(i) += std::pow(wh_i(j), 2); } return losses.sum() / (h.cols() * w.cols()); } void sortByDiagonal() { if (w.rows() == 2 && d(0) < d(1)) { w.row(1).swap(w.row(0)); h.row(1).swap(h.row(0)); const double d1 = d(1); d(1) = d(0); d(0) = d1; } else if (w.rows() > 2) { std::vector indx = sort_index(d); w = reorder_rows(w, indx); d = reorder(d, indx); h = reorder_rows(h, indx); } } // scale rows in "w" to sum to 1, where "d" is rowsums of "w" void scaleW() { d = w.rowwise().sum(); d.array() += TINY_NUM; for (unsigned int i = 0; i < w.rows(); ++i) for (unsigned int j = 0; j < w.cols(); ++j) w(i, j) /= d(i); } void scaleH() { d = h.rowwise().sum(); d.array() += TINY_NUM; for (unsigned int i = 0; i < h.rows(); ++i) for (unsigned int j = 0; j < h.cols(); ++j) h(i, j) /= d(i); } // fit the model by alternating least squares projections void fit(RcppML::SparseMatrix& A) { if(mask_zeros && updateInPlace) { Rcpp::warning("'mask_zeros = TRUE' is not supported when 'updateInPlace = true'. 
Setting 'updateInPlace = false'"); updateInPlace = false; } else if(mask_zeros && w.rows() < 3) Rcpp::stop("'mask_zeros = TRUE' is not supported when k = 1 or 2"); if (verbose) Rprintf("\n%4s | %8s \n---------------\n", "iter", "tol"); RcppML::SparseMatrix At; bool is_A_symmetric = is_appx_symmetric(A); if (!is_A_symmetric && !updateInPlace) At = A.t(); for (; iter_ < maxit; ++iter_) { // alternating least squares updates Eigen::MatrixXd w_it = w; // update "h" project(A, w, h, nonneg, L1_h, threads, mask_zeros); if (diag) scaleH(); // reset diagonal and scale "h" // update "w" if (is_A_symmetric) project(A, h, w, nonneg, L1_w, threads, mask_zeros); else if (updateInPlace) projectInPlace(A, h, w, nonneg, L1_w, threads, mask_zeros); else project(At, h, w, nonneg, L1_w, threads, mask_zeros); if (diag) scaleW(); // reset diagonal and scale "w" tol_ = cor(w, w_it); // correlation between "w" across consecutive iterations if (verbose) Rprintf("%4d | %8.2e\n", iter_ + 1, tol_); if (tol_ < tol) break; Rcpp::checkUserInterrupt(); } if (tol_ > tol && iter_ == maxit && verbose) Rprintf("\n convergence not reached in %d iterations\n (actual tol = %4.2e, target tol = %4.2e)", iter_, tol_, tol); if (diag) sortByDiagonal(); } // fit the model by alternating least squares projections void fit(Eigen::MatrixXd& A) { if(mask_zeros) Rcpp::stop("'mask_zeros = TRUE' is not supported for fit(Eigen::MatrixXd)"); if (verbose) Rprintf("\n%4s | %8s \n---------------\n", "iter", "tol"); Eigen::MatrixXd At; bool is_A_symmetric = is_appx_symmetric(A); if (!is_A_symmetric && !updateInPlace) At = A.transpose(); for (; iter_ < maxit; ++iter_) { // alternating least squares updates Eigen::MatrixXd w_it = w; // update "h" project(A, w, h, nonneg, L1_h, threads, mask_zeros); if (diag) scaleH(); // reset diagonal and scale "h" // update "w" if (is_A_symmetric) project(A, h, w, nonneg, L1_w, threads, mask_zeros); else if (updateInPlace) projectInPlace(A, h, w, nonneg, L1_w, threads, mask_zeros); else project(At, h, w, nonneg, L1_w, threads, mask_zeros); if (diag) scaleW(); // reset diagonal and scale "w" tol_ = cor(w, w_it); // correlation between "w" across consecutive iterations if (verbose) Rprintf("%4d | %8.2e\n", iter_ + 1, tol_); if (tol_ < tol) break; Rcpp::checkUserInterrupt(); } if (tol_ > tol && iter_ == maxit && verbose) Rprintf("\n convergence not reached in %d iterations\n (actual tol = %4.2e, target tol = %4.2e)", iter_, tol_, tol); if (diag) sortByDiagonal(); } }; } #endif RcppML/inst/include/RcppML/bits.hpp0000644000176200001440000001210714111717072016620 0ustar liggesusers// This file is part of RcppML, a Rcpp Machine Learning library // // Copyright (C) 2021 Zach DeBruine // // This source code is subject to the terms of the GNU // Public License v. 2.0. 
#ifndef RcppML_bits #define RcppML_bits #ifndef RcppML_common #include #endif // these functions for matrix subsetting are documented here: // http://eigen.tuxfamily.org/dox-devel/TopicCustomizing_NullaryExpr.html#title1 // official support will likely appear in Eigen 4.0, this is a patch in the meantime template class indexing_functor { const ArgType& m_arg; const RowIndexType& m_rowIndices; const ColIndexType& m_colIndices; public: typedef Eigen::Matrix MatrixType; indexing_functor(const ArgType& arg, const RowIndexType& row_indices, const ColIndexType& col_indices) : m_arg(arg), m_rowIndices(row_indices), m_colIndices(col_indices) {} const typename ArgType::Scalar& operator() (Eigen::Index row, Eigen::Index col) const { return m_arg(m_rowIndices[row], m_colIndices[col]); } }; template Eigen::CwiseNullaryOp, typename indexing_functor::MatrixType> submat(const Eigen::MatrixBase& arg, const RowIndexType& row_indices, const ColIndexType& col_indices) { typedef indexing_functor Func; typedef typename Func::MatrixType MatrixType; return MatrixType::NullaryExpr(row_indices.size(), col_indices.size(), Func(arg.derived(), row_indices, col_indices)); } Eigen::MatrixXd submat(const Eigen::MatrixXd& x, const Eigen::VectorXi& col_indices){ Eigen::MatrixXd x_(x.rows(), col_indices.size()); for(unsigned int i = 0; i < col_indices.size(); ++i) x_.col(i) = x.col(col_indices(i)); return x_; } inline Eigen::VectorXd subvec(const Eigen::MatrixXd& b, const Eigen::VectorXi& ind, const unsigned int col) { Eigen::VectorXd bsub(ind.size()); for (unsigned int i = 0; i < ind.size(); ++i) bsub(i) = b(ind(i), col); return bsub; } inline Eigen::VectorXi find_gtz(const Eigen::MatrixXd& x, const unsigned int col) { unsigned int n_gtz = 0; for (unsigned int i = 0; i < x.rows(); ++i) if (x(i, col) > 0) ++n_gtz; Eigen::VectorXi gtz(n_gtz); unsigned int j = 0; for (unsigned int i = 0; i < x.rows(); ++i) { if (x(i, col) > 0) { gtz(j) = i; ++j; } } return gtz; } // Pearson correlation between two matrices inline double cor(Eigen::MatrixXd& x, Eigen::MatrixXd& y) { double x_i, y_i, sum_x = 0, sum_y = 0, sum_xy = 0, sum_x2 = 0, sum_y2 = 0; const unsigned int n = x.size(); for (unsigned int i = 0; i < n; ++i) { x_i = (*(x.data() + i)); y_i = (*(y.data() + i)); sum_x += x_i; sum_y += y_i; sum_xy += x_i * y_i; sum_x2 += x_i * x_i; sum_y2 += y_i * y_i; } return 1 - (n * sum_xy - sum_x * sum_y) / std::sqrt((n * sum_x2 - sum_x * sum_x) * (n * sum_y2 - sum_y * sum_y)); } // calculate sort index of vector "d" in decreasing order inline std::vector sort_index(const Eigen::VectorXd& d) { std::vector idx(d.size()); std::iota(idx.begin(), idx.end(), 0); sort(idx.begin(), idx.end(), [&d](size_t i1, size_t i2) {return d[i1] > d[i2];}); return idx; } // reorder rows in dynamic matrix "x" by integer vector "ind" inline Eigen::MatrixXd reorder_rows(const Eigen::MatrixXd& x, const std::vector& ind) { Eigen::MatrixXd x_reordered(x.rows(), x.cols()); for (unsigned int i = 0; i < ind.size(); ++i) x_reordered.row(i) = x.row(ind[i]); return x_reordered; } // reorder elements in vector "x" by integer vector "ind" inline Eigen::VectorXd reorder(const Eigen::VectorXd& x, const std::vector& ind) { Eigen::VectorXd x_reordered(x.size()); for (unsigned int i = 0; i < ind.size(); ++i) x_reordered(i) = x(ind[i]); return x_reordered; } std::vector getRandomValues(const unsigned int len, const unsigned int seed){ if(seed > 0){ Rcpp::Environment base_env("package:base"); Rcpp::Function set_seed_r = base_env["set.seed"]; set_seed_r(std::floor(std::fabs(seed))); 
} Rcpp::NumericVector R_RNG_random_values = Rcpp::runif(len); std::vector random_values = Rcpp::as >(R_RNG_random_values); return random_values; } Eigen::MatrixXd randomMatrix(const unsigned int nrow, const unsigned int ncol, const unsigned int seed){ std::vector random_values = getRandomValues(nrow * ncol, seed); Eigen::MatrixXd x(nrow, ncol); unsigned int indx = 0; for(unsigned int r = 0; r < nrow; ++r) for(unsigned int c = 0; c < ncol; ++c, ++indx) x(r, c) = random_values[indx]; return x; } #endif