maotai/0000755000176200001440000000000014760112542011526 5ustar liggesusersmaotai/tests/0000755000176200001440000000000014753205125012672 5ustar liggesusersmaotai/tests/testthat/0000755000176200001440000000000014760112542014530 5ustar liggesusersmaotai/tests/testthat/test-weiszfeld.R0000644000176200001440000000036414753205125017631 0ustar liggesuserstest_that("weiszfeld", { n = 20 p = 10 t = seq(from=0,to=10,length.out=p) X = array(0,c(n,p)) for (i in 1:n){ X[i,] = sin(t) + stats::rnorm(p, sd=0.5) } vecL1 = as.vector(weiszfeld(X)) expect_equal(length(vecL1), p) }) maotai/tests/testthat/test-cov2funcs.R0000644000176200001440000000064714753205125017551 0ustar liggesuserslibrary(maotai) test_that("cov2corr works", { myp = 5 myn = 50 prep_mat = stats::cov(matrix(rnorm(myn*myp), ncol=myp)) vec_corr = diag(maotai::cov2corr(prep_mat)) expect_equal(vec_corr, rep(1, myp)) }) test_that("cov2pcorr works", { myp = 5 myn = 50 prep_mat = stats::cov(matrix(rnorm(myn*myp), ncol=myp)) vec_pcor = diag(maotai::cov2pcorr(prep_mat)) expect_equal(vec_pcor, rep(1, myp)) })maotai/tests/testthat/test-tSNE.R0000644000176200001440000000050014753205125016436 0ustar liggesuserstest_that("tSNE", { myn = 10 myp = 3 prep_mat = rbind(matrix(rnorm(myn*myp, mean=-2), ncol=myp), matrix(rnorm(myn*myp, mean= 2), ncol=myp)) run_tsne = maotai::tsne(prep_mat, ndim=2, perplexity=5)$embed expect_equal(base::nrow(run_tsne), 2*myn) expect_equal(base::ncol(run_tsne), 2) }) maotai/tests/testthat/test-bmds.R0000644000176200001440000000065114753205125016561 0ustar liggesuserslibrary(maotai) test_that("Bayesian MDS", { myn = 10 myp = 3 prep_mat = rbind(matrix(rnorm(myn*myp, mean=-2), ncol=myp), matrix(rnorm(myn*myp, mean= 2), ncol=myp)) iris2d = maotai::bmds(prep_mat, ndim=2, mc.iter=2, par.step=4, verbose=FALSE) n_rows = base::nrow(iris2d$embed) n_cols = base::ncol(iris2d$embed) expect_equal(n_rows, base::nrow(prep_mat)) expect_equal(n_cols, 2) }) maotai/tests/testthat.R0000644000176200001440000000007014753205125014652 0ustar liggesuserslibrary(testthat) library(maotai) test_check("maotai") maotai/MD50000644000176200001440000001321614760112542012041 0ustar liggesusers707b9a96120606edd0379ddcf130e253 *DESCRIPTION f142f1787fbd58ec16564135c168b659 *LICENSE cb038a63e4d9f51e5724d4c039a1f58e *NAMESPACE e05526acb2f07e08c4c321bf1f92ebd2 *NEWS.md 65776d8d3abd01020dd4a3b1f1e31ae1 *R/RcppExports.R ed91d7038f764a8874f9edc8712a23a4 *R/WLbarycenter.R 251f69d40ea347f8a8519120f492deaf *R/WLpdist.R 51bfb78b30af308931573295bc3498c7 *R/aux_checkers.R 27d726ee03c804e21875eaad5387508f *R/aux_computation.R 5eee2ac697e448495d5e73c97af8336d *R/aux_ecdf.R 67a3002cea011542f3d77b6fe93bae4c *R/aux_hidden_compute.R 525088b01c106f67034ba7024a77444f *R/aux_hidden_dist.R 7107f2e8aac0948395e3cf7627851327 *R/aux_hidden_operation.R eb3423edb2f8e2c7c68a191caa4196d7 *R/bmds.R f4ccc6e3cd6e44edeaeb7dc044e9dbd1 *R/boot.mblock.R c37c6ebf43436e87258d9ec929a4361e *R/boot.stationary.R d2813838bd11ac4df10a6bd6b7cb47d4 *R/cayleymenger.R 004adc13642858eb7e4f0725dd7b8734 *R/checkdist.R 358f16e5d5d6fd2ee363f958003a479f *R/checkmetric.R f0ec222883b44112c09e5ecba47f28ef *R/cmds.R 656fe8643f2c7d4647bf214a81a0db4d *R/cov2corr.R ccebef2c3c8d41131cad0db572c4c9ca *R/cov2pcorr.R 2ffe229f65ceec29d60596c502c6d575 *R/dpmeans.R 779736fd573a442619505a4673d273d4 *R/ecdfdist.R 7a60bf389a581a6e493b875a444b1d59 *R/ecdfdist2.R 7b8f309e06853a3703546f2da84a6798 *R/epmeans.R 5ca02918b848d8d82654cc863923a505 *R/kmeanspp.R 85a577726c1096b73895a5a090e9a0a2 *R/later_findmed.R 65dabf4d39adb27399189e81e826ec17 *R/later_install_scipy.R addeab0b35666f835e6da843ba31281e *R/later_matrix1F1.R 78eeb5fb29b95d8cbe94996406e1cca6 *R/later_matrix2F1.R 9170ad32f09650ce40fa9beadfca785e *R/later_mbeta.R b99168f9714d638fe2600c581ab1299f *R/later_mgamma.R cde19e7f011412372fe3099979763056 *R/later_scalar0F1.R 4e9c65415524f68e5426991d1cb10bdb *R/later_scalar1F1.R 4a5fa8fee3aae16ba5e8c3fafcd5beb1 *R/later_scalar2F1.R 30a56bdbdc6cd8df6e8631fee34a579b *R/later_scalarpFq.R 3b5afe7d2b7beaeb923961699b7ba038 *R/lgpa.R 1b43b5efa6455506f7572fabba2b1ce8 *R/lyapunov.R cfd5dc66dadca55e2128d416b792b726 *R/matderiv.R e2bff016d42f72ec0140f9e7239f973a *R/metricdepth.R f00182fcdb1cbe15777449982f046573 *R/mmd2test.R 8e9324073bec48f3c12763026ee6edf0 *R/movMF_convert.R e9cc1011708fe02fc1475a414604dec0 *R/movMF_info.R ea3c0c4cbd88baed36d04c3889e9af6b *R/movMF_reduce_greedy.R 546cb96ad98d602c9bd052b7d1e2c0f2 *R/movMF_reduce_partitional.R 236a449d527f2f4824087132344a76b2 *R/nef.R b47c9df2c3934a39455f77d302a322db *R/nem.R 218d035618ee31c5daf6ba5130689f66 *R/obsolete_distgmm.R c6bf08a86dd291c64f45d4a2a1702c09 *R/package-maotai.R b82737ec632f146f42893073add7a750 *R/pdeterminant.R 2edf7c839c76a3abf7a8196d5f8bf181 *R/rotationS2.R abb5665931afe18e0f308265c003785e *R/shortestpath.R 96101622e14735fd82afcb26a00e138e *R/sylvester.R e93a665227b80a5161bfe6271cbcb82b *R/trio.R 49d67fddf4878ec77bb043d07bc73b3b *R/tsne.R 65d2e578e49884d4882fc4a561dd37f8 *R/weiszfeld.R 4d5cf1d36ccb02606f5d472714b84d23 *R/zzz.R d4fc0cda9e3e9b4d1e55dbfca80afb2e *README.md 7fff6d82fba9dc387f6e0f1f13af5b8e *build/partial.rdb a2ca9a83ec97f106abee8ec85cc1f5d4 *inst/REFERENCES.bib 8e509cf33257c8230ae3776cc1249a45 *inst/include/maoarma.hpp 17d403e1938655b6eb327af026e91db3 *inst/include/procedures/maoarma_basic.h ab59060da621a19ae1d030accdbe0218 *man/WLbarycenter.Rd 5dc8ba18dd5b3bc8aabfdf40a3bf28b0 *man/WLpdist.Rd 263effb529a3aa96fcea82323c3dfc1f *man/bmds.Rd 2c9d5c0e1a1cc0fae2a0a9c274849074 *man/boot.mblock.Rd 1aa10c2aa5b9c58dc1ba9abfb0c3e438 *man/boot.stationary.Rd 14bd1ae1842487a6ed976be18cffb7d5 *man/cayleymenger.Rd 685627ad39a9ccef8165b906c9db7667 *man/checkdist.Rd 83e44f9bfbea73ba0d84f714d8e5012e *man/checkmetric.Rd e247b900a16d3dadb6e2498d07f74bcc *man/cmds.Rd 2e8cdee0902d41df71a6c4923e8afe3a *man/cov2corr.Rd 9e53ef1a9b727248862c7cc7fdd88f25 *man/cov2pcorr.Rd 1f765107f8927d308fdae227f607e85a *man/dpmeans.Rd 2d49541ed6123690eb30f716826b4796 *man/ecdfdist.Rd fff165ab33a4711115041b60ec09a38f *man/ecdfdist2.Rd 1cdbc976bd38f4b0db2fb1e94a0b3950 *man/epmeans.Rd bff8d60ac1752ac074cb738cb8015aea *man/kmeanspp.Rd 9f1edeb021a361539bc43c4a0078b73f *man/lgpa.Rd 591c77266f9bc75e011408faee4d8b3c *man/lyapunov.Rd 1bdc558932a70f86d93952e5ec2b4f6e *man/matderiv.Rd 0981dd4c5cc1cce7928e9dd64da8b6ac *man/metricdepth.Rd de4bdeb25ad50f6a005871028dd23975 *man/mmd2test.Rd c93dadab3d1d3b589da3226761ec1fad *man/movMF_convert.Rd cc04844bf40ed6e855055d44048441c6 *man/movMF_info.Rd f2f321f7ae5a8f2c6b98247787ae720b *man/movMF_reduce_greedy.Rd 70650cc2ea5eb564f9523a04f6ceb2c5 *man/movMF_reduce_partitional.Rd bb7b9dd89e8cbd62aa2619c0370b1273 *man/nef.Rd 2103286fdc3361b142ce0c88a3750208 *man/nem.Rd f8c60820540144bb5fc88010a0b84e27 *man/pdeterminant.Rd d86db4835b6bdbb11daae1ea99f7dbad *man/rotationS2.Rd a3a8427367e5eb42ab13a66de38218f3 *man/shortestpath.Rd eac282286a43c132dc383b09cf500618 *man/sylvester.Rd 58d05dbbee3cdbd39ad4fdbcc9a7fa33 *man/trio.Rd b6cc5266dcdca322917e361fa2903861 *man/tsne.Rd 6c6d4ef21e830de4144f2cf078a25b07 *man/weiszfeld.Rd e62585629948e7dd143efbad43c2a6eb *src/Makevars e62585629948e7dd143efbad43c2a6eb *src/Makevars.win ae9890c6ec76e8bee6db609245995f11 *src/RcppExports.cpp 543679621f60ce316a64f25819c5225f *src/cpp_WLvMF.cpp 927e5b5be0b72a5ae1d9c549a6a3891c *src/cpp_bmds.cpp 35f104021bce4cac1ce627776766e49b *src/cpp_casket.cpp d306f032a33099427a615c852818bf2a *src/cpp_mmds.cpp 1c569eacbf94379171cc9bb3d6d9ea9f *src/cpp_smacof.cpp 0d286353a8df80d22906beae5d9ecf23 *src/evaluations.cpp 1054100501c3b4e5c1bf8d7294cf908c *src/evaluations.h af4a1f1a1f990bd17dd1d94f223724dd *src/src_computations.cpp f77850c719512917a5c688cbb82027f8 *tests/testthat.R 0398706507f0a43e73b6aa57c54c6139 *tests/testthat/test-bmds.R b2fefd79bc23ff301d3d2b68783c7e09 *tests/testthat/test-cov2funcs.R 9764edc0b140589afab101d0f94b0627 *tests/testthat/test-tSNE.R 656f20c84d6c72b4c4e34ae1c75359e5 *tests/testthat/test-weiszfeld.R maotai/R/0000755000176200001440000000000014757667140011746 5ustar liggesusersmaotai/R/package-maotai.R0000644000176200001440000000166214754242446014734 0ustar liggesusers#' Tools for Matrix Algebra, Optimization and Inference #' #' A matrix is an universal and sometimes primary object/unit in applied mathematics and statistics. We provide a number of algorithms for selected problems in optimization and statistical inference. #' #' @keywords internal #' @name package-maotai #' @import Rdpack #' @noRd #' @importFrom dbscan dbscan #' @importFrom fastcluster hclust #' @importFrom RANN nn2 #' @importFrom cluster pam silhouette #' @importFrom stats as.dist knots ecdf rnorm runif quantile dist rgamma rgeom var cov lm #' @importFrom shapes procGPA #' @importFrom Rtsne Rtsne #' @importFrom pracma cross #' @importFrom utils packageVersion #' @importFrom RSpectra eigs #' @importFrom Matrix rankMatrix #' @importFrom Rcpp evalCpp #' @useDynLib maotai "_PACKAGE" # pack <- "maotai" # path <- find.package(pack) # system(paste(shQuote(file.path(R.home("bin"), "R")), # "CMD", "Rd2pdf", shQuote(path))) maotai/R/cov2corr.R0000644000176200001440000000274714753205125013625 0ustar liggesusers#' Convert Covariance into Correlation Matrix #' #' Given a covariance matrix, return a correlation matrix that has unit diagonals. #' We strictly impose (and check) whether the given input is a symmetric matrix #' of full-rank. #' #' @param mat a \eqn{(p\times p)} covariance matrix. #' #' @return a \eqn{(p\times p)} correlation matrix. #' #' @examples #' \donttest{ #' # generate an empirical covariance scaled #' prep_mat = stats::cov(matrix(rnorm(100*10),ncol=10)) #' prep_vec = diag(as.vector(stats::runif(10, max=5))) #' prep_cov = prep_vec%*%prep_mat%*%prep_vec #' #' # compute correlation matrix #' prep_cor = cov2corr(prep_cov) #' #' # visualize #' opar <- par(no.readonly=TRUE) #' par(mfrow=c(1,2), pty="s") #' image(prep_cov, axes=FALSE, main="covariance") #' image(prep_cor, axes=FALSE, main="correlation") #' par(opar) #' } #' #' @export cov2corr <- function(mat){ # checker if (!check_covariance(mat)){ stop("* cov2corr : an input 'mat' is not a valid covariance matrix.") } dvec = diag(1/sqrt(diag(mat))) return(dvec%*%mat%*%dvec) } # checker ----------------------------------------------------------------- #' @keywords internal #' @noRd check_covariance <- function(mat){ # matrix if (!is.matrix(mat)){ return(FALSE) } # symmetric if (!isSymmetric(mat)){ return(FALSE) } # all positive diagonal if (any(diag(mat)<=0)){ return(FALSE) } if (as.integer(Matrix::rankMatrix(mat)) < base::nrow(mat)){ return(FALSE) } return(TRUE) } maotai/R/movMF_convert.R0000644000176200001440000000170614757654200014652 0ustar liggesusers#' Convert 'movMF' object #' #' Given an output from the movMF package's movMF function, #' convert them into the standard mixture parameter format. #' #' @param movMF_object a movMF object of \eqn{K} components in \eqn{d} dimensions. #' #' @return a named list containing \describe{ #' \item{means}{a \eqn{(K \times d)} matrix of means} #' \item{concentrations}{a \eqn{K} vector of concentrations} #' \item{weights}{a \eqn{K} vector of weights} #' } #' #' @export movMF_convert <- function(movMF_object){ ############################################### # Preprocessing if (!inherits(movMF_object, "movMF")){ stop("* movMF_convert : Input object is not a 'movMF' object") } ############################################### # Change output = list() output$means <- movMF_object$theta/sqrt(rowSums(movMF_object$theta^2)) output$concentrations <- sqrt(rowSums(movMF_object$theta^2)) output$weights <- movMF_object$alpha return(output) } maotai/R/movMF_reduce_greedy.R0000644000176200001440000001117614757665652016017 0ustar liggesusers#' von Mises-Fisher mixture model reduction - Greedy method #' #' When given parameters of the von Mises-Fisher mixture model, this function #' aims at mixture model reduction using a greedy method. #' #' @param means a \eqn{(K \times p)} matrix of means of the von Mises-Fisher components. #' @param concentrations a \eqn{K} vector of concentrations of the von Mises-Fisher components. #' @param weights a \eqn{K} vector of weights of the von Mises-Fisher components. #' @param target.num a desired number of components after reduction. Default is 2. #' #' @return a named list of the reduced mixture model containing \describe{ #' \item{means}{a \eqn{(\code{target.num} \times p)} matrix of means of the von Mises-Fisher components.} #' \item{concentrations}{a \eqn{\code{target.num}} vector of concentrations of the von Mises-Fisher components.} #' \item{weights}{a \eqn{\code{target.num}} vector of weights of the von Mises-Fisher components.} #' } #' @export movMF_reduce_greedy <- function(means, concentrations, weights, target.num=2){ ############################################### # Preprocessing # means if (!is.matrix(means)){ data_means = cpp_WL_normalise(as.matrix(means)) } else { data_means = cpp_WL_normalise(means) } # concentrations data_concentrations = as.vector(concentrations) if (length(data_concentrations)!=base::nrow(data_means)){ stop("* movMF_reduce_greedy : cardinalities of the means and concentrations do not match.") } # weights if ((length(weights)==0)&&(is.null(weights))){ data_weights = rep(1/length(data_concentrations), length(data_concentrations)) } else { data_weights = as.vector(weights) data_weights = data_weights/base::sum(data_weights) } if (any(data_weights < .Machine$double.eps)||(length(data_weights)!=length(data_concentrations))){ stop("* movMF_reduce_greedy : invalid 'weights'. Please see the documentation.") } # desired number K = length(data_weights) par_target_num = round(target.num) if ((par_target_num <= 1)||(par_target_num >= K)){ stop("* movMF_reduce_greedy : 'target.num' must be greater than 1 and less than the number of components.") } ############################################### # MAIN ROUTINE old_vmf <- list(means=data_means, concentrations=data_concentrations, weights=data_weights) while (length(old_vmf$weights) > par_target_num){ # compute the pairwise distances dists = maotai::WLpdist(old_vmf$means, old_vmf$concentrations) # find the closest pair min_dist = Inf min_i = 0 min_j = 0 for (i in 1:(length(old_vmf$weights)-1)){ for (j in (i+1):length(old_vmf$weights)){ if (dists[i,j] < min_dist){ min_dist = dists[i,j] min_i = i min_j = j } } } # save the non-closest pair new_means = old_vmf$means[-c(min_i, min_j),] new_concentrations = old_vmf$concentrations[-c(min_i, min_j)] new_weights = old_vmf$weights[-c(min_i, min_j)] # merge the closest pairs tgt_means = old_vmf$means[c(min_i, min_j),] tgt_kappa = old_vmf$concentrations[c(min_i, min_j)] tgt_alpha = old_vmf$weights[c(min_i, min_j)] tgt_alpha_norm = tgt_alpha/base::sum(tgt_alpha) tgt_barycenter <- maotai::WLbarycenter(tgt_means, tgt_kappa, tgt_alpha_norm) # update the old one old_vmf = list(means=rbind(new_means, tgt_barycenter$mean), concentrations=c(new_concentrations, tgt_barycenter$concentration), weights=c(new_weights, base::sum(tgt_alpha))) } # return return(old_vmf) } # # simple example # # data matrix normalized # X = as.matrix(iris[,2:4]) # X = as.matrix(scale(X, center=TRUE, scale=FALSE)) # X = X%*%eigen(cov(X))$vectors[,1:2] # apply PCA # X = X/sqrt(rowSums(X^2)) # # # fit the model with movMF package # big_movMF <- movMF::movMF(X, 10) # clust_big_movMF <- predict(big_movMF, X) # convert_movMF <- movMF_convert(big_movMF) # # big_means <- convert_movMF$means # big_weights <- convert_movMF$weights # big_concentrations <- convert_movMF$concentrations # # # reduce to 3 components using different methods # red3 <- movMF_reduce_greedy(big_means, big_concentrations, big_weights, target.num=3) # red5 <- movMF_reduce_greedy(big_means, big_concentrations, big_weights, target.num=5) # # clust3 <- movMF_info(X, red3$means, red3$concentrations, red3$weights)$clustering # clust5 <- movMF_info(X, red5$means, red5$concentrations, red5$weights)$clustering # # # visualize # par(mfrow=c(1,3), pty="s") # plot(X, col=clust_big_movMF, pch=19, main="Original") # plot(X, col=clust3, pch=19, main="Greedy K=3") # plot(X, col=clust5, pch=19, main="Greedy K=5") maotai/R/later_scalarpFq.R0000644000176200001440000000122714753205125015161 0ustar liggesusers#' General Form of Hypergeometric Function #' #' #' @keywords internal #' @noRd scalarpFq <- function(veca, vecb, z){ p = length(veca) q = length(vecb) no.stop = TRUE Mval = 1 n = 0 while (no.stop){ n = n+1 terma = 0 for (i in 1:p){ terma = terma + sum(log((veca[i] + seq(from=0, to=(n-1), by=1)))) } termb = 0 for (j in 1:q){ termb = termb + sum(log((vecb[j] + seq(from=0,to=(n-1),by=1)))) } Mnow = exp(n*log(z) + terma - termb - base::lfactorial(n)) Mval = Mnow + Mval if (abs(Mnow) < 1e-10){ no.stop=FALSE } if (n>100){ no.stop=FALSE } } return(Mval) }maotai/R/shortestpath.R0000644000176200001440000000317014757667025014624 0ustar liggesusers#' Find Shortest Path using Floyd-Warshall Algorithm #' #' This is a fast implementation of Floyd-Warshall algorithm to find the #' shortest path in a pairwise sense using \code{RcppArmadillo}. A logical input #' is also accepted. The given matrix should contain pairwise distance values \eqn{d_{i,j}} where #' \eqn{0} means there exists no path for node \eqn{i} and \eqn{j}. #' #' @param dist either an \eqn{(n\times n)} matrix or a \code{dist} class object. #' #' @return an \eqn{(n\times n)} matrix containing pairwise shortest path length. #' #' @examples #' ## simple example : a ring graph #' # edges exist for pairs #' A = array(0,c(10,10)) #' for (i in 1:9){ #' A[i,i+1] = 1 #' A[i+1,i] = 1 #' } #' A[10,1] <- A[1,10] <- 1 #' #' # compute shortest-path and show the matrix #' sdA <- shortestpath(A) #' #' # visualize #' opar <- par(no.readonly=TRUE) #' par(pty="s") #' image(sdA, main="shortest path length for a ring graph") #' par(opar) #' #' @references #' \insertRef{floyd_algorithm_1962}{maotai} #' #' \insertRef{warshall_theorem_1962}{maotai} #' #' @export shortestpath <- function(dist){ input = dist # class determination if (inherits(dist,"matrix")){ distnaive = dist } else if (inherits(dist, "dist")){ distnaive = as.matrix(dist) } else { stop("* shortestpath : input 'dist' should be either (n*n) matrix or 'dist' class object.") } # consider logical input if (any(is.logical(distnaive))){ distnaive = distnaive*1 } # set as -Inf for 0 values mepsil = .Machine$double.eps distnaive[which(distnaive<5*mepsil)] = -Inf distgeo = aux_shortestpath(distnaive) return(distgeo) } maotai/R/RcppExports.R0000644000176200001440000000742514757667140014372 0ustar liggesusers# Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 sphere_dist <- function(vecx, vecy) { .Call('_maotai_sphere_dist', PACKAGE = 'maotai', vecx, vecy) } cpp_WL_normalise <- function(X) { .Call('_maotai_cpp_WL_normalise', PACKAGE = 'maotai', X) } cpp_WL_weighted_mean <- function(X, weights) { .Call('_maotai_cpp_WL_weighted_mean', PACKAGE = 'maotai', X, weights) } compute_SSR <- function(D, Delta) { .Call('_maotai_compute_SSR', PACKAGE = 'maotai', D, Delta) } compute_stress <- function(D, Dhat) { .Call('_maotai_compute_stress', PACKAGE = 'maotai', D, Dhat) } main_bmds <- function(D, X0, sigg0, a, alpha, maxiter, constant, verbose, betas) { .Call('_maotai_main_bmds', PACKAGE = 'maotai', D, X0, sigg0, a, alpha, maxiter, constant, verbose, betas) } aux_shortestpath <- function(wmat) { .Call('_maotai_aux_shortestpath', PACKAGE = 'maotai', wmat) } cppsub_2007Wang <- function(V0, mm, d, Spu, Stu, maxiter, eps) { .Call('_maotai_cppsub_2007Wang', PACKAGE = 'maotai', V0, mm, d, Spu, Stu, maxiter, eps) } gradF <- function(func, xnow, h) { .Call('_maotai_gradF', PACKAGE = 'maotai', func, xnow, h) } dat2centers <- function(data, centers) { .Call('_maotai_dat2centers', PACKAGE = 'maotai', data, centers) } cpp_sylvester <- function(A, B, C) { .Call('_maotai_cpp_sylvester', PACKAGE = 'maotai', A, B, C) } solve_lyapunov <- function(A, B, C) { .Call('_maotai_solve_lyapunov', PACKAGE = 'maotai', A, B, C) } cpp_weiszfeld <- function(X, abstol, maxiter, xinit, weights, epsnum) { .Call('_maotai_cpp_weiszfeld', PACKAGE = 'maotai', X, abstol, maxiter, xinit, weights, epsnum) } cpp_kmeans <- function(data, k) { .Call('_maotai_cpp_kmeans', PACKAGE = 'maotai', data, k) } emds_gamma0 <- function(dmat) { .Call('_maotai_emds_gamma0', PACKAGE = 'maotai', dmat) } cpp_pairwise_L2 <- function(muA, muB, covA, covB) { .Call('_maotai_cpp_pairwise_L2', PACKAGE = 'maotai', muA, muB, covA, covB) } integrate_1d <- function(tseq, fval) { .Call('_maotai_integrate_1d', PACKAGE = 'maotai', tseq, fval) } cpp_pdist <- function(X) { .Call('_maotai_cpp_pdist', PACKAGE = 'maotai', X) } cpp_geigen <- function(A, B) { .Call('_maotai_cpp_geigen', PACKAGE = 'maotai', A, B) } cpp_triangle <- function(D) { .Call('_maotai_cpp_triangle', PACKAGE = 'maotai', D) } cpp_metricdepth <- function(D) { .Call('_maotai_cpp_metricdepth', PACKAGE = 'maotai', D) } cpp_mmds <- function(D, ndim, maxiter, abstol) { .Call('_maotai_cpp_mmds', PACKAGE = 'maotai', D, ndim, maxiter, abstol) } src_smacof <- function(D, W, ndim, maxiter, abstol, use_gutman) { .Call('_maotai_src_smacof', PACKAGE = 'maotai', D, W, ndim, maxiter, abstol, use_gutman) } eval_gaussian <- function(x, mu, cov) { .Call('_maotai_eval_gaussian', PACKAGE = 'maotai', x, mu, cov) } eval_gaussian_data <- function(X, mu, cov) { .Call('_maotai_eval_gaussian_data', PACKAGE = 'maotai', X, mu, cov) } eval_gmm_data <- function(X, mus, covs, weight) { .Call('_maotai_eval_gmm_data', PACKAGE = 'maotai', X, mus, covs, weight) } eval_gmm <- function(x, mus, covs, weight) { .Call('_maotai_eval_gmm', PACKAGE = 'maotai', x, mus, covs, weight) } src_construct_by_knn <- function(nn_idx, intersection) { .Call('_maotai_src_construct_by_knn', PACKAGE = 'maotai', nn_idx, intersection) } src_gaussbary_2002R <- function(array3d, weight, maxiter, abstol) { .Call('_maotai_src_gaussbary_2002R', PACKAGE = 'maotai', array3d, weight, maxiter, abstol) } src_gaussbary_2016A <- function(array3d, weight, maxiter, abstol) { .Call('_maotai_src_gaussbary_2016A', PACKAGE = 'maotai', array3d, weight, maxiter, abstol) } src_cov2corr <- function(covmat) { .Call('_maotai_src_cov2corr', PACKAGE = 'maotai', covmat) } maotai/R/later_matrix2F1.R0000644000176200001440000000365614753205125015032 0ustar liggesusers#' Gauss Confluent Hypergeometric Function of Matrix Argument #' #' @references #' \insertRef{butler_laplace_2002}{maotai} #' #' @keywords internal #' @noRd matrix2F1 <- function(a, b, c, Z, method=c("laplace")){ # PREPARE if ((!isSymmetric(Z))||(!is.matrix(Z))){ stop("* matrix2F1 : input 'Z' should be a symmetric matrix.") } mymethod = ifelse(missing(method),"laplace", match.arg(tolower(method), c("laplace"))) # RUN output = switch(mymethod, laplace = matrix2F1.laplace(a,b,c,Z)) return(output) } #' @keywords internal #' @noRd matrix2F1.laplace <- function(a, b, c, X){ p = base::nrow(X) vec.xx = base::eigen(X)$values vec.yy = rep(0,p) for (i in 1:p){ tau = (vec.xx[i]*(b-a)) - c vec.yy[i] = (2*a)/(sqrt((tau^2) - (4*a*vec.xx[i]*(c-b))) - tau) } vec.Li = rep(0,p) for (i in 1:p){ vec.Li[i] = (vec.xx[i]*vec.yy[i]*(1-vec.yy[i]))/(1-(vec.xx[i]*vec.yy[i])) } matR21 = array(0,c(p,p)) for (i in 1:p){ xi = vec.xx[i] yi = vec.yy[i] for (j in 1:p){ xj = vec.xx[j] yj = vec.yy[j] term1 = exp(log(yi)+log(yj)-log(a)) term2 = exp(log(1-yi)+log(1-yj)-log(c-a)) term3top = log(b)+log(xi)+log(xj)+log(yi)+log(yj)+log(1-yi)+log(1-yj) term3bot = log(1-(xi*yi))+log(1-(xj*yj))+log(a)+log(c-a) term3 = exp(term3top-term3bot) matR21[i,j] = term1+term2-term3 } } log1 = ((c*p) - (p*(p+1)/4))*log(c) log2 = -0.5*sum(log(matR21)) log3 = (a*(log(vec.yy)-log(a)))+((c-a)*(log(1-vec.yy)-log(c-a)))-(b*log(1-(vec.xx*vec.yy))) return(base::exp(log1+log2+log3)) } # # # special case # myz = runif(1, min=-1, max=1) # # asin(myz)/myz # scalar2F1(1/2,1/2,3/2,(myz^2), method = "series") # scalar2F1(1/2,1/2,3/2,(myz^2), method = "integral") # scalar2F1(1/2,1/2,3/2,(myz^2), method = "laplace") # # matrix2F1(1/2,1/2,3/2,matrix((myz^2)), method = "laplace")maotai/R/sylvester.R0000644000176200001440000000326414753205125014121 0ustar liggesusers#' Solve Sylvester Equation #' #' The Sylvester equation is of form #' \deqn{AX + XB = C} #' where \eqn{X} is the unknown and others are given. Though it's possible to have non-square \eqn{A} and \eqn{B} matrices, #' we currently support square matrices only. This is a wrapper of \code{armadillo}'s \code{sylvester} function. #' #' @param A a \eqn{(p\times p)} matrix as above. #' @param B a \eqn{(p\times p)} matrix as above. #' @param C a \eqn{(p\times p)} matrix as above. #' #' @return a solution matrix \eqn{X} of size \eqn{(p\times p)}. #' #' @examples #' ## simulated example #' # generate square matrices #' A = matrix(rnorm(25),nrow=5) #' X = matrix(rnorm(25),nrow=5) #' B = matrix(rnorm(25),nrow=5) #' C = A%*%X + X%*%B #' #' # solve using 'sylvester' function #' solX = sylvester(A,B,C) #' pm1 = "* Experiment with Sylvester Solver" #' pm2 = paste("* Absolute Error : ",norm(solX-X,"f"),sep="") #' pm3 = paste("* Relative Error : ",norm(solX-X,"f")/norm(X,"f"),sep="") #' cat(paste(pm1,"\n",pm2,"\n",pm3,sep="")) #' #' #' @references #' \insertRef{sanderson_armadillo_2016}{maotai} #' #' \insertRef{eddelbuettel_rcpparmadillo_2014}{maotai} #' #' @export sylvester <- function(A,B,C){ ################################################################### # check square matrix if (!check_sqmat(A)){ stop("* sylvester : an input 'A' should be a square matrix.") } if (!check_sqmat(B)){ stop("* sylvester : an input 'B' should be a square matrix.") } if (!check_sqmat(C)){ stop("* sylvester : an input 'C' should be a square matrix.") } ################################################################### # arrange and solve return(cpp_sylvester(A,B,-C)) }maotai/R/ecdfdist.R0000644000176200001440000001040414753205125013640 0ustar liggesusers#' Distance Measures between Multiple Empirical Cumulative Distribution Functions #' #' We measure distance between two empirical cumulative distribution functions (ECDF). For #' simplicity, we only take an input of \code{\link[stats]{ecdf}} objects from \pkg{stats} package. #' #' @param elist a length \eqn{N} list of \code{ecdf} objects. #' @param method name of the distance/dissimilarity measure. Case insensitive. #' @param p exponent for \code{Lp} or \code{Wasserstein} distance. #' @param as.dist a logical; \code{TRUE} to return \code{dist} object, \code{FALSE} to return an \eqn{(N\times N)} symmetric matrix of pairwise distances. #' #' @return either \code{dist} object of an \eqn{(N\times N)} symmetric matrix of pairwise distances by \code{as.dist} argument. #' #' @seealso \code{\link[stats]{ecdf}} #' #' @examples #' \donttest{ #' ## toy example : 10 of random and uniform distributions #' mylist = list() #' for (i in 1:10){ #' mylist[[i]] = stats::ecdf(stats::rnorm(50, sd=2)) #' } #' for (i in 11:20){ #' mylist[[i]] = stats::ecdf(stats::runif(50, min=-5)) #' } #' #' ## compute Kolmogorov-Smirnov distance #' dm = ecdfdist(mylist, method="KS") #' #' ## visualize #' mks =" KS distances of 2 Types" #' opar = par(no.readonly=TRUE) #' par(pty="s") #' image(dm[,nrow(dm):1], axes=FALSE, main=mks) #' par(opar) #' } #' #' @export ecdfdist <- function(elist, method=c("KS","Lp","Wasserstein"), p=2, as.dist=FALSE){ ############################################### # Preprocessing if (!elist_check(elist)){ stop("* ecdfdist : input 'elist' should be a list of 'ecdf' objects.") } methodss = c("ks","wasserstein","lp") mymethod = tolower(method) mymethod = match.arg(mymethod, methodss) myp = round(p) if (myp <= 0){ stop("* ecdfdist : exponent 'p' should be a nonnegative number.") } ############################################### # Computation output = switch(mymethod, "ks" = dist_ks(elist), "wasserstein" = dist_wasserstein(elist, myp), "lp" = dist_lp(elist, myp)) ############################################### # Report if (as.dist){ return(stats::as.dist(output)) } else { return(output) } } # single functions -------------------------------------------------------- # (1) dist_ks : kolmogorov-smirnov # (2) dist_wasserstein : 1d wasserstein distance # (3) dist_lp : Lp distance #' @keywords internal #' @noRd dist_ks <- function(elist){ trflist = elist_fform(elist) flist = trflist$fval nlist = length(flist) output = array(0,c(nlist,nlist)) for (i in 1:(nlist-1)){ fi = flist[[i]] for (j in (i+1):nlist){ fj = flist[[j]] theval = max(abs(fi-fj)) output[i,j] <- output[j,i] <- theval[1] } } return(output) } #' @keywords internal #' @noRd dist_lp <- function(elist, p){ nlist = length(elist) trflist = elist_fform(elist) flist = trflist$fval nlist = length(flist) output = array(0,c(nlist,nlist)) if (is.infinite(p)){ for (i in 1:(nlist-1)){ fi = flist[[i]] for (j in (i+1):nlist){ fj = flist[[j]] output[i,j] <- output[j,i] <- base::max(base::abs(fi-fj))[1] } } } else { for (i in 1:(nlist-1)){ fi = flist[[i]] for (j in (i+1):nlist){ fj = flist[[j]] theval = ((integrate_1d(trflist$tseq, (abs(fi-fj)^p)))^(1/p)) output[i,j] <- output[j,i] <- theval } } } return(output) } #' @keywords internal #' @noRd dist_wasserstein <- function(elist, p){ nlist = length(elist) qseq = base::seq(from=1e-6, to=1-(1e-6), length.out=8128) quants = list() # compute quantile functions first for (i in 1:nlist){ quants[[i]] = as.double(stats::quantile(elist[[i]], qseq)) } output = array(0,c(nlist,nlist)) for (i in 1:(nlist-1)){ vali = quants[[i]] for (j in (i+1):nlist){ valj = quants[[j]] valij = abs(vali-valj) if (is.infinite(p)){ output[i,j] <- output[j,i] <- base::max(valij) } else { theval <- ((integrate_1d(qseq, valij^p))^(1/p)) output[i,j] <- output[j,i] <- theval } } } return(output) } ## wasserstein : http://www-users.math.umn.edu/~bobko001/preprints/2016_BL_Order.statistics_Revised.version.pdfmaotai/R/kmeanspp.R0000644000176200001440000000352214753205125013674 0ustar liggesusers#' K-Means++ Clustering Algorithm #' #' \eqn{k}-means++ algorithm is known to be a smart, careful initialization #' technique. It is originally intended to return a set of \eqn{k} points #' as initial centers though it can still be used as a rough clustering algorithm #' by assigning points to the nearest points. #' #' @param data an \eqn{(n\times p)} matrix whose rows are observations. #' @param k the number of clusters. #' #' @return a length-\eqn{n} vector of class labels. #' #' @examples #' ## use simple example of iris dataset #' data(iris) #' mydata = as.matrix(iris[,1:4]) #' mycol = as.factor(iris[,5]) #' #' ## find the low-dimensional embedding for visualization #' my2d = cmds(mydata, ndim=2)$embed #' #' ## apply 'kmeanspp' with different numbers of k's. #' k2 = kmeanspp(mydata, k=2) #' k3 = kmeanspp(mydata, k=3) #' k4 = kmeanspp(mydata, k=4) #' k5 = kmeanspp(mydata, k=5) #' k6 = kmeanspp(mydata, k=6) #' #' ## visualize #' opar <- par(no.readonly=TRUE) #' par(mfrow=c(2,3)) #' plot(my2d, col=k2, main="k=2", pch=19, cex=0.5) #' plot(my2d, col=k3, main="k=3", pch=19, cex=0.5) #' plot(my2d, col=k4, main="k=4", pch=19, cex=0.5) #' plot(my2d, col=k5, main="k=5", pch=19, cex=0.5) #' plot(my2d, col=k6, main="k=6", pch=19, cex=0.5) #' plot(my2d, col=mycol, main="true cluster", pch=19, cex=0.5) #' par(opar) #' #' @references #' \insertRef{arthur_kmeans_2007}{maotai} #' #' @export kmeanspp <- function(data, k=2){ ############################################################ # Preprocessing if (!check_datamat(data)){ stop("* kmeanspp : an input 'data' should be a matrix without any missing/infinite values.") } xdiss = stats::as.dist(cpp_pdist(data)) myk = round(k) ############################################################ # Run and Return output = hidden_kmeanspp(xdiss,k=myk)$cluster return(output) }maotai/R/later_scalar2F1.R0000644000176200001440000000434714753205125014771 0ustar liggesusers#' Gauss Confluent Hypergeometric Function of Scalar Argument #' #' #' @references #' \insertRef{butler_laplace_2002}{maotai} #' #' @keywords internal #' @noRd scalar2F1 <- function(a, b, c, z, method=c("series","integral")){ # PREPARE # if (abs(z) >= 1){ # stop("* scalar2F1 : '|z| < 1' is required.") # } mymethod = ifelse(missing(method),"series", match.arg(tolower(method), c("laplace","integral","series"))) # COMPUTE output = switch(mymethod, integral = scalar2F1.integral(a,b,c,z), series = scalar2F1.series(a,b,c,z), laplace = scalar2F1.laplace(a,b,c,z)) return(output) } #' @keywords internal #' @noRd scalar2F1.integral <- function(a,b,c,z){ # conditions not met # INTEGRATION func.int <- function(y){ return((y^(a-1))*((1-y)^(c-a-1))*((1-(z*y))^(-b))) } myeps = 10*.Machine$double.eps term1 = stats::integrate(func.int, lower=(10*.Machine$double.eps), upper=1)$value term2 = 1/base::beta(a,c-a) return(term1*term2) } #' @keywords internal #' @noRd scalar2F1.series <- function(a,b,c,z){ no.stop = TRUE Mval = 1 n = 0 while (no.stop){ n = n+1 term1 = n*log(z) + sum(log((a + seq(from=0, to=(n-1), by=1)))) + sum(log((b + seq(from=0, to=(n-1), by=1)))) term2 = sum(log((c + seq(from=0, to=(n-1), by=1)))) + base::lfactorial(n) Mnow = exp(term1-term2) Mval = Mval + Mnow if (abs(Mnow) < 1e-10){ no.stop = FALSE } if (n > 100){ no.stop = FALSE } } return(Mval) } #' @keywords internal #' @noRd scalar2F1.laplace <- function(a,b,c,x){ tau = (x*(b-a)) - c yhat = (2*a)/(sqrt((tau^2) - (4*a*x*(c-b))) - tau) r21 = ((yhat^2)/a) + (((1-yhat)^2)/(c-a)) - exp((log(b) + 2*log(x) + 2*log(yhat) + 2*log(1-yhat))-(2*log(1-(x*yhat)) + log(a) + log(c-a))) log1 = (c-0.5)*log(c) log2 = -0.5*log(r21) log3 = a*(log(yhat)-log(a)) + (c-a)*(log(1-yhat)-log(c-a)) + -b*log(1-(x*yhat)) return(exp(log1+log2+log3)) } # # special case # myz = runif(1, min=-1, max=1) # asin(myz)/myz # scalar2F1(1/2,1/2,3/2,(myz^2), method = "series") # scalar2F1(1/2,1/2,3/2,(myz^2), method = "integral") # scalar2F1(1/2,1/2,3/2,(myz^2), method = "laplace") maotai/R/matderiv.R0000644000176200001440000000475114753205125013676 0ustar liggesusers#' Numerical Approximation to Gradient of a Function with Matrix Argument #' #' For a given function \eqn{f:\mathbf{R}^{n\times p} \rightarrow \mathbf{R}}, #' we use finite difference scheme that approximates a gradient at a given point \eqn{x}. #' In Riemannian optimization, this can be used as a proxy for #' ambient gradient. Use with care since it may accumulate numerical error. #' #' @param fn a function that takes a matrix of size \eqn{(n\times p)} and returns a scalar value. #' @param x an \eqn{(n\times p)} matrix where the gradient is to be computed. #' @param h step size for centered difference scheme. #' #' @return an approximate numerical gradient matrix of size \eqn{(n\times p)}. #' #' @examples #' ## function f(X) = for two vectors 'a' and 'b' #' # derivative w.r.t X is ab' #' # take an example of (5x5) symmetric positive definite matrix #' #' # problem settings #' a <- rnorm(5) #' b <- rnorm(5) #' ftn <- function(X){ #' return(sum(as.vector(X%*%b)*a)) #' } # function to be taken derivative #' myX <- matrix(rnorm(25),nrow=5) # point where derivative is evaluated #' myX <- myX%*%t(myX) #' #' # main computation #' sol.true <- base::outer(a,b) #' sol.num1 <- matderiv(ftn, myX, h=1e-1) # step size : 1e-1 #' sol.num2 <- matderiv(ftn, myX, h=1e-5) # 1e-3 #' sol.num3 <- matderiv(ftn, myX, h=1e-9) # 1e-5 #' #' ## visualize/print the results #' expar = par(no.readonly=TRUE) #' par(mfrow=c(2,2),pty="s") #' image(sol.true, main="true solution") #' image(sol.num1, main="h=1e-1") #' image(sol.num2, main="h=1e-5") #' image(sol.num3, main="h=1e-9") #' par(expar) #' #' \donttest{ #' ntrue = norm(sol.true,"f") #' cat('* Relative Errors in Frobenius Norm ') #' cat(paste("* h=1e-1 : ",norm(sol.true-sol.num1,"f")/ntrue,sep="")) #' cat(paste("* h=1e-5 : ",norm(sol.true-sol.num2,"f")/ntrue,sep="")) #' cat(paste("* h=1e-9 : ",norm(sol.true-sol.num3,"f")/ntrue,sep="")) #' } #' #' @references #' \insertRef{kincaid_numerical_2009}{maotai} #' #' @export matderiv <- function(fn, x, h=0.001){ if (h <= 0){ stop("* matderiv : 'h' should be a nonnegative real number.") } hval = max(sqrt(.Machine$double.eps), h) return(gradF(fn,x,hval)) } # h = 0.001 # X = matrix(rnorm(9),nrow=3) # X = X%*%t(X) # dX = array(0,c(3,3)) # fX = function(x){return(sum(diag(x%*%x)))} # for (i in 1:3){ # for (j in 1:3){ # Xp = X # Xm = X # Xp[i,j] = Xp[i,j] + h # Xm[i,j] = Xm[i,j] - h # dX[i,j] = (fX(Xp)-fX(Xm))/(2*h) # } # } # dXmaotai/R/later_mgamma.R0000644000176200001440000000115014753205125014477 0ustar liggesusers#' Multivariate Gamma #' #' \deqn{\Gamma_m (a)} #' #' @keywords internal #' @noRd mgamma <- function(m, a, log=FALSE){ m = round(m) if (length(a)==1){ logval = (base::sum(base::lgamma(a - 0.5*((1:m)-1))))*(pi^(m*(m-1)/4)) if (log){ return(logval) } else { return(base::exp(logval)) } } else { if (length(a)!=m){ stop("* mgamma : for a vector-valued 'a', its length must be equal to 'm'.") } logval = base::exp(base::sum(base::lgamma(a - 0.5*((1:m)-1))))*(pi^(m*(m-1)/4)) if (log){ return(logval) } else { return(base::exp(logval)) } } }maotai/R/cayleymenger.R0000644000176200001440000000245214753205125014543 0ustar liggesusers#' Cayley-Menger Determinant #' #' Cayley-Menger determinant is a formula of a \eqn{n}-dimensional simplex #' with respect to the squares of all pairwise distances of its vertices. #' #' @param data an \eqn{(n\times p)} matrix of row-stacked observations. #' @return a list containing\describe{ #' \item{det}{determinant value.} #' \item{vol}{volume attained from the determinant.} #' } #' #' @examples #' ## USE 'IRIS' DATASET #' data(iris) #' X = as.matrix(iris[,1:4]) #' #' ## COMPUTE CAYLEY-MENGER DETERMINANT #' # since k=4 < n=149, it should be zero. #' cayleymenger(X) #' #' @export cayleymenger <- function(data){ # Preprocessing if (!check_datamat(data)){ stop("* cayleymenger : an input 'data' should be a matrix without any missing/infinite values.") } # compute pairwise distance Dtmp = stats::as.dist(cpp_pdist(data)) # compute and return return(cayleymenger_internal(Dtmp)) } #' @keywords internal #' @noRd cayleymenger_internal <- function(distobj){ Dold = (as.matrix(distobj)^2) n = base::nrow(Dold) Dnew = rbind(cbind(Dold, rep(1,n)), c(rep(1,n),0)) val.det = base::det(Dnew) n = n+1 val.vol = base::sqrt(base::exp(base::log(((-1)^(n+1))*val.det) - n*log(2) - (2*base::lfactorial(n)))) output = list(det=val.det, vol=val.vol) return(output) }maotai/R/aux_ecdf.R0000644000176200001440000000342314753205125013634 0ustar liggesusers# auxiliary functions to deal with ECDF objects # (1) elist_check : list of 'ecdf' objects # (2) elist_fform : make a function form in a discrete grid # (3) elist_epmeans : either a vector or something # (1) elist_check --------------------------------------------------------- #' @keywords internal #' @noRd elist_check <- function(elist){ cond1 = (is.list(elist)) cond2 = all(unlist(lapply(elist, inherits, "ecdf"))==TRUE) if (cond1&&cond2){ return(TRUE) } else { return(FALSE) } } # (2) elist_fform --------------------------------------------------------- #' @keywords internal #' @noRd elist_fform <- function(elist){ nlist = length(elist) # compute knot points allknots = array(0,c(nlist,2)) for (i in 1:nlist){ tgt = stats::knots(elist[[i]]) allknots[i,] = c(min(tgt), max(tgt)) } mint = min(allknots[,1]) - 0.01 maxt = max(allknots[,2]) + 0.01 ssize = min((maxt-mint)/1000, 0.001) tseq = seq(mint, maxt, by=ssize) # return the list of y values outY = list() for (i in 1:nlist){ tgt = elist[[i]] outY[[i]] = tgt(tseq) } # return the result output = list() output$tseq = tseq output$fval = outY # list of function values return(output) } # (3) elist_epmeans ------------------------------------------------------- #' @keywords internal #' @noRd elist_epmeans <- function(elist){ N = length(elist) output = list() for (n in 1:N){ tgt = elist[[n]] if (is.vector(tgt)&&(!any(is.infinite(tgt)))&&(!any(is.na(tgt)))){ # Case 1. just a vector output[[n]] = stats::ecdf(tgt) } else if (inherits(tgt, "ecdf")){ output[[n]] = tgt } else { smsg = paste("* epmeans : ",n,"-th element from 'elist' is neither an 'ecdf' object nor a vector.") stop(smsg) } } return(output) }maotai/R/mmd2test.R0000644000176200001440000001260714753205125013621 0ustar liggesusers#' Kernel Two-sample Test with Maximum Mean Discrepancy #' #' Maximum Mean Discrepancy (MMD) as a measure of discrepancy between #' samples is employed as a test statistic for two-sample hypothesis test #' of equal distributions. Kernel matrix \eqn{K} is a symmetric square matrix #' that is positive semidefinite. #' #' @param K kernel matrix or an object of \code{kernelMatrix} class from \pkg{kernlab} package. #' @param label label vector of class indices. #' @param method type of estimator to be used. \code{"b"} for biased and \code{"u"} for unbiased estimator of MMD. #' @param mc.iter the number of Monte Carlo resampling iterations. #' #' @return a (list) object of \code{S3} class \code{htest} containing: \describe{ #' \item{statistic}{a test statistic.} #' \item{p.value}{\eqn{p}-value under \eqn{H_0}.} #' \item{alternative}{alternative hypothesis.} #' \item{method}{name of the test.} #' \item{data.name}{name(s) of provided kernel matrix.} #' } #' #' @examples #' ## small test for CRAN submission #' dat1 <- matrix(rnorm(60, mean= 1), ncol=2) # group 1 : 30 obs of mean 1 #' dat2 <- matrix(rnorm(50, mean=-1), ncol=2) # group 2 : 25 obs of mean -1 #' #' dmat <- as.matrix(dist(rbind(dat1, dat2))) # Euclidean distance matrix #' kmat <- exp(-(dmat^2)) # build a gaussian kernel matrix #' lab <- c(rep(1,30), rep(2,25)) # corresponding label #' #' mmd2test(kmat, lab) # run the code ! #' #' \dontrun{ #' ## WARNING: computationally heavy. #' # Let's compute empirical Type 1 error at alpha=0.05 #' niter = 496 #' pvals1 = rep(0,niter) #' pvals2 = rep(0,niter) #' for (i in 1:niter){ #' dat = matrix(rnorm(200),ncol=2) #' lab = c(rep(1,50), rep(2,50)) #' lbd = 0.1 #' kmat = exp(-lbd*(as.matrix(dist(dat))^2)) #' pvals1[i] = mmd2test(kmat, lab, method="b")$p.value #' pvals2[i] = mmd2test(kmat, lab, method="u")$p.value #' print(paste("iteration ",i," complete..",sep="")) #' } #' #' # Visualize the above at multiple significance levels #' alphas = seq(from=0.001, to=0.999, length.out=100) #' errors1 = rep(0,100) #' errors2 = rep(0,100) #' for (i in 1:100){ #' errors1[i] = sum(pvals1<=alphas[i])/niter #' errors2[i] = sum(pvals2<=alphas[i])/niter #' } #' #' opar <- par(no.readonly=TRUE) #' par(mfrow=c(1,2), pty="s") #' plot(alphas, errors1, "b", main="Biased Estimator Error", #' xlab="alpha", ylab="error", cex=0.5) #' abline(a=0,b=1, lwd=1.5, col="red") #' plot(alphas, errors2, "b", main="Unbiased Estimator Error", #' xlab="alpha", ylab="error", cex=0.5) #' abline(a=0,b=1, lwd=1.5, col="blue") #' par(opar) #' } #' #' @references #' \insertRef{gretton_kernel_2012}{maotai} #' #' @export mmd2test <- function(K, label, method=c("b","u"), mc.iter=999){ ############################################### # Preprocessing DNAME = deparse(substitute(K)) # 1. K : kernel matrix if (inherits(K, "kernelMatrix")){ kmat = as.matrix(K) } else { kmat = as.matrix(K) } cond1 = (is.matrix(kmat)) cond2 = (nrow(K)==ncol(kmat)) cond3 = isSymmetric(kmat) if (!(cond1&&cond2&&cond3)){ stop("* mmd2test : 'K' should be a kernel matrix.") } mineval = min(base::eigen(kmat, only.values = TRUE)$values) if (mineval<0){ wm = paste("* mmd2test : 'K' may not be PD. Minimum eigenvalue is ",mineval,".",sep="") warning(wm) } # 2. label label = as.vector(as.integer(as.factor(label))) if ((length(label)!=nrow(kmat))||(length(unique(label))!=2)){ stop("* mmd2test : 'label' should be a vector of proper length with 2 classes.") } ulabel = unique(label) # 3. method allmmm = c("b","u") method = match.arg(tolower(method), allmmm) ############################################### # compute statistic id1 = which(label==ulabel[1]); m = length(id1) id2 = which(label==ulabel[2]); n = length(id2) thestat = switch(method, "b" = mmd_biased(kmat[id1,id1], kmat[id2,id2], kmat[id1,id2]), "u" = mmd_unbiased(kmat[id1,id1], kmat[id2,id2], kmat[id1,id2])) ############################################### # Iteration mciter = round(mc.iter) itervals = rep(0,mciter) for (i in 1:mciter){ permuted = sample(m+n) tmpid1 = permuted[1:m] tmpid2 = permuted[(m+1):(m+n)] itervals[i] = switch(method, "b" = mmd_biased(kmat[tmpid1,tmpid1], kmat[tmpid2,tmpid2], kmat[tmpid1,tmpid2]), "u" = mmd_unbiased(kmat[tmpid1,tmpid1], kmat[tmpid2,tmpid2], kmat[tmpid1,tmpid2])) } pvalue = (sum(itervals>=thestat)+1)/(mciter+1) ############################################### # REPORT hname = "Kernel Two-sample Test with Maximum Mean Discrepancy" Ha = "two distributions are not equal" names(thestat) = "MMD" res = list(statistic=thestat, p.value=pvalue, alternative = Ha, method=hname, data.name = DNAME) class(res) = "htest" return(res) } # compute two squared statistics ------------------------------------------ #' @keywords internal #' @noRd mmd_biased <- function(XX, YY, XY){ # parameters m = nrow(XX) n = nrow(YY) # computation return((sum(XX)/(m^2)) + (sum(YY)/(n^2)) - ((2/(m*n))*sum(XY))) } #' @keywords internal #' @noRd mmd_unbiased <- function(XX, YY, XY){ # parameters m = nrow(XX) n = nrow(YY) # computation term1 = (sum(XX)-sum(diag(XX)))/(m*(m-1)) term2 = (sum(YY)-sum(diag(YY)))/(n*(n-1)) term3 = (2/(m*n))*sum(XY) return((term1+term2-term3)) } maotai/R/rotationS2.R0000644000176200001440000000424014753205125014120 0ustar liggesusers#' Compute a Rotation on the 2-dimensional Sphere #' #' A vector of unit norm is an element on the hypersphere. When two unit-norm #' vectors \eqn{x} and \eqn{y} in 3-dimensional space are given, this function #' computes a rotation matrix \eqn{Q} on the 2-dimensional sphere such that #' \deqn{y=Qx}. #' #' @param x a length-\eqn{3} vector. If \eqn{\|x\|\neq 1}, normalization is applied. #' @param y a length-\eqn{3} vector. If \eqn{\|y\|\neq 1}, normalization is applied. #' #' @return a \eqn{(3\times 3)} rotation matrix. #' #' @examples #' \donttest{ #' ## generate two data points #' # one randomly and another on the north pole #' x = stats::rnorm(3) #' x = x/sqrt(sum(x^2)) #' y = c(0,0,1) #' #' ## compute the rotation #' Q = rotationS2(x,y) #' #' ## compare #' Qx = as.vector(Q%*%x) #' #' ## print #' printmat = rbind(Qx, y) #' rownames(printmat) = c("rotated:", "target:") #' print(printmat) #' } #' #' @export rotationS2 <- function(x, y){ ############################################################ # Preprocessing vec_x = as.vector(x); vec_x = vec_x/sqrt(sum(vec_x^2)) vec_y = as.vector(y); vec_y = vec_y/sqrt(sum(vec_y^2)) if (length(vec_x)!=3){ stop("rotationS2 : an input 'x' is not of length 3.") } if (length(vec_y)!=3){ stop("rotationS2 : an input 'y' is not of length 3.") } ############################################################ # Run and Return output = rotateS2_main(vec_x, vec_y) return(output) } # auxiliary --------------------------------------------------------------- #' @keywords internal #' @noRd rotateS2_main <- function(vec_u, vec_v){ # orthonormal vector 'n' vec_n = pracma::cross(vec_u, vec_v) vec_n = vec_n/sqrt(sum(vec_n^2)) # another orthonormal vector vec_t = pracma::cross(vec_n, vec_u) vec_t = vec_t/sqrt(sum(vec_t^2)) # compute the angle alpha = base::atan2(sum(vec_v*vec_t), sum(vec_v*vec_u)) # Rn(alpha) Rnalpha = cbind(c(base::cos(alpha), base::sin(alpha), 0), c(-base::sin(alpha), base::cos(alpha), 0), c(0,0,1)) # T mat_T = cbind(vec_u,vec_t,vec_n) # rotator output = mat_T%*%Rnalpha%*%base::solve(mat_T) return(output) } maotai/R/WLbarycenter.R0000644000176200001440000000765314754242227014475 0ustar liggesusers#' Barycenter of vMF Distributions Under a Wasserstein-Like Geometry #' #' Given a collection of von Mises-Fisher (vMF) distributions, each characterized #' by a mean direction \eqn{\mathbf{\mu}} and a concentration parameter \eqn{\kappa}, #' this function solves the geometric mean problem to compute the barycentric vMF #' distribution under an approximate Wasserstein geometry. #' #' @param means An \eqn{(n \times p)} matrix where each row represents the mean #' direction of one of the \eqn{n} vMF distributions. #' @param concentrations A length-\eqn{n} vector of nonnegative concentration parameters. #' @param weights A weight vector of length \eqn{n}. If \code{NULL}, equal weights #' (\code{rep(1/n, n)}) are used. #' #' @return A named list containing: #' \describe{ #' \item{mean}{A length-\eqn{p} vector representing the barycenter direction.} #' \item{concentration}{A scalar representing the barycenter concentration.} #' } #' #' @examples #' \donttest{ #' # Set seed for reproducibility #' set.seed(123) #' #' # Number of vMF distributions #' n <- 5 #' #' # Generate mean directions concentrated around a specific angle (e.g., 45 degrees) #' base_angle <- pi / 4 # 45 degrees in radians #' angles <- rnorm(n, mean = base_angle, sd = pi / 20) # Small deviation from base_angle #' means <- cbind(cos(angles), sin(angles)) # Convert angles to unit vectors #' #' # Generate concentration parameters with large magnitudes (tight distributions) #' concentrations <- rnorm(n, mean = 50, sd = 5) # Large values around 50 #' #' # Compute the barycenter under the Wasserstein-like geometry #' barycenter <- WLbarycenter(means, concentrations) #' #' # Convert barycenter mean direction to an angle #' bary_angle <- atan2(barycenter$mean[2], barycenter$mean[1]) #' #' ## Visualize #' opar <- par(no.readonly=TRUE) #' par(mfrow=c(1,2), pty="s") #' #' # Plot the unit circle #' plot(cos(seq(0, 2 * pi, length.out = 200)), sin(seq(0, 2 * pi, length.out = 200)), #' type = "l", col = "gray", lwd = 2, xlab = "x", ylab = "y", #' main = "Barycenter of vMF Distributions on S^1") #' #' # Add input mean directions #' points(means[,1], means[,2], col = "blue", pch = 19, cex = 1.5) #' #' # Add the computed barycenter #' points(cos(bary_angle), sin(bary_angle), col = "red", pch = 17, cex = 2) #' #' # Add legend #' legend("bottomleft", legend = c("vMF Means", "Barycenter"), col = c("blue", "red"), #' pch = c(19, 17), cex = 1) #' #' # Plot the concentration parameters #' hist(concentrations, main = "Concentration Parameters", xlab = "Concentration") #' abline(v=barycenter$concentration, col="red", lwd=2) #' par(opar) #' } #' #' @export WLbarycenter <- function(means, concentrations, weights=NULL){ ############################################### # Preprocessing # means if (!is.matrix(means)){ data_X = cpp_WL_normalise(as.matrix(means)) } else { data_X = cpp_WL_normalise(means) } # concentrations data_kappa = as.vector(concentrations) if (length(data_kappa)!=base::nrow(data_X)){ stop("* WLbarycenter : cardinalities of the means and concentrations do not match.") } # weights if ((length(weights)==0)&&(is.null(weights))){ data_weights = rep(1/length(data_kappa), length(data_kappa)) } else { data_weights = as.vector(weights) data_weights = data_weights/base::sum(data_weights) } if (any(data_weights < .Machine$double.eps)||(length(data_weights)!=length(data_kappa))){ stop("* WLbarycenter : invalid 'weights'. Please see the documentation.") } ############################################### # Computation # mean direction output_mean = as.vector(cpp_WL_weighted_mean(data_X, data_weights)) # concentration kappa_tmp = base::sum(data_weights/sqrt(data_kappa)) kappa_output = 1/(kappa_tmp*kappa_tmp) ############################################### # Return output = list(mean=output_mean, concentration=kappa_output) return(output) } maotai/R/boot.mblock.R0000644000176200001440000000410614753205125014266 0ustar liggesusers#' Generate Index for Moving Block Bootstrapping #' #' Assuming data being dependent with cardinality \code{N}, \code{boot.mblock} returns #' a vector of index that is used for moving block bootstrapping. #' #' @param N the number of observations. #' @param b the size of a block to be drawn. #' #' @return a vector of length \code{N} for moving block bootstrap sampling. #' #' @examples #' \donttest{ #' ## example : bootstrap confidence interval of mean and variances #' vec.x = seq(from=0,to=10,length.out=100) #' vec.y = sin(1.21*vec.x) + 2*cos(3.14*vec.x) + rnorm(100,sd=1.5) #' data.mu = mean(vec.y) #' data.var = var(vec.y) #' #' ## apply moving block bootstrapping #' nreps = 50 #' vec.mu = rep(0,nreps) #' vec.var = rep(0,nreps) #' for (i in 1:nreps){ #' sample.id = boot.mblock(100, b=10) #' sample.y = vec.y[sample.id] #' vec.mu[i] = mean(sample.y) #' vec.var[i] = var(sample.y) #' print(paste("iteration ",i,"/",nreps," complete.", sep="")) #' } #' #' ## visualize #' opar <- par(no.readonly=TRUE) #' par(mfrow=c(1,3), pty="s") #' plot(vec.x, vec.y, type="l", main="1d signal") # 1d signal #' hist(vec.mu, main="mean CI", xlab="mu") # mean #' abline(v=data.mu, col="red", lwd=4) #' hist(vec.var, main="variance CI", xlab="sigma") # variance #' abline(v=data.var, col="blue", lwd=4) #' par(opar) #' } #' #' @references #' \insertRef{kunsch_jackknife_1989}{maotai} #' #' @export boot.mblock <- function(N, b=max(2,round(N/10))){ ################################################################### # Preprocessing myn = round(N) myb = round(b) vec1N = c(1:myn,1:myn,1:myn) ################################################################### # Preparation id0 = 1 idb = (myn-myb+1) id0b = (id0:idb) # starting point ################################################################### # Computation output = c() while (length(output) 100){ no.stop = FALSE } } return(Mval) } maotai/R/later_matrix1F1.R0000644000176200001440000000255614753205125015027 0ustar liggesusers#' Confluent Hypergeometric Function of Matrix Argument #' #' #' @references #' \insertRef{butler_laplace_2002}{maotai} #' #' @keywords internal #' @noRd matrix1F1 <- function(a, b, Z, method=c("laplace")){ # PREPARE if ((!isSymmetric(Z))||(!is.matrix(Z))){ stop("* matrix1F1 : input 'Z' should be a symmetric matrix.") } mymethod = ifelse(missing(method),"laplace", match.arg(tolower(method), c("laplace"))) # COMPUTATION output = switch(mymethod, laplace = matrix1F1.laplace(a,b,Z)) return(output) } #' @keywords internal #' @noRd matrix1F1.laplace <- function(a, b, X){ # checked in 1-dimension # Preliminary p = base::nrow(X) vec.xi = base::eigen(X)$values vec.yi = rep(0,p) for (i in 1:p){ xi = vec.xi[i] vec.yi[i] = (2*a)/(b-xi+sqrt(((xi-b)^2) + (4*a*xi))) } matR11 = array(0,c(p,p)) for (i in 1:p){ yi = vec.yi[i] for (j in 1:p){ yj = vec.yi[j] matR11[i,j] = ((yi*yj)/a) + ((1-yi)*(1-yj)/(b-a)) } } veclast = rep(0,p) for (i in 1:p){ xi = vec.xi[i] yi = vec.yi[i] veclast[i] = base::exp(a*(log(yi)-log(a)) + (b-a)*(log(1-yi)-log(b-a)) + (xi*yi)) } # Main log1 = ((b*p) - (p*(p+1)/4))*log(b) log2 = -0.5*base::sum(base::log(matR11)) log3 = base::sum(base::log(veclast)) return(base::exp(log1+log2+log3)) }maotai/R/later_scalar1F1.R0000644000176200001440000000407614753205125014767 0ustar liggesusers#' Kummer's Confluent Hypergeometric Function of Scalar Argument #' #' @references #' \insertRef{butler_laplace_2002}{maotai} #' #' @keywords internal #' @noRd scalar1F1 <- function(a, b, z, method=c("series","laplace","integral")){ # PREPARE mymethod = ifelse(missing(method),"series", match.arg(tolower(method), c("laplace","integral","series"))) # COMPUTE output = switch(mymethod, integral = scalar1F1.integral(a,b,z), series = scalar1F1.series(a,b,z), laplace = scalar1F1.laplace(a,b,z)) return(output) } #' @keywords internal #' @noRd scalar1F1.integral <- function(a, b, z){ # REQUIREMENT if (!((b>a)&&(a>0))){ stop("scalar1F1 : 'integral' method requires 'b > a > 0'.") } # INTEGRATION func.int <- function(y){ return((y^(a-1))*((1-y)^(b-a-1))*exp(z*y)) } myeps = 10*.Machine$double.eps term1 = stats::integrate(func.int, lower=(10*.Machine$double.eps), upper=1)$value term2 = 1/base::beta(a,b-a) return(term1*term2) } #' @keywords internal #' @noRd scalar1F1.series <- function(a, b, z){ no.stop = TRUE Mval = 1 n = 0 while (no.stop){ n = n+1 M.now = exp(sum(log((a + seq(from=0, to=(n-1), by=1)))) + n*log(z) - sum(log((b + seq(from=0, to=(n-1), by=1)))) - base::lfactorial(n)) Mval = Mval + M.now if (abs(M.now) < 1e-10){ no.stop = FALSE } if (n > 100){ no.stop = FALSE } } return(Mval) } #' @keywords internal #' @noRd scalar1F1.laplace <- function(a, b, x){ yhat = (2*a)/(b-x+sqrt(((x-b)^2) + (4*a*x))) r11 = (yhat^2)/a + ((1-yhat)^2)/(b-a) log1 = (b-0.5)*log(b) log2 = -0.5*log(r11) log3 = a*(log(yhat)-log(a)) log4 = (b-a)*(log(1-yhat)-log(b-a)) log5 = x*yhat output = exp(log1+log2+log3+log4+log5) return(output) } # mya = 1/2 # myb = sample(2:20, 1)/2 # myx = abs(1+rnorm(1, sd=10)) # # fAsianOptions::kummerM(myx,mya,myb) # scalar1F1(mya,myb,myx,method="integral") # scalar1F1(mya,myb,myx,method="series") # scalar1F1(mya,myb,myx,method="laplace") maotai/R/later_mbeta.R0000644000176200001440000000042614753205125014335 0ustar liggesusers#' Multivariate Beta #' #' #' @keywords internal #' @noRd mbeta <- function(m, a, b, log=FALSE){ m = round(m) logval = mgamma(m,a,log=TRUE) + mgamma(m,b,log=TRUE) - mgamma(m,(a+b),log=TRUE) if (log){ return(logval) } else { return(base::exp(logval)) } }maotai/R/metricdepth.R0000644000176200001440000000307114753205125014365 0ustar liggesusers#' Metric Depth #' #' Compute the metric depth proposed by \insertCite{geenens_2023_StatisticalDepthAbstract;textual}{maotai}, which is #' one generalization of statistical depth function onto the arbitrary metric space. Our implementation assumes that #' given the multivariate data it computes the (empirical) depth for all observations using under the Euclidean regime. #' #' @param data an \eqn{(n\times p)} matrix whose rows are observations. #' @return a length-\eqn{n} vector of empirical metric depth values. #' #' @examples #' \dontrun{ #' ## use simple example of iris dataset #' data(iris) #' X <- as.matrix(iris[,1:4]) #' y <- as.factor(iris[,5]) #' #' ## compute the metric depth #' mdX <- metricdepth(X) #' #' ## visualize #' # 2-d embedding for plotting by MDS #' X2d <- maotai::cmds(X, ndim=2)$embed #' #' # get a color code for the metric depth #' pal = colorRampPalette(c("yellow","red")) #' #' # draw #' opar <- par(no.readonly=TRUE) #' par(mfrow=c(1,2), pty="s") #' plot(X2d, pch=19, main="by class", xlab="", ylab="", col=y) #' plot(X2d, pch=19, main="by depth", xlab="", ylab="", col=pal(150)[order(mdX)]) #' legend("bottomright", col=pal(2), pch=19, legend=round(range(mdX), 2)) #' par(opar) #' } #' #' @references #' \insertAllCited{} #' #' @export metricdepth <- function(data){ ## PREPROCESSING if (!check_datamat(data)){ stop("* metricdepth : an input 'data' should be a matrix without any missing/infinite values.") } xdiss = stats::as.dist(cpp_pdist(data)) ## RUN AND RETURN output = hidden_metricdepth(xdiss) return(output) } maotai/R/movMF_info.R0000644000176200001440000001034314757441436014127 0ustar liggesusers#' Extract meaningful information from the von Mises-Fisher mixture model #' #' Given a mixture of von Mises-Fisher distributions, this function computes #' several related quantities of the data on the unit hypersphere with respect #' to the specified model. #' #' @param data an \eqn{(n\times d)} data matrix. #' @param means an \eqn{(k\times d)} matrix of means. #' @param concentrations a vector of length \eqn{k} of concentration parameters. #' @param weights a vector of length \eqn{k} of mixing weights. #' #' @return a named list containing \describe{ #' \item{densities}{a vector of length \eqn{n} of the densities of the data points.} #' \item{clustering}{a vector of length \eqn{n} of the hard clustering results.} #' \item{loglkd}{the log-likelihood of the data.} #' \item{AIC}{the Akaike information criterion.} #' \item{BIC}{the Bayesian information criterion.} #' } #' #' @export movMF_info <- function(data, means, concentrations, weights){ ############################################### # Preprocessing # data if (!is.matrix(data)){ my_data = cpp_WL_normalise(as.matrix(data)) } else { my_data = cpp_WL_normalise(data) } # means if (!is.matrix(means)){ my_means = cpp_WL_normalise(as.matrix(means)) } else { my_means = cpp_WL_normalise(means) } # concentrations my_concentrations = as.vector(concentrations) if (length(my_concentrations)!=base::nrow(my_means)){ stop("* movMF_info : cardinalities of the means and concentrations do not match.") } # weights if ((length(weights)==0)&&(is.null(weights))){ my_weights = rep(1/length(my_concentrations), length(my_concentrations)) } else { my_weights = as.vector(weights) my_weights = my_weights/base::sum(my_weights) } if (any(my_weights < .Machine$double.eps)||(length(my_weights)!=length(my_concentrations))){ stop("* movMF_info : invalid 'weights'. Please see the documentation.") } ############################################### # BASIC COMPUTATION par_n = base::nrow(my_data) par_p = base::ncol(my_data) par_k = base::length(my_weights) membership <- array(0,c(par_n, par_k)) for (k in 1:par_k){ membership[,k] <- aux_vmf_density(my_data, my_means[k,], my_concentrations[k])*my_weights[k] } ############################################### # ADVANCED # density (n,) vector out_densities = base::rowSums(membership) # hard clustering results out_cluster = rep(0, par_n) for (i in 1:par_n){ tmp_clust <- as.vector(membership[i,]) tmp_clust <- tmp_clust/base::sum(tmp_clust) out_cluster[i] <- which.max(tmp_clust) } # log-likelihood out_loglkd <- base::sum(base::log(out_densities)) # information criteria df <- (par_k - 1) + par_k*((par_p-1)+1) out_aic <- -2*out_loglkd + 2*df out_bic <- -2*out_loglkd + base::log(par_n)*df ############################################### # RETURN output = list() output$densities = out_densities output$clustering = out_cluster output$loglkd <- out_loglkd output$AIC <- out_aic output$BIC <- out_bic return(output) } # auxiliary functions ----------------------------------------------------- #' @keywords internal #' @noRd aux_vmf_density <- function(x, mu, kappa){ # x : (n x d) data matrix # mu : (d,) mean vector # kappa : >0 concentration parameter # initialization d <- base::ncol(x) # normalizing constant log_c_p_kappa <- (d/2 - 1) * log(kappa) - (d/2) * log(2 * pi) - log(base::besselI(kappa, nu = d/2 - 1, expon.scaled = TRUE)) - kappa c_p_kappa <- exp(log_c_p_kappa) # iterate densities = rep(0, base::nrow(x)) for (i in 1:base::nrow(x)){ densities[i] <- base::exp(kappa*base::sum(as.vector(x[i,])*mu))*c_p_kappa } return(densities) } # # simple example # # data matrix normalized # X = as.matrix(iris[,2:4]) # X = as.matrix(scale(X, center=TRUE, scale=FALSE)) # X = X/sqrt(rowSums(X^2)) # # # fit the model with movMF package # out_movMF <- movMF::movMF(X, 3) # clust_movMF <- predict(out_movMF, X) # # # use my function # fit_weights <- out_movMF$alpha # fit_means <- out_movMF$theta/sqrt(rowSums(out_movMF$theta^2)) # fit_concentrations <- sqrt(rowSums(out_movMF$theta^2)) # clust_mine <- movMF_info(X, fit_means, fit_concentrations, fit_weights)$clustering maotai/R/obsolete_distgmm.R0000644000176200001440000001561414753205125015423 0ustar liggesusers#' #' Distance Measures between Multiple Samples using Gaussian Mixture Models #' #' #' #' Taking multiple observations (a sample) as a unit of analysis requires #' #' a measure of discrepancy between samples. \code{distgmm} fits finite #' #' Gaussian mixture models to each sample and use the fitted model as #' #' a representation of a single sample. A single model is selected via #' #' Bayesian Information Criterion (BIC). #' #' #' #' @param datalist a length \eqn{N} list of samples. All elements of the list should be of same type, either \code{vector} or \code{matrix} of same dimension (number of columns). #' #' @param method name of the distance/dissimilarity measure. #' #' @param maxk maximum number of clusters to be fitted using GMM. #' #' @param as.dist a logical; \code{TRUE} to return \code{dist} object, \code{FALSE} to return an \eqn{(N\times N)} symmetric matrix of pairwise distances. #' #' #' #' @return either \code{dist} object of an \eqn{(N\times N)} symmetric matrix of pairwise distances by \code{as.dist} argument. #' #' #' #' @examples #' #' ## let's try two-dimensional data of 30 samples #' #' ## single or mixture of two and three gaussian distributions. #' #' dlist = list() #' #' for (i in 1:10){ #' #' dlist[[i]] = matrix(rnorm(120),ncol=2) #' #' } #' #' for (i in 11:20){ #' #' A = matrix(rnorm(60,mean=-4),ncol=2) #' #' B = matrix(rnorm(60,mean= 4),ncol=2) #' #' dlist[[i]] = rbind(A,B) #' #' } #' #' for (i in 21:30){ #' #' A = matrix(rnorm(40,mean=-4),ncol=2) #' #' B = matrix(rnorm(40),ncol=2) #' #' C = matrix(rnorm(40,mean= 4),ncol=2) #' #' dlist[[i]] = rbind(A,B,C) #' #' } #' #' #' #' ## compute pairwise distances, expecting (3 x 3) block structure. #' #' mm = distgmm(dlist, maxk=5) #' #' #' #' ## visualize #' #' opar <- par(no.readonly=TRUE) #' #' par(pty="s") #' #' image(mm[,nrow(mm):1], main="3-block pattern as expected") #' #' par(opar) #' #' #' #' @keywords internal #' #' @noRd #' distgmm <- function(datalist, method=c("L2"), maxk=5, as.dist=FALSE){ #' ####################################################### #' # Preprocessing : checkers #' if (!check_datalist(datalist)){ #' stop("* distgmm : an input should be a list containing samples of same dimension.") #' } #' maxk = round(maxk) #' method = match.arg(method) #' nlist = length(datalist) #' #' ####################################################### #' # Compute : GMM #' list.gmm = list() #' if (is.vector(datalist[[1]])){ #' vec.flag = TRUE #' for (n in 1:nlist){ #' list.gmm[[n]] = mclust::Mclust(datalist[[n]], G=1:maxk, modelNames="V", verbose=FALSE)$parameters #' } #' } else { #' vec.flag = FALSE #' for (n in 1:nlist){ #' list.gmm[[n]] = mclust::Mclust(datalist[[n]], G=1:maxk, verbose=FALSE, modelNames="VVV")$parameters #' } #' } #' #' ####################################################### #' # Compute : Pairwise Distance #' output = array(0,c(nlist,nlist)) #' for (i in 1:(nlist-1)){ #' objA = list.gmm[[i]] #' for (j in (i+1):nlist){ #' objB = list.gmm[[j]] #' if (vec.flag){ #' theval = switch(method, #' "L2" = distgmm_l2_1d(objA, objB)) #' output[i,j] <- output[j,i] <- theval #' } else { #' theval = switch(method, #' "L2" = distgmm_l2_nd(objA, objB)) #' output[i,j] <- output[j,i] <- theval #' } #' } #' } #' #' ####################################################### #' if (as.dist){ #' return(stats::as.dist(output)) #' } else { #' return(output) #' } #' } #' #' #' # use Mclust 'parameters' object ------------------------------------------ #' #' @keywords internal #' #' @noRd #' distgmm_l2_1d <- function(objA, objB){ #' weightA = as.vector(objA$pro) #' muA = matrix(objA$mean, ncol=1) #' covA = array(0,c(1,1,length(weightA))) #' for (i in 1:length(weightA)){ #' covA[,,i] = objA$variance$sigmasq[i] #' } #' weightB = as.vector(objB$pro) #' muB = matrix(objB$mean, ncol=1) #' covB = array(0,c(1,1,length(weightB))) #' for (i in 1:length(weightB)){ #' covB[,,i] = objB$variance$sigmasq[i] #' } #' #' ## run CPP (same for both 1d and nd cases) #' cpp.res = cpp_pairwise_L2(muA, muB, covA, covB) #' A = cpp.res$A #' B = cpp.res$B #' C = cpp.res$C #' #' ## matrix multiplication #' term1 = base::sum(as.vector(A%*%weightA)*weightA) #' term2 = base::sum(as.vector(B%*%weightB)*weightB) #' term3 = -2*base::sum(as.vector(C%*%weightB)*weightA) #' #' ## return distance/ L2 needs to be taken square root. #' return(base::sqrt(term1+term2+term3)) #' } #' #' @keywords internal #' #' @noRd #' distgmm_l2_nd <- function(objA, objB){ #' weightA = as.vector(objA$pro) #' muA = t(objA$mean) #' covA = objA$variance$sigma #' #' weightB = as.vector(objB$pro) #' muB = t(objB$mean) #' covB = objB$variance$sigma #' #' if (length(dim(covA)) < 3){ #' tmpA = covA #' covA = array(0,c(ncol(muA),ncol(muA),1)) #' covA[,,1] = as.matrix(tmpA) #' } #' if (length(dim(covB)) < 3){ #' tmpB = covB #' covB = array(0,c(ncol(muB),ncol(muB),1)) #' covB[,,1] = as.matrix(tmpB) #' } #' #' ## run CPP (same for both 1d and nd cases) #' cpp.res = cpp_pairwise_L2(muA, muB, covA, covB) #' A = cpp.res$A #' B = cpp.res$B #' C = cpp.res$C #' #' ## matrix multiplication #' term1 = base::sum(as.vector(A%*%weightA)*weightA) #' term2 = base::sum(as.vector(B%*%weightB)*weightB) #' term3 = -2*base::sum(as.vector(C%*%weightB)*weightA) #' #' ## return distance/ L2 needs to be taken square root. #' return(base::sqrt(term1+term2+term3)) #' } #' # # personal experiment ----------------------------------------------------- # x = list() # for (i in 1:20){ # x[[i]] = matrix(rnorm(300*2),ncol=2) # } # for (i in 21:40){ # x[[i]] = rbind(matrix(rnorm(150*2,mean=-4),ncol=2), matrix(rnorm(150*2,mean=4),ncol=2)) # } # for (i in 41:60){ # x[[i]] = rbind(matrix(rnorm(100*2,mean=-4),ncol=2), matrix(rnorm(100*2),ncol=2), matrix(rnorm(150*2,mean=4),ncol=2)) # } # mm = distgmm(x, maxk=10) # image(mm[,nrow(mm):1]) # bestgmm <- function(dat){ # # belows are all automatically implemented in mclustBIC # # # run mclustBIC # # opt_gmm <- (mclust::mclustBIC(dat, G=1:9, verbose = FALSE)) # # colgmm <- colnames(opt_gmm) # # rowgmm <- 1:9 # # # # # extract mclustBIC information # # mm <- matrix(opt_gmm, nrow=nrow(opt_gmm)) # # mm[is.na(mm)] = -Inf # # idmax <- as.integer(which(mm == max(mm), arr.ind = TRUE)) # show the # # # # nclust <- rowgmm[idmax[1]] # # vartype <- colgmm[idmax[2]] # # # run Mclust # runobj <- mclust::Mclust(dat, G=1:10, verbose=FALSE) # # # run GMM with prespecified results # output = list() # output$weight = runobj$parameters$pro # output$mu = t(runobj$parameters$mean) # output$cov = runobj$parameters$variance # return(output) # } maotai/R/zzz.R0000644000176200001440000000217514753205357012725 0ustar liggesusers## RETICULATE : global reference # .pkgenv <- new.env(parent = emptyenv()) .onAttach <- function(...){ ## Retrieve Year Information date <- date() x <- regexpr("[0-9]{4}", date) this.year <- substr(date, x[1], x[1] + attr(x, "match.length") - 1) # Retrieve Current Version this.version = packageVersion("maotai") ## Print on Screen packageStartupMessage("**-----------------------------------------------------------------**") packageStartupMessage("** maotai") packageStartupMessage("** - Tools for Matrix Algebra, Optimization and Inference Problems") packageStartupMessage("**") packageStartupMessage("** Version : ",this.version," (",this.year,")",sep="") packageStartupMessage("** Maintainer : Kisung You (kisung.you@outlook.com)") packageStartupMessage("** Website : https://www.kisungyou.com/maotai") packageStartupMessage("**") packageStartupMessage("** Please share any bugs or suggestions to the maintainer.") packageStartupMessage("**-----------------------------------------------------------------**") } .onUnload <- function(libpath) { library.dynam.unload("maotai", libpath) } maotai/R/ecdfdist2.R0000644000176200001440000001113314753205125013722 0ustar liggesusers#' Pairwise Measures for Two Sets of Empirical CDFs #' #' We measure distance between two sets of empirical cumulative distribution functions (ECDF). For #' simplicity, we only take an input of \code{\link[stats]{ecdf}} objects from \pkg{stats} package. #' #' @param elist1 a length \eqn{M} list of \code{ecdf} objects. #' @param elist2 a length \eqn{N} list of \code{ecdf} objects. #' @param method name of the distance/dissimilarity measure. Case insensitive. #' @param p exponent for \code{Lp} or \code{Wasserstein} distance. #' #' @return an \eqn{(M\times N)} matrix of pairwise distances. #' #' @seealso \code{\link[stats]{ecdf}} \code{\link{ecdfdist}} #' #' @examples #' \donttest{ #' ## toy example #' # first list : 10 of random and uniform distributions #' mylist1 = list() #' for (i in 1:10){ mylist1[[i]] = stats::ecdf(stats::rnorm(50, sd=2))} #' for (i in 11:20){mylist1[[i]] = stats::ecdf(stats::runif(50, min=-5))} #' #' # second list : 15 uniform and random distributions #' mylist2 = list() #' for (i in 1:15){ mylist2[[i]] = stats::ecdf(stats::runif(50, min=-5))} #' for (i in 16:30){mylist2[[i]] = stats::ecdf(stats::rnorm(50, sd=2))} #' #' ## compute Kolmogorov-Smirnov distance #' dm2ks = ecdfdist2(mylist1, mylist2, method="KS") #' dm2lp = ecdfdist2(mylist1, mylist2, method="lp") #' dm2wa = ecdfdist2(mylist1, mylist2, method="wasserstein") #' nrs = nrow(dm2ks) #' #' ## visualize #' opar = par(no.readonly=TRUE) #' par(mfrow=c(1,3), pty="s") #' image(dm2ks[,nrs:1], axes=FALSE, main="Kolmogorov-Smirnov") #' image(dm2lp[,nrs:1], axes=FALSE, main="L2") #' image(dm2wa[,nrs:1], axes=FALSE, main="Wasserstein") #' par(opar) #' } #' #' @export ecdfdist2 <- function(elist1, elist2, method=c("KS","Lp","Wasserstein"), p=2){ ############################################### # Preprocessing if (!elist_check(elist1)){stop("* ecdfdist2 : input 'elist1' should be a list of 'ecdf' objects.")} if (!elist_check(elist2)){stop("* ecdfdist2 : input 'elist2' should be a list of 'ecdf' objects.")} methodss = c("ks","wasserstein","lp") mymethod = tolower(method) mymethod = match.arg(mymethod, methodss) myp = as.integer(p) if (myp <= 0){ stop("* ecdfdist2 : exponent 'p' should be a nonnegative number.") } ############################################### # Computation output = switch(mymethod, "ks" = dist2_ks(elist1, elist2), "wasserstein" = dist2_wasserstein(elist1, elist2, myp), "lp" = dist2_lp(elist1, elist2, myp)) ############################################### # Return return(output) } # single functions -------------------------------------------------------- # (1) dist2_ks : kolmogorov-smirnov # (2) dist2_wasserstein : 1d wasserstein distance # (3) dist2_lp : Lp distance #' @keywords internal #' @noRd dist2_ks <- function(elist1, elist2){ M = length(elist1) N = length(elist2) trflst = elist_fform(c(elist1, elist2)) flist1 = trflst$fval[1:M] flist2 = trflst$fval[(M+1):(M+N)] output = array(0,c(M,N)) for (i in 1:M){ fi = flist1[[i]] for (j in 1:N){ fj = flist2[[j]] theval = max(abs(fi-fj)) output[i,j] <- theval[1] } } return(output) } #' @keywords internal #' @noRd dist2_lp <- function(elist1, elist2, p){ M = length(elist1) N = length(elist2) trflst = elist_fform(c(elist1, elist2)) flist1 = trflst$fval[1:M] flist2 = trflst$fval[(M+1):(M+N)] output = array(0,c(M,N)) for (i in 1:M){ fi = flist1[[i]] for (j in 1:N){ fj = flist2[[j]] if (is.infinite(p)){ output[i,j] = base::max(base::abs(fi-fj))[1] } else { output[i,j] <- ((integrate_1d(trflst$tseq, (abs(fi-fj)^p)))^(1/p)) } } } return(output) } #' @keywords internal #' @noRd dist2_wasserstein <- function(elist1, elist2, p){ M = length(elist1) N = length(elist2) trflst = elist_fform(c(elist1, elist2)) flist1 = trflst$fval[1:M] flist2 = trflst$fval[(M+1):(M+N)] qseq = base::seq(from=1e-6, to=1-(1e-6), length.out=8128) quants1 = list() # compute quantile functions first quants2 = list() for (i in 1:M){ quants1[[i]] = as.double(stats::quantile(elist1[[i]], qseq)) } for (j in 1:N){ quants2[[j]] = as.double(stats::quantile(elist2[[j]], qseq)) } output = array(0,c(M,N)) for (i in 1:M){ vali = quants1[[i]] for (j in 1:N){ valj = quants2[[j]] valij = abs(vali-valj) if (is.infinite(p)){ output[i,j] = base::max(valij) } else { output[i,j] <- ((integrate_1d(qseq, valij^p))^(1/p)) } } } return(output) }maotai/R/aux_checkers.R0000644000176200001440000000406514753205125014525 0ustar liggesusers# CHECKERS ---------------------------------------------------------------- # 01. check_sqmat : if a square matrix # 02. check_symm : if a square, symmetric matrix # 03. check_datalist : if a list of same-dimensional data # 04. check_datamat : if a matrix without weird values # 01. check_sqmat --------------------------------------------------------- #' @keywords internal #' @noRd check_sqmat <- function(x){ cond1 = is.matrix(x) cond2 = (nrow(x)==ncol(x)) cond3 = (!(any(is.infinite(x))||any(is.null(x)))) if (cond1&&cond2&&cond3){ return(TRUE) } else { return(FALSE) } } # 02. check_symm ---------------------------------------------------------- #' @keywords internal #' @noRd check_symm <- function(x){ cond1 = check_sqmat(x) cond2 = isSymmetric(x) if (cond1&&cond2){ return(TRUE) } else { return(FALSE) } } # 03. check_datalist ------------------------------------------------------ #' @keywords internal #' @noRd check_datalist <- function(dlist){ cond1 = (is.list(dlist)) if (is.vector(dlist[[1]])){ cond2 = all(unlist(lapply(dlist, is.vector))==TRUE) cond3 = (unlist(lapply(dlist, check_datavec))==TRUE) if (cond1&&cond2&&cond3){ return(TRUE) } else { return(FALSE) } } else { cond2 = all(unlist(lapply(dlist, is.matrix))==TRUE) cond3 = (length(unique(unlist(lapply(dlist, ncol))))==1) cond4 = all(unlist(lapply(dlist, check_datamat))==TRUE) if (cond1&&cond2&&cond3&&cond4){ return(TRUE) } else { return(FALSE) } } } # 04. check_datamat ------------------------------------------------------- #' @keywords internal #' @noRd check_datamat <- function(data){ cond1 = (is.matrix(data)) cond2 = all(!is.na(data)) cond3 = all(!is.infinite(data)) if (cond1&&cond2&&cond3){ return(TRUE) } else { return(FALSE) } } #' @keywords internal #' @noRd check_datavec <- function(data){ cond1 = (is.vector(data)) cond2 = all(!is.na(data)) cond3 = all(!is.infinite(data)) if (cond1&&cond2&&cond3){ return(TRUE) } else { return(FALSE) } }maotai/R/WLpdist.R0000644000176200001440000000446514754263021013453 0ustar liggesusers#' Pairwise Wasserstein-like Distance between two vMF distributions #' #' Given a collection of von Misees-Fisher (vMF) distributions, compute the pairwise #' distance using the Wasserstein-like distance from an approximate Wasserstein geometry. #' #' @param means An \eqn{(n \times p)} matrix where each row represents the mean #' direction of one of the \eqn{n} vMF distributions. #' @param concentrations A length-\eqn{n} vector of nonnegative concentration parameters. #' #' @return An \eqn{(n \times n)} matrix of pairwise distances. #' #' @examples #' \donttest{ #' # Set seed for reproducibility #' set.seed(123) #' #' # Generate two classes of mean directions around north and south poles #' means1 = array(0,c(50,2)); means1[,2] = rnorm(50, mean=1, sd=0.25) #' means2 = array(0,c(50,2)); means2[,2] = rnorm(50, mean=-1, sd=0.25) #' means1 = means1/sqrt(rowSums(means1^2)) #' means2 = means2/sqrt(rowSums(means2^2)) #' #' # Concatenate the mean directions #' data_means = rbind(means1, means2) #' #' # Generate concentration parameters #' data_concentrations = rnorm(100, mean=20, sd=1) #' #' # Compute the pairwise distance matrix #' pdmat = WLpdist(data_means, data_concentrations) #' #' # Visualise the pairwise distance matrix #' opar <- par(no.readonly=TRUE) #' image(pdmat, main="Pairwise Wasserstein-like Distance") #' par(opar) #' } #' #' @export WLpdist <- function(means, concentrations){ ############################################### # Preprocessing # means if (!is.matrix(means)){ data_means = cpp_WL_normalise(as.matrix(means)) } else { data_means = cpp_WL_normalise(means) } # concentrations data_kappa = as.vector(concentrations) if (length(data_kappa)!=base::nrow(data_means)){ stop("* WLpdist : cardinalities of the means and concentrations do not match.") } # parameters N = base::nrow(means) d = base::ncol(means) ############################################### # COMPUTATION output = array(0,c(N,N)) for (i in 1:(N-1)){ for (j in (i+1):N){ term1 = sphere_dist(as.vector(data_means[i,]), as.vector(data_means[j,])) term2sq = (d-1)*((1/sqrt(data_kappa[i]) - 1/sqrt(data_kappa[j]))^2) output[i,j] <- output[j,i] <- sqrt((term1*term1) + term2sq) } } ############################################### # RETURN return(output) } maotai/R/dpmeans.R0000644000176200001440000001252714753205125013512 0ustar liggesusers#' DP-means Algorithm for Clustering Euclidean Data #' #' DP-means is a nonparametric clustering method motivated by DP mixture model in that #' the number of clusters is determined by a parameter \eqn{\lambda}. The larger #' the \eqn{\lambda} value is, the smaller the number of clusters is attained. #' In addition to the original paper, we added an option to randomly permute #' an order of updating for each observation's membership as a common #' heuristic in the literature of cluster analysis. #' #' @param data an \eqn{(n\times p)} data matrix for each row being an observation. #' @param lambda a threshold to define a new cluster. #' @param maxiter maximum number of iterations. #' @param abstol stopping criterion #' @param permute.order a logical; \code{TRUE} if random order for permutation is used, \code{FALSE} otherwise. #' #' @return a named list containing #' \describe{ #' \item{cluster}{an \eqn{(n\times ndim)} matrix whose rows are embedded observations.} #' \item{centers}{a list containing information for out-of-sample prediction.} #' } #' #' @examples #' ## define data matrix of two clusters #' x1 = matrix(rnorm(50*3,mean= 2), ncol=3) #' x2 = matrix(rnorm(50*3,mean=-2), ncol=3) #' X = rbind(x1,x2) #' lab = c(rep(1,50),rep(2,50)) #' #' ## run dpmeans with several lambda values #' solA <- dpmeans(X, lambda= 5)$cluster #' solB <- dpmeans(X, lambda=10)$cluster #' solC <- dpmeans(X, lambda=20)$cluster #' #' ## visualize the results #' opar <- par(no.readonly=TRUE) #' par(mfrow=c(1,4), pty="s") #' plot(X,col=lab, pch=19, cex=.8, main="True", xlab="x", ylab="y") #' plot(X,col=solA, pch=19, cex=.8, main="dpmeans lbd=5", xlab="x", ylab="y") #' plot(X,col=solB, pch=19, cex=.8, main="dpmeans lbd=10", xlab="x", ylab="y") #' plot(X,col=solC, pch=19, cex=.8, main="dpmeans lbd=20", xlab="x", ylab="y") #' par(opar) #' #' \donttest{ #' ## let's find variations by permuting orders of update #' ## used setting : lambda=20, we will 8 runs #' sol8 <- list() #' for (i in 1:8){ #' sol8[[i]] = dpmeans(X, lambda=20, permute.order=TRUE)$cluster #' } #' #' ## let's visualize #' vpar <- par(no.readonly=TRUE) #' par(mfrow=c(2,4), pty="s") #' for (i in 1:8){ #' pm = paste("permute no.",i,sep="") #' plot(X,col=sol8[[i]], pch=19, cex=.8, main=pm, xlab="x", ylab="y") #' } #' par(vpar) #' } #' #' @references #' \insertRef{kulis_revisiting_2012}{maotai} #' #' @export dpmeans <- function(data, lambda=1, maxiter=1234, abstol=1e-6, permute.order=FALSE){ ############################################################ # Preprocessing if (!check_datamat(data)){ stop("* dpmeans : an input 'data' should be a matrix without any missing/infinite values.") } # Parameter and Initialization n = nrow(data) p = ncol(data) k = 1 # set k=1 labels = rep(1,n) # labels={1,2,...,n} mu = matrix(colMeans(data), nrow=1) # global mean lambda = as.double(lambda) ############################################################ # Main Iteration ss.old = compute.ss(data, labels, mu)+ k*lambda ss.new = 0 for (iter in 1:maxiter){ # 0. updating order of observations if (permute.order){ idseq = sample(1:n) } else { idseq = 1:n } # 1. update the class membership per each class for (i in idseq){ # 1-1. compute distances to the centers # dic = rep(0, k); for (j in 1:k){dic[j] = sum((as.vector(data[i,])-as.vector(mu[j,]))^2)} dic = as.vector(dat2centers(data[i,], mu)); # cpp conversion # 1-2. assign new or stay if (min(dic) > lambda){ k = k+1 labels[i] = k mu = rbind(mu, data[i,]) } else { idmins = which(dic==min(dic)) if (length(idmins)>1){ labels[i] = sample(idmins, 1) } else { labels[i] = idmins } } } # 2. rearrange the label (remove empty ones) labels = as.factor(labels) ulabel = sort(unique(labels)) labnew = rep(0,n) for (i in 1:length(ulabel)){ labnew[(labels==ulabel[i])] = i } labels = labnew k = round(max(labels)) # 3. compute per-class means uassign = sort(unique(labels)) mu = array(0,c(k,p)) for (i in 1:k){ idmean = which(labels==uassign[i]) if (length(idmean)==1){ mu[i,] = as.vector(data[idmean,]) } else { mu[i,] = as.vector(colMeans(data[idmean,])) } } # 4. compute DPMEANS objective function ss.new = compute.ss(data, labels, mu) + k*lambda ss.delta = ss.old-ss.new ss.old = ss.new # 5. stop if updating is not significant if (ss.delta < abstol){ break } } ############################################################ # Return the results output = list() output$cluster = as.factor(labels) output$centers = mu return(output) } # auxiliary functions ----------------------------------------------------- #' @keywords internal #' @noRd compute.ss <- function(data, label, centers){ p = ncol(data) if (is.vector(centers)){ centers = matrix(centers, nrow=1) } ulabel = sort(unique(label)) output = 0 for (i in 1:length(ulabel)){ subdata = data[(label==ulabel[i]),] if (!is.vector(subdata)){ nn = nrow(subdata) for (j in 1:nn){ output = output + sum((as.vector(subdata[j,])-as.vector(centers[i,]))^2) } } } return(output) } maotai/R/checkdist.R0000644000176200001440000000270614753205125014022 0ustar liggesusers#' Check for Distance Matrix #' #' This function checks whether the distance matrix \eqn{D:=d_{ij} = d(x_i, x_j)} satisfies #' three axioms to make itself a semimetric, which are (1) \eqn{d_{ii} = 0}, (2) \eqn{d_{ij} > 0} for \eqn{i\neq j}, and #' (3) \eqn{d_{ij} = d_{ji}}. #' #' @param d \code{"dist"} object or \eqn{(N\times N)} matrix of pairwise distances. #' #' @return a logical; \code{TRUE} if it satisfies metric property, \code{FALSE} otherwise. #' #' @examples #' ## Let's use L2 distance matrix of iris dataset #' data(iris) #' dx = as.matrix(stats::dist(iris[,1:4])) #' #' # perturb d(i,j) #' dy = dx #' dy[1,2] <- dy[2,1] <- 10 #' #' # run the algorithm #' checkdist(dx) #' checkdist(dy) #' #' @seealso \code{\link{checkmetric}} #' @export checkdist <- function(d){ if (inherits(d, "dist")){ d = as.matrix(d) } else { if (!is.matrix(d)){ stop("* checkdist : input 'd' should be a matrix.") } } # 1. square matrix if (nrow(d)!=ncol(d)){ message(" checkdist : input 'd' is not a square matrix.") return(FALSE) } # 2. zero diagonals if (any(diag(d)!=0)){ message(" checkdist : input 'd' has non-zero diagonals.") return(FALSE) } # 3. all positive elements if (any(d < 0)){ message(" checkdist : input 'd' has negative values.") return(FALSE) } # 4. symmetric if (!base::isSymmetric(d)){ message(" checkdist : input 'd' is not symmetric.") return(FALSE) } return(TRUE) }maotai/R/checkmetric.R0000644000176200001440000000332114753205125014334 0ustar liggesusers#' Check for Metric Matrix #' #' This function checks whether the distance matrix \eqn{D:=d_{ij} = d(x_i, x_j)} satisfies #' four axioms to make itself a semimetric, which are (1) \eqn{d_{ii} = 0}, (2) \eqn{d_{ij} > 0} for \eqn{i\neq j}, #' (3) \eqn{d_{ij} = d_{ji}}, and (4) \eqn{d_{ij} \leq d_{ik} + d_{kj}}. #' #' @param d \code{"dist"} object or \eqn{(N\times N)} matrix of pairwise distances. #' #' @return a logical; \code{TRUE} if it satisfies metric property, \code{FALSE} otherwise. #' #' @examples #' ## Let's use L2 distance matrix of iris dataset #' data(iris) #' dx = as.matrix(stats::dist(iris[,1:4])) #' #' # perturb d(i,j) #' dy = dx #' dy[1,2] <- dy[2,1] <- 10 #' #' # run the algorithm #' checkmetric(dx) #' checkmetric(dy) #' #' @seealso \code{\link{checkdist}} #' @export checkmetric <- function(d){ if (inherits(d, "dist")){ d = as.matrix(d) } else { if (!is.matrix(d)){ stop("* checkmetric : input 'd' should be a matrix.") } } # 1. square matrix if (nrow(d)!=ncol(d)){ message(" checkmetric : input 'd' is not a square matrix.") return(FALSE) } # 2. zero diagonals if (any(diag(d)!=0)){ message(" checkmetric : input 'd' has non-zero diagonals.") return(FALSE) } # 3. all positive elements if (any(d < 0)){ message(" checkmetric : input 'd' contains negative values.") return(FALSE) } # 4. symmetric if (!base::isSymmetric(d)){ message(" checkmetric : input 'd' is not symmetric.") return(FALSE) } # 5. triangle inequality return(cpp_triangle(d)) } # data(iris) # xx = as.matrix(iris[,1:4]) # dx = stats::dist(xx) # dd = as.matrix(dx) # # checkdist(dx) # checkmetric(dx) # # i=4 # j=11 # k=8 # # dd[i,j] # dd[i,k]+dd[k,j]maotai/R/later_install_scipy.R0000644000176200001440000000032614753205125016121 0ustar liggesusers#' Install 'SciPy' Python Module #' #' #' @keywords internal #' @noRd install_scipy <- function(method = "auto", conda = "auto") { # reticulate::py_install("scipy", method = method, conda = conda) return(1) }maotai/R/epmeans.R0000644000176200001440000000635114753205125013511 0ustar liggesusers#' EP-means Algorithm for Clustering Empirical Distributions #' #' EP-means is a variant of k-means algorithm adapted to cluster #' multiple empirical cumulative distribution functions under metric structure #' induced by Earth Mover's Distance. #' #' @param elist a length \eqn{N} list of either vector or \code{ecdf} objects. #' @param k the number of clusters. #' #' @return a named list containing \describe{ #' \item{cluster}{an integer vector indicating the cluster to which each \code{ecdf} is allocated.} #' \item{centers}{a length \eqn{k} list of centroid \code{ecdf} objects.} #' } #' #' @examples #' \donttest{ #' ## two sets of 1d samples, 10 each and add some noise #' # set 1 : mixture of two gaussians #' # set 2 : single gamma distribution #' #' # generate data #' elist = list() #' for (i in 1:10){ #' elist[[i]] = stats::ecdf(c(rnorm(100, mean=-2), rnorm(50, mean=2))) #' } #' for (j in 11:20){ #' elist[[j]] = stats::ecdf(rgamma(100,1) + rnorm(100, sd=sqrt(0.5))) #' } #' #' # run EP-means with k clusters #' # change the value below to see different settings #' myk = 2 #' epout = epmeans(elist, k=myk) #' #' # visualize #' opar = par(no.readonly=TRUE) #' par(mfrow=c(1,myk)) #' for (k in 1:myk){ #' idk = which(epout$cluster==k) #' for (i in 1:length(idk)){ #' if (i<2){ #' pm = paste("class ",k," (size=",length(idk),")",sep="") #' plot(elist[[idk[i]]], verticals=TRUE, lwd=0.25, do.points=FALSE, main=pm) #' } else { #' plot(elist[[idk[i]]], add=TRUE, verticals=TRUE, lwd=0.25, do.points=FALSE) #' } #' plot(epout$centers[[k]], add=TRUE, verticals=TRUE, lwd=2, col="red", do.points=FALSE) #' } #' } #' par(opar) #' } #' #' @references #' \insertRef{henderson_epmeans_2015}{maotai} #' #' @export epmeans <- function(elist, k=2){ ############################################### # Preprocessing clist = elist_epmeans(elist) # will use quantized ones only / flist = elist_fform(qlist) myk = round(k) myn = length(clist) # Quantization mylength = 1000 qseq = seq(from=1e-6, to=1-(1e-6), length.out=mylength) qmat = array(0,c(myn,mylength)) for (n in 1:myn){ qmat[n,] = as.vector(stats::quantile(clist[[n]], qseq)) } ############################################### # Rcpp k-means tmpcpp = cpp_kmeans(qmat, myk)$means ############################################### # Pairwise Distance Computation # wrap mylist1 = list() mylist2 = list() for (n in 1:myn){ mylist1[[n]] = stats::ecdf(as.vector(qmat[n,])) } for (k in 1:myk){ mylist2[[k]] = stats::ecdf(as.vector(tmpcpp[k,])) } # compute pairwise distance using Earth Mover's Distance pdistmat = dist2_wasserstein(mylist1, mylist2, 1) # index label = base::apply(pdistmat, 1, which.min) ############################################### # Return : we want to add 'Silhouette' output = list() output$cluster = as.integer(label) output$centers = mylist2 return(output) } # ## personal examples # cdf0 = stats::ecdf(rnorm(100, sd=3)) # original ECDF # qseq = seq(from=0,to=1,length.out=1000) # quantile sequence # quant = stats::quantile(cdf0, qseq) # cdf1 = stats::ecdf(quant) # # par(mfrow=c(1,2)) # plot(cdf0, main="Original") # plot(cdf1, main="Recovered") maotai/R/lyapunov.R0000644000176200001440000000313414753205125013732 0ustar liggesusers#' Solve Lyapunov Equation #' #' The Lyapunov equation is of form #' \deqn{AX + XA^\top = Q} #' where \eqn{A} and \eqn{Q} are square matrices of same size. Above form is also known as \emph{continuous} form. #' This is a wrapper of \code{armadillo}'s \code{sylvester} function. #' #' @param A a \eqn{(p\times p)} matrix as above. #' @param Q a \eqn{(p\times p)} matrix as above. #' #' @return a solution matrix \eqn{X} of size \eqn{(p\times p)}. #' #' @examples #' ## simulated example #' # generate square matrices #' A = matrix(rnorm(25),nrow=5) #' X = matrix(rnorm(25),nrow=5) #' Q = A%*%X + X%*%t(A) #' #' # solve using 'lyapunov' function #' solX = lyapunov(A,Q) #' \dontrun{ #' pm1 = "* Experiment with Lyapunov Solver" #' pm2 = paste("* Absolute Error : ",norm(solX-X,"f"),sep="") #' pm3 = paste("* Relative Error : ",norm(solX-X,"f")/norm(X,"f"),sep="") #' cat(paste(pm1,"\n",pm2,"\n",pm3,sep="")) #' } #' #' @references #' \insertRef{sanderson_armadillo_2016}{maotai} #' #' \insertRef{eddelbuettel_rcpparmadillo_2014}{maotai} #' #' @export lyapunov <- function(A, Q){ ################################################################### # check square matrix if (!check_sqmat(A)){ stop("* lyapunov : an input 'A' should be a square matrix.") } if (!check_sqmat(Q)){ stop("* lyapunov : an input 'Q' should be a square matrix.") } ################################################################### # arrange for RcppArmadillo format B = t(A) C = -Q ################################################################### # pass and return return(solve_lyapunov(A,B,C)) }maotai/R/nem.R0000644000176200001440000000217114753205125012634 0ustar liggesusers#' Negative Eigenvalue Magnitude #' #' Negative Eigenvalue Magnitude (NEM) is a measure of distortion for the data #' whether they are lying in Euclidean manner or not. When the value is exactly 0, it means #' the data is Euclidean. On the other hand, when NEM is far away from 0, it means not Euclidean. #' The concept of NEM is closely related to the definiteness of a Gram matrix. #' #' @param data an \eqn{(n\times p)} matrix whose rows are observations. #' #' @return a nonnegative NEM value. #' #' @examples #' ## use simple example of iris dataset #' data(iris) #' mydat = as.matrix(iris[,1:4]) #' #' ## calculate NEM #' nem(mydat) #' #' @references #' \insertRef{pekalska_noneuclidean_2006}{maotai} #' #' @export nem <- function(data){ ############################################################ # Preprocessing if (!check_datamat(data)){ stop("* nem : an input 'data' should be a matrix without any missing/infinite values.") } xdiss = stats::as.dist(cpp_pdist(data)) ############################################################ # Compute and Return output = hidden_nem(xdiss) return(output) } maotai/R/trio.R0000644000176200001440000002241214753205125013032 0ustar liggesusers#' Trace Ratio Optimation #' #' This function provides several algorithms to solve the following problem #' \deqn{\textrm{max} \frac{tr(V^\top A V)}{tr(V^\top B V)} \textrm{ such that } V^\top C V = I} #' where \eqn{V} is a projection matrix, i.e., \eqn{V^\top V = I}. Trace ratio optimization #' is pertained to various linear dimension reduction methods. It should be noted that #' when \eqn{C = I}, the above problem is often reformulated as a generalized eigenvalue problem #' since it's an easier proxy with faster computation. #' #' @param A a \eqn{(p\times p)} symmetric matrix in the numerator term. #' @param B a \eqn{(p\times p)} symmetric matrix in the denomiator term. #' @param C a \eqn{(p\times p)} symmetric constraint matrix. If not provided, it is set as identical matrix automatically. #' @param dim an integer for target dimension. It can be considered as the number of loadings. #' @param method the name of algorithm to be used. Default is \code{2003Guo}. #' @param maxiter maximum number of iterations to be performed. #' @param eps stopping criterion for iterative algorithms. #' #' @return a named list containing #' \describe{ #' \item{V}{a \eqn{(p\times dim)} projection matrix.} #' \item{tr.val}{an attained maximum scalar value.} #' } #' #' @examples #' ## simple test #' # problem setting #' p = 5 #' mydim = 2 #' A = matrix(rnorm(p^2),nrow=p); A=A%*%t(A) #' B = matrix(runif(p^2),nrow=p); B=B%*%t(B) #' C = diag(p) #' #' # approximate solution via determinant ratio problem formulation #' eigAB = eigen(solve(B,A)) #' V = eigAB$vectors[,1:mydim] #' eigval = sum(diag(t(V)%*%A%*%V))/sum(diag(t(V)%*%B%*%V)) #' #' # solve using 4 algorithms #' m12 = trio(A,B,dim=mydim, method="2012Ngo") #' m09 = trio(A,B,dim=mydim, method="2009Jia") #' m07 = trio(A,B,dim=mydim, method="2007Wang") #' m03 = trio(A,B,dim=mydim, method="2003Guo") #' #' # print the results #' line1 = '* Evaluation of the cost function' #' line2 = paste("* approx. via determinant : ",eigval,sep="") #' line3 = paste("* trio by 2012Ngo : ",m12$tr.val, sep="") #' line4 = paste("* trio by 2009Jia : ",m09$tr.val, sep="") #' line5 = paste("* trio by 2007Wang : ",m07$tr.val, sep="") #' line6 = paste("* trio by 2003Guo : ",m03$tr.val, sep="") #' cat(line1,"\n",line2,"\n",line3,"\n",line4,"\n",line5,"\n",line6) #' #' @references #' \insertRef{guo_generalized_2003}{maotai} #' #' \insertRef{wang_trace_2007}{maotai} #' #' \insertRef{yangqingjia_trace_2009}{maotai} #' #' \insertRef{ngo_trace_2012}{maotai} #' #' @export trio <- function(A, B, C, dim=2, method=c("2003Guo","2007Wang","2009Jia","2012Ngo"), maxiter=1000, eps=1e-10){ ################################################################### # not completed yet. if (missing(C)){ C = diag(nrow(A)) myflag = TRUE } else { myflag = FALSE } if (!check_symm(A)){ stop("* trio : an input matrix 'A' should be a square, symmetric matrix.") } if (!check_symm(B)){ stop("* trio : an input matrix 'B' should be a square, symmetric matrix.") } if (!check_symm(C)){ stop("* trio : an input matrix 'C' should be a square, symmetric matrix.") } sizes = rep(0,3) sizes[1] = nrow(A) sizes[2] = nrow(B) sizes[3] = nrow(C) if (length(unique(sizes))!=1){ stop("* trio : all input matrices should be of same size.") } if (!myflag){ eigC = eigen(C) Cinv2 = eigC$vectors%*%diag(1/sqrt(eigC$values))%*%t(eigC$vectors) A = Cinv2%*%A%*%Cinv2 B = Cinv2%*%B%*%Cinv2 } # 2009 Jia's note : B should have rank >= (m-d) if (as.integer(Matrix::rankMatrix(B))<(nrow(B)-dim)){ warning("* trio : null space of 'B' is excessive. trace ratio value may diverge.") } ################################################################### # switch case V = switch(method, "2007Wang" = trio2007Wang(A, B, dim, eps, maxiter), "2003Guo" = trio2003Guo(A, B, dim, eps, maxiter), "2009Jia" = trio2009Jia(A, B, dim, eps, maxiter), "2012Ngo" = trio2012Ngo(A, B, dim, eps, maxiter)) output = list() output$V = V output$tr.val = sum(diag(t(V)%*%A%*%V))/sum(diag(t(V)%*%B%*%V)) return(output) } # subroutines ------------------------------------------------------------- #' 2003 Guo et al. #' Title : A generalized Foley-Sammon transform based on generalized fisher discriminant ... #' #' @keywords internal #' @noRd trio2003Guo <- function(A, B, dim, eps, maxiter){ ## translate into the language d = dim Sp = A Sl = B ## bisection # 1. initialization lbd1 = 0; f1 = evalGuoDiff(lbd1, Sp, Sl, d) lbd2 = 1; f2 = evalGuoDiff(lbd2, Sp, Sl, d) if (f2 >= 0){ while (f2 > 0){ lbd1 = lbd2; f1 = f2; lbd2 = lbd2*2; f2 = evalGuoDiff(lbd2, Sp, Sl, d) } } for (i in 1:maxiter){ lbdm = (lbd1+lbd2)/2 fm = evalGuoDiff(lbdm, Sp, Sl, d) if (fm > 0){ lbd1 = lbdm f1 = fm } else { lbd2 = lbdm f2 = fm } if (abs(lbd1-lbd2) < eps){ break } } lbdm = (lbd1+lbd2)/2 # W = eigen(Sp-lbdm*Sl)$vectors[,1:d] ## use RSpectra for only top 'd' ones W = RSpectra::eigs(Sp-lbdm*Sl,d,which="LR")$vectors ## let's try to return ! return(W) } #' @keywords internal #' @noRd evalGuoDiff <- function(lbd, A, B, dim){ W = RSpectra::eigs(A-lbd*B,dim,which="LR")$vectors # W = eigen(A-lbd*B)$vectors[,1:dim] ## use RSpectra for only top 'd' ones return(sum(diag(t(W)%*%(A-lbd*B)%*%W))) } #' 2007 Wang et al. #' Title : Trace Ratio vs. Ratio Trace for Dimensionality Reduction #' #' @keywords internal #' @noRd trio2007Wang <- function(A, B, dim, eps, maxiter){ ## translate into this language Sp = A St = A+B m = nrow(A) d = dim eigSt = base::eigen(St, symmetric = TRUE) mm = sum(eigSt$values > 0) if (mm < 1){ stop("* (A+B) has at least one nonnegative eigenvalues.") } U = eigSt$vectors[,1:mm] ## transform into the problem of V now. Spu = t(U)%*%Sp%*%U Stu = t(U)%*%St%*%U Vold = qr.Q(qr(matrix(rnorm(mm*d),ncol=d))) ## main computation V = cppsub_2007Wang(Vold, mm, d, Spu, Stu, maxiter, eps) ## adjust back to the original problem W = U%*%V ## let's try to return ! return(W) } #' 2009 Jia et al #' Title : Trace Ratio Problem Revisited (DNM) #' #' @keywords internal #' @noRd trio2009Jia <- function(A, B, dim, eps, maxiter){ ## translate into the language d = dim Sp = A Sl = B ## Decomposed Newton Method lbdold = 0 for (i in 1:maxiter){ ## 1. compute eigendecomposition eigS = RSpectra::eigs(Sp-lbdold*Sl,d,which="LR") top.val = eigS$values # top 'd' eigenvalues top.vec = eigS$vectors # top 'd' eigenvectors ## 2. lbdnew lbdnew = solve2009Jia(lbdold, top.val, top.vec, Sl) inc = abs(lbdnew-lbdold) ## 3. updating information lbdold = lbdnew if (inc